faiss_rag_enterprise/llama_index/readers/loading.py

53 lines
2.1 KiB
Python

from typing import Any, Dict, Type
from llama_index.readers.base import BasePydanticReader
from llama_index.readers.discord_reader import DiscordReader
from llama_index.readers.elasticsearch import ElasticsearchReader
from llama_index.readers.google_readers.gdocs import GoogleDocsReader
from llama_index.readers.google_readers.gsheets import GoogleSheetsReader
from llama_index.readers.notion import NotionPageReader
from llama_index.readers.slack import SlackReader
from llama_index.readers.string_iterable import StringIterableReader
from llama_index.readers.twitter import TwitterTweetReader
from llama_index.readers.web import (
BeautifulSoupWebReader,
RssReader,
SimpleWebPageReader,
TrafilaturaWebReader,
)
from llama_index.readers.wikipedia import WikipediaReader
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
ALL_READERS: Dict[str, Type[BasePydanticReader]] = {
DiscordReader.class_name(): DiscordReader,
ElasticsearchReader.class_name(): ElasticsearchReader,
GoogleDocsReader.class_name(): GoogleDocsReader,
GoogleSheetsReader.class_name(): GoogleSheetsReader,
NotionPageReader.class_name(): NotionPageReader,
SlackReader.class_name(): SlackReader,
StringIterableReader.class_name(): StringIterableReader,
TwitterTweetReader.class_name(): TwitterTweetReader,
SimpleWebPageReader.class_name(): SimpleWebPageReader,
TrafilaturaWebReader.class_name(): TrafilaturaWebReader,
RssReader.class_name(): RssReader,
BeautifulSoupWebReader.class_name(): BeautifulSoupWebReader,
WikipediaReader.class_name(): WikipediaReader,
YoutubeTranscriptReader.class_name(): YoutubeTranscriptReader,
}
def load_reader(data: Dict[str, Any]) -> BasePydanticReader:
if isinstance(data, BasePydanticReader):
return data
class_name = data.get("class_name", None)
if class_name is None:
raise ValueError("Must specify `class_name` in reader data.")
if class_name not in ALL_READERS:
raise ValueError(f"Reader class name {class_name} not found.")
# remove static attribute
data.pop("is_remote", None)
return ALL_READERS[class_name].from_dict(data)