"""Simple reader that reads files of different formats from a directory."""
import logging
import mimetypes
import multiprocessing
import os
import warnings
from datetime import datetime
from functools import reduce
from itertools import repeat
from pathlib import Path
from typing import Any, Callable, Dict, Generator, List, Optional, Type
from tqdm import tqdm
from llama_index.readers.base import BaseReader
from llama_index.readers.file.docs_reader import DocxReader, HWPReader, PDFReader
from llama_index.readers.file.epub_reader import EpubReader
from llama_index.readers.file.image_reader import ImageReader
from llama_index.readers.file.ipynb_reader import IPYNBReader
from llama_index.readers.file.markdown_reader import MarkdownReader
from llama_index.readers.file.mbox_reader import MboxReader
from llama_index.readers.file.slides_reader import PptxReader
from llama_index.readers.file.tabular_reader import PandasCSVReader
from llama_index.readers.file.video_audio_reader import VideoAudioReader
from llama_index.schema import Document
DEFAULT_FILE_READER_CLS: Dict[str, Type[BaseReader]] = {
".hwp": HWPReader,
".pdf": PDFReader,
".docx": DocxReader,
".pptx": PptxReader,
".ppt": PptxReader,
".pptm": PptxReader,
".jpg": ImageReader,
".png": ImageReader,
".jpeg": ImageReader,
".mp3": VideoAudioReader,
".mp4": VideoAudioReader,
".csv": PandasCSVReader,
".epub": EpubReader,
".md": MarkdownReader,
".mbox": MboxReader,
".ipynb": IPYNBReader,
}
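# Extensions not covered above fall back to a plain-text read in `load_file`.
# Additional readers can be supplied per extension via the `file_extractor`
# argument. Sketch only; the XMLReader shown here is hypothetical:
#
#     SimpleDirectoryReader(input_dir="./data", file_extractor={".xml": XMLReader()})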
def default_file_metadata_func(file_path: str) -> Dict:
"""Get some handy metadate from filesystem.
Args:
file_path: str: file path in str
"""
return {
"file_path": file_path,
"file_name": os.path.basename(file_path),
"file_type": mimetypes.guess_type(file_path)[0],
"file_size": os.path.getsize(file_path),
"creation_date": datetime.fromtimestamp(
Path(file_path).stat().st_ctime
).strftime("%Y-%m-%d"),
"last_modified_date": datetime.fromtimestamp(
Path(file_path).stat().st_mtime
).strftime("%Y-%m-%d"),
"last_accessed_date": datetime.fromtimestamp(
Path(file_path).stat().st_atime
).strftime("%Y-%m-%d"),
}
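# Illustrative call (the path and returned values below are hypothetical):
#
#     default_file_metadata_func("data/report.pdf")
#     # -> {"file_path": "data/report.pdf", "file_name": "report.pdf",
#     #     "file_type": "application/pdf", "file_size": 12345,
#     #     "creation_date": "2023-01-01", "last_modified_date": "2023-01-02",
#     #     "last_accessed_date": "2023-01-03"}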
logger = logging.getLogger(__name__)
class SimpleDirectoryReader(BaseReader):
"""Simple directory reader.
Load files from file directory.
Automatically select the best file reader given file extensions.
Args:
input_dir (str): Path to the directory.
input_files (List): List of file paths to read
(Optional; overrides input_dir, exclude)
exclude (List): glob of python file paths to exclude (Optional)
exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
encoding (str): Encoding of the files.
Default is utf-8.
errors (str): how encoding and decoding errors are to be handled,
see https://docs.python.org/3/library/functions.html#open
recursive (bool): Whether to recursively search in subdirectories.
False by default.
filename_as_id (bool): Whether to use the filename as the document id.
False by default.
required_exts (Optional[List[str]]): List of required extensions.
Default is None.
file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
extension to a BaseReader class that specifies how to convert that file
to text. If not specified, use default from DEFAULT_FILE_READER_CLS.
num_files_limit (Optional[int]): Maximum number of files to read.
Default is None.
file_metadata (Optional[Callable[str, Dict]]): A function that takes
in a filename and returns a Dict of metadata for the Document.
Default is None.
"""
supported_suffix = list(DEFAULT_FILE_READER_CLS.keys())
def __init__(
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = False,
encoding: str = "utf-8",
filename_as_id: bool = False,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseReader]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
) -> None:
"""Initialize with parameters."""
super().__init__()
if not input_dir and not input_files:
raise ValueError("Must provide either `input_dir` or `input_files`.")
self.errors = errors
self.encoding = encoding
self.exclude = exclude
self.recursive = recursive
self.exclude_hidden = exclude_hidden
self.required_exts = required_exts
self.num_files_limit = num_files_limit
if input_files:
self.input_files = []
for path in input_files:
if not os.path.isfile(path):
raise ValueError(f"File {path} does not exist.")
input_file = Path(path)
self.input_files.append(input_file)
elif input_dir:
if not os.path.isdir(input_dir):
raise ValueError(f"Directory {input_dir} does not exist.")
            self.input_dir = Path(input_dir)
            self.input_files = self._add_files(self.input_dir)
if file_extractor is not None:
self.file_extractor = file_extractor
else:
self.file_extractor = {}
self.file_metadata = file_metadata or default_file_metadata_func
self.filename_as_id = filename_as_id
    def is_hidden(self, path: Path) -> bool:
        """Return True if any component of the path is hidden (a dotfile)."""
return any(
part.startswith(".") and part not in [".", ".."] for part in path.parts
)
def _add_files(self, input_dir: Path) -> List[Path]:
"""Add files."""
all_files = set()
rejected_files = set()
if self.exclude is not None:
for excluded_pattern in self.exclude:
if self.recursive:
# Recursive glob
for file in input_dir.rglob(excluded_pattern):
rejected_files.add(Path(file))
else:
# Non-recursive glob
for file in input_dir.glob(excluded_pattern):
rejected_files.add(Path(file))
file_refs: Generator[Path, None, None]
if self.recursive:
file_refs = Path(input_dir).rglob("*")
else:
file_refs = Path(input_dir).glob("*")
for ref in file_refs:
            # Check directory/hidden status manually rather than in the glob
            # pattern, for backwards compatibility.
is_dir = ref.is_dir()
skip_because_hidden = self.exclude_hidden and self.is_hidden(ref)
skip_because_bad_ext = (
self.required_exts is not None and ref.suffix not in self.required_exts
)
skip_because_excluded = ref in rejected_files
            if (
                is_dir
                or skip_because_hidden
                or skip_because_bad_ext
                or skip_because_excluded
            ):
                continue
            all_files.add(ref)
new_input_files = sorted(all_files)
if len(new_input_files) == 0:
raise ValueError(f"No files found in {input_dir}.")
if self.num_files_limit is not None and self.num_files_limit > 0:
new_input_files = new_input_files[0 : self.num_files_limit]
        # log the total number of files added
logger.debug(
f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}"
)
return new_input_files
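    # Illustrative exclusion patterns (file and directory names are hypothetical):
    #
    #     SimpleDirectoryReader(input_dir="./data", exclude=["*.tmp", "drafts/*"])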
def _exclude_metadata(self, documents: List[Document]) -> List[Document]:
"""Exclude metadata from documents.
Args:
documents (List[Document]): List of documents.
"""
for doc in documents:
            # Keep only metadata["file_path"] in both the embedding and LLM
            # content strings, since the file path carries important context
            # about the chunks. The date fields are provided for the
            # convenience of postprocessors such as TimeWeightedPostprocessor,
            # but are excluded from embeddings and LLM prompts.
doc.excluded_embed_metadata_keys.extend(
[
"file_name",
"file_type",
"file_size",
"creation_date",
"last_modified_date",
"last_accessed_date",
]
)
doc.excluded_llm_metadata_keys.extend(
[
"file_name",
"file_type",
"file_size",
"creation_date",
"last_modified_date",
"last_accessed_date",
]
)
return documents
@staticmethod
def load_file(
input_file: Path,
file_metadata: Callable[[str], Dict],
file_extractor: Dict[str, BaseReader],
filename_as_id: bool = False,
encoding: str = "utf-8",
errors: str = "ignore",
) -> List[Document]:
"""Static method for loading file.
NOTE: necessarily as a static method for parallel processing.
Args:
input_file (Path): _description_
file_metadata (Callable[[str], Dict]): _description_
file_extractor (Dict[str, BaseReader]): _description_
filename_as_id (bool, optional): _description_. Defaults to False.
encoding (str, optional): _description_. Defaults to "utf-8".
errors (str, optional): _description_. Defaults to "ignore".
input_file (Path): File path to read
file_metadata ([Callable[str, Dict]]): A function that takes
in a filename and returns a Dict of metadata for the Document.
file_extractor (Dict[str, BaseReader]): A mapping of file
extension to a BaseReader class that specifies how to convert that file
to text.
filename_as_id (bool): Whether to use the filename as the document id.
encoding (str): Encoding of the files.
Default is utf-8.
errors (str): how encoding and decoding errors are to be handled,
see https://docs.python.org/3/library/functions.html#open
Returns:
List[Document]: loaded documents
"""
metadata: Optional[dict] = None
documents: List[Document] = []
if file_metadata is not None:
metadata = file_metadata(str(input_file))
file_suffix = input_file.suffix.lower()
if (
file_suffix in SimpleDirectoryReader.supported_suffix
or file_suffix in file_extractor
):
# use file readers
if file_suffix not in file_extractor:
# instantiate file reader if not already
reader_cls = DEFAULT_FILE_READER_CLS[file_suffix]
file_extractor[file_suffix] = reader_cls()
reader = file_extractor[file_suffix]
# load data -- catch all errors except for ImportError
try:
docs = reader.load_data(input_file, extra_info=metadata)
            except ImportError:
                # re-raise so the user learns about missing dependencies;
                # a bare raise preserves the original traceback
                raise
except Exception as e:
# otherwise, just skip the file and report the error
print(
f"Failed to load file {input_file} with error: {e}. Skipping...",
flush=True,
)
return []
# iterate over docs if needed
if filename_as_id:
for i, doc in enumerate(docs):
doc.id_ = f"{input_file!s}_part_{i}"
documents.extend(docs)
else:
# do standard read
with open(input_file, errors=errors, encoding=encoding) as f:
data = f.read()
doc = Document(text=data, metadata=metadata or {})
if filename_as_id:
doc.id_ = str(input_file)
documents.append(doc)
return documents
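    # Illustrative direct call (the PDF path is hypothetical):
    #
    #     docs = SimpleDirectoryReader.load_file(
    #         Path("data/report.pdf"),
    #         file_metadata=default_file_metadata_func,
    #         file_extractor={},
    #     )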
def load_data(
self, show_progress: bool = False, num_workers: Optional[int] = None
) -> List[Document]:
"""Load data from the input directory.
Args:
show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
Returns:
List[Document]: A list of documents.
"""
documents = []
files_to_process = self.input_files
if num_workers and num_workers > 1:
            if num_workers > multiprocessing.cpu_count():
                warnings.warn(
                    "Specified num_workers exceeds the number of CPUs in the "
                    "system. Setting `num_workers` down to the maximum CPU count."
                )
                num_workers = multiprocessing.cpu_count()
with multiprocessing.get_context("spawn").Pool(num_workers) as p:
results = p.starmap(
SimpleDirectoryReader.load_file,
zip(
files_to_process,
repeat(self.file_metadata),
repeat(self.file_extractor),
repeat(self.filename_as_id),
repeat(self.encoding),
repeat(self.errors),
),
)
            # flatten the per-file result lists into a single document list
            documents = reduce(lambda x, y: x + y, results)
else:
if show_progress:
files_to_process = tqdm(
self.input_files, desc="Loading files", unit="file"
)
for input_file in files_to_process:
documents.extend(
SimpleDirectoryReader.load_file(
input_file=input_file,
file_metadata=self.file_metadata,
file_extractor=self.file_extractor,
filename_as_id=self.filename_as_id,
encoding=self.encoding,
errors=self.errors,
)
)
return self._exclude_metadata(documents)
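    # Parallel loading sketch (the worker count is illustrative):
    #
    #     docs = SimpleDirectoryReader(input_dir="./data").load_data(num_workers=4)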
def iter_data(
self, show_progress: bool = False
) -> Generator[List[Document], Any, Any]:
"""Load data iteratively from the input directory.
Args:
show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
Returns:
Generator[List[Document]]: A list of documents.
"""
files_to_process = self.input_files
if show_progress:
files_to_process = tqdm(self.input_files, desc="Loading files", unit="file")
for input_file in files_to_process:
documents = SimpleDirectoryReader.load_file(
input_file=input_file,
file_metadata=self.file_metadata,
file_extractor=self.file_extractor,
filename_as_id=self.filename_as_id,
encoding=self.encoding,
errors=self.errors,
)
documents = self._exclude_metadata(documents)
if len(documents) > 0:
yield documents
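    # Iterative loading sketch (`handle` is a hypothetical consumer function):
    #
    #     for file_docs in SimpleDirectoryReader(input_dir="./data").iter_data():
    #         handle(file_docs)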