faiss_rag_enterprise/llama_index/readers/file/tabular_reader.py

116 lines
3.5 KiB
Python

"""Tabular parser.
Contains parsers for tabular data files.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional
import pandas as pd
from llama_index.readers.base import BaseReader
from llama_index.schema import Document
class CSVReader(BaseReader):
"""CSV parser.
Args:
concat_rows (bool): whether to concatenate all rows into one document.
If set to False, a Document will be created for each row.
True by default.
"""
def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_rows = concat_rows
def load_data(
self, file: Path, extra_info: Optional[Dict] = None
) -> List[Document]:
"""Parse file.
Returns:
Union[str, List[str]]: a string or a List of strings.
"""
try:
import csv
except ImportError:
raise ImportError("csv module is required to read CSV files.")
text_list = []
with open(file) as fp:
csv_reader = csv.reader(fp)
for row in csv_reader:
text_list.append(", ".join(row))
if self._concat_rows:
return [Document(text="\n".join(text_list), metadata=extra_info)]
else:
return [Document(text=text, metadata=extra_info) for text in text_list]
class PandasCSVReader(BaseReader):
r"""Pandas-based CSV parser.
Parses CSVs using the separator detection from Pandas `read_csv`function.
If special parameters are required, use the `pandas_config` dict.
Args:
concat_rows (bool): whether to concatenate all rows into one document.
If set to False, a Document will be created for each row.
True by default.
col_joiner (str): Separator to use for joining cols per row.
Set to ", " by default.
row_joiner (str): Separator to use for joining each row.
Only used when `concat_rows=True`.
Set to "\n" by default.
pandas_config (dict): Options for the `pandas.read_csv` function call.
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
for more information.
Set to empty dict by default, this means pandas will try to figure
out the separators, table head, etc. on its own.
"""
def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_rows = concat_rows
self._col_joiner = col_joiner
self._row_joiner = row_joiner
self._pandas_config = pandas_config
def load_data(
self, file: Path, extra_info: Optional[Dict] = None
) -> List[Document]:
"""Parse file."""
df = pd.read_csv(file, **self._pandas_config)
text_list = df.apply(
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
).tolist()
if self._concat_rows:
return [
Document(
text=(self._row_joiner).join(text_list), metadata=extra_info or {}
)
]
else:
return [
Document(text=text, metadata=extra_info or {}) for text in text_list
]