"""Tabular parser. Contains parsers for tabular data files. """ from pathlib import Path from typing import Any, Dict, List, Optional import pandas as pd from llama_index.readers.base import BaseReader from llama_index.schema import Document class CSVReader(BaseReader): """CSV parser. Args: concat_rows (bool): whether to concatenate all rows into one document. If set to False, a Document will be created for each row. True by default. """ def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None: """Init params.""" super().__init__(*args, **kwargs) self._concat_rows = concat_rows def load_data( self, file: Path, extra_info: Optional[Dict] = None ) -> List[Document]: """Parse file. Returns: Union[str, List[str]]: a string or a List of strings. """ try: import csv except ImportError: raise ImportError("csv module is required to read CSV files.") text_list = [] with open(file) as fp: csv_reader = csv.reader(fp) for row in csv_reader: text_list.append(", ".join(row)) if self._concat_rows: return [Document(text="\n".join(text_list), metadata=extra_info)] else: return [Document(text=text, metadata=extra_info) for text in text_list] class PandasCSVReader(BaseReader): r"""Pandas-based CSV parser. Parses CSVs using the separator detection from Pandas `read_csv`function. If special parameters are required, use the `pandas_config` dict. Args: concat_rows (bool): whether to concatenate all rows into one document. If set to False, a Document will be created for each row. True by default. col_joiner (str): Separator to use for joining cols per row. Set to ", " by default. row_joiner (str): Separator to use for joining each row. Only used when `concat_rows=True`. Set to "\n" by default. pandas_config (dict): Options for the `pandas.read_csv` function call. Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html for more information. Set to empty dict by default, this means pandas will try to figure out the separators, table head, etc. on its own. """ def __init__( self, *args: Any, concat_rows: bool = True, col_joiner: str = ", ", row_joiner: str = "\n", pandas_config: dict = {}, **kwargs: Any ) -> None: """Init params.""" super().__init__(*args, **kwargs) self._concat_rows = concat_rows self._col_joiner = col_joiner self._row_joiner = row_joiner self._pandas_config = pandas_config def load_data( self, file: Path, extra_info: Optional[Dict] = None ) -> List[Document]: """Parse file.""" df = pd.read_csv(file, **self._pandas_config) text_list = df.apply( lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 ).tolist() if self._concat_rows: return [ Document( text=(self._row_joiner).join(text_list), metadata=extra_info or {} ) ] else: return [ Document(text=text, metadata=extra_info or {}) for text in text_list ]