faiss_rag_enterprise/llama_index/readers/file/ipynb_reader.py

41 lines
1.2 KiB
Python

import re
from pathlib import Path
from typing import Dict, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.schema import Document
class IPYNBReader(BaseReader):
    """Jupyter notebook (``.ipynb``) reader.

    Converts a notebook to a Python script with ``nbconvert`` and splits that
    script on its ``In[...]:`` cell prompts, producing either one document per
    cell or a single document containing all cells.
    """

    # Prompt written by nbconvert ahead of every code cell. Executed cells
    # carry their execution count (``In[3]:``); cells that were never run are
    # exported with a blank prompt (``In[ ]:``) and must match as well,
    # otherwise they are merged into the preceding cell or lost entirely.
    _CELL_PROMPT = re.compile(r"In\[(?:\d+|\s*)\]:")

    def __init__(
        self,
        parser_config: Optional[Dict] = None,
        concatenate: bool = False,
    ):
        """Init params.

        Args:
            parser_config: Optional configuration dict. It is stored on the
                instance but not read by any method in this class.
            concatenate: When true, ``load_data`` returns one document holding
                every cell joined by blank lines; otherwise one document per
                cell.
        """
        self._parser_config = parser_config
        self._concatenate = concatenate

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a notebook file into documents.

        Args:
            file: Path to the notebook; its name must end with ``.ipynb``.
            extra_info: Optional metadata attached to every returned document
                (an empty dict is used when omitted).

        Returns:
            A list with a single document when ``concatenate`` was requested,
            otherwise one document per notebook cell.

        Raises:
            ValueError: If ``file`` does not have the ``.ipynb`` suffix.
            ImportError: If ``nbconvert`` is not installed.
        """
        # Fail early with a clear message instead of hitting an unbound local
        # further down when the file is not a notebook.
        if not file.name.endswith(".ipynb"):
            raise ValueError(
                f"IPYNBReader only supports .ipynb files, got: {file.name!r}"
            )

        try:
            import nbconvert
        except ImportError as err:
            raise ImportError(
                "Please install nbconvert 'pip install nbconvert' "
            ) from err

        # The exporter returns a tuple; its first item is the script text.
        script = nbconvert.exporters.ScriptExporter().from_file(file)[0]

        # Split the script at every cell prompt, then discard the leading
        # chunk: it is whatever precedes the first prompt, not a cell.
        cells = self._CELL_PROMPT.split(script)[1:]

        if self._concatenate:
            return [Document(text="\n\n".join(cells), metadata=extra_info or {})]
        return [Document(text=cell, metadata=extra_info or {}) for cell in cells]