faiss_rag_enterprise/llama_index/node_parser/file/simple_file.py

83 lines
2.7 KiB
Python

"""Simple file node parser."""
from typing import Any, Dict, List, Optional, Sequence, Type
from llama_index.callbacks.base import CallbackManager
from llama_index.node_parser.file.html import HTMLNodeParser
from llama_index.node_parser.file.json import JSONNodeParser
from llama_index.node_parser.file.markdown import MarkdownNodeParser
from llama_index.node_parser.interface import NodeParser
from llama_index.schema import BaseNode
from llama_index.utils import get_tqdm_iterable
# Registry mapping a file extension (with its leading dot) to the NodeParser
# subclass used for documents carrying that extension in their metadata.
FILE_NODE_PARSERS: Dict[str, Type[NodeParser]] = {
    ".md": MarkdownNodeParser,
    ".html": HTMLNodeParser,
    ".json": JSONNodeParser,
}
class SimpleFileNodeParser(NodeParser):
    """Simple file node parser.

    Splits documents loaded from files into nodes. The concrete ``NodeParser``
    is chosen per document by looking up the ``"extension"`` entry of the
    document's metadata in ``FILE_NODE_PARSERS``. Documents whose extension is
    missing or has no registered parser are passed through unchanged.

    Args:
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships

    """

    @classmethod
    def from_defaults(
        cls,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
    ) -> "SimpleFileNodeParser":
        """Create a parser, using an empty ``CallbackManager`` if none is given.

        Args:
            include_metadata (bool): whether to include metadata in nodes
            include_prev_next_rel (bool): whether to include prev/next
                relationships
            callback_manager (Optional[CallbackManager]): callback manager to
                attach; a falsy value is replaced by ``CallbackManager([])``

        Returns:
            SimpleFileNodeParser: the configured parser instance.

        """
        callback_manager = callback_manager or CallbackManager([])

        return cls(
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
        )

    @classmethod
    def class_name(cls) -> str:
        """Get class name."""
        return "SimpleFileNodeParser"

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Parse documents into nodes using a parser chosen by file extension.

        Args:
            nodes (Sequence[BaseNode]): nodes to parse
            show_progress (bool): forwarded to ``get_tqdm_iterable`` and to the
                delegated parser's ``get_nodes_from_documents``
            **kwargs: accepted for interface compatibility; not used here

        Returns:
            List[BaseNode]: nodes produced by the extension-specific parsers,
            plus the original document for every input that has no registered
            parser, in input order.

        """
        all_nodes: List[BaseNode] = []
        documents_with_progress = get_tqdm_iterable(
            nodes, show_progress, "Parsing documents into nodes"
        )

        for document in documents_with_progress:
            # .get() rather than indexing: a document without an "extension"
            # entry is treated like an unsupported file type instead of
            # raising KeyError.
            ext = document.metadata.get("extension")
            parser_cls = FILE_NODE_PARSERS.get(ext)

            if parser_cls is None:
                # No parser registered for this file type: keep the document
                # as-is. append(), not extend() -- extend() would iterate over
                # the document object instead of adding the document itself.
                all_nodes.append(document)
                continue

            parser = parser_cls(
                include_metadata=self.include_metadata,
                include_prev_next_rel=self.include_prev_next_rel,
                callback_manager=self.callback_manager,
            )
            all_nodes.extend(
                parser.get_nodes_from_documents([document], show_progress)
            )

        return all_nodes