83 lines
2.7 KiB
Python
83 lines
2.7 KiB
Python
"""Simple file node parser."""
|
|
from typing import Any, Dict, List, Optional, Sequence, Type
|
|
|
|
from llama_index.callbacks.base import CallbackManager
|
|
from llama_index.node_parser.file.html import HTMLNodeParser
|
|
from llama_index.node_parser.file.json import JSONNodeParser
|
|
from llama_index.node_parser.file.markdown import MarkdownNodeParser
|
|
from llama_index.node_parser.interface import NodeParser
|
|
from llama_index.schema import BaseNode
|
|
from llama_index.utils import get_tqdm_iterable
|
|
|
|
# Registry consulted by SimpleFileNodeParser: maps a document's "extension"
# metadata value (leading dot included) to the NodeParser subclass that is
# instantiated to split documents of that type.
FILE_NODE_PARSERS: Dict[str, Type[NodeParser]] = {
    ".md": MarkdownNodeParser,
    ".html": HTMLNodeParser,
    ".json": JSONNodeParser,
}
|
|
|
|
|
|
class SimpleFileNodeParser(NodeParser):
    """Simple file node parser.

    Splits a document loaded from a file into Nodes using logic based on the
    file type. The NodeParser to use is selected automatically by looking up
    the document's ``"extension"`` metadata entry in ``FILE_NODE_PARSERS``.
    Documents whose extension is missing or not registered are passed through
    unchanged.

    Args:
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships

    """

    @classmethod
    def from_defaults(
        cls,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
    ) -> "SimpleFileNodeParser":
        """Build a parser, filling in defaults for omitted arguments.

        Args:
            include_metadata (bool): whether to include metadata in nodes
            include_prev_next_rel (bool): whether to include prev/next
                relationships
            callback_manager (Optional[CallbackManager]): callback manager to
                attach; a ``CallbackManager([])`` is created when omitted.

        Returns:
            SimpleFileNodeParser: the configured parser instance.
        """
        callback_manager = callback_manager or CallbackManager([])

        return cls(
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
        )

    @classmethod
    def class_name(cls) -> str:
        """Get class name."""
        return "SimpleFileNodeParser"

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Parse documents into nodes using a parser chosen per file type.

        Args:
            nodes (Sequence[BaseNode]): nodes to parse
            show_progress (bool): whether to wrap iteration in a progress bar;
                also forwarded to the per-file-type parser.
            **kwargs (Any): accepted for interface compatibility; not used
                by this method.

        Returns:
            List[BaseNode]: nodes produced by the matching file-type parsers,
            in input order. A document whose ``"extension"`` metadata is
            missing or not present in ``FILE_NODE_PARSERS`` is included as-is.
        """
        all_nodes: List[BaseNode] = []
        documents_with_progress = get_tqdm_iterable(
            nodes, show_progress, "Parsing documents into nodes"
        )

        for document in documents_with_progress:
            # Use .get so a document without an "extension" entry is handled
            # as an unsupported type instead of raising KeyError.
            ext = document.metadata.get("extension")
            parser_cls = FILE_NODE_PARSERS.get(ext)

            if parser_cls is None:
                # No dedicated parser for this file type: keep the document
                # itself. append (not extend) is required here, because
                # extend would iterate over the document object and add its
                # iteration items rather than the document.
                all_nodes.append(document)
                continue

            parser = parser_cls(
                include_metadata=self.include_metadata,
                include_prev_next_rel=self.include_prev_next_rel,
                callback_manager=self.callback_manager,
            )

            # Distinct local name so the ``nodes`` parameter is not rebound
            # while it is being iterated.
            parsed_nodes = parser.get_nodes_from_documents(
                [document], show_progress
            )
            all_nodes.extend(parsed_nodes)

        return all_nodes
|