90 lines
3.2 KiB
Python
90 lines
3.2 KiB
Python
"""General node utils."""
|
|
|
|
|
|
import logging
|
|
import uuid
|
|
from typing import List, Optional, Protocol, runtime_checkable
|
|
|
|
from llama_index.schema import (
|
|
BaseNode,
|
|
Document,
|
|
ImageDocument,
|
|
ImageNode,
|
|
NodeRelationship,
|
|
TextNode,
|
|
)
|
|
from llama_index.utils import truncate_text
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@runtime_checkable
class IdFuncCallable(Protocol):
    """Structural type for callables that produce a node id string.

    Any callable accepting ``(i, doc)`` and returning ``str`` conforms.
    The protocol is ``runtime_checkable``, so it may be used with
    ``isinstance``; such a check only verifies that ``__call__`` is
    present, not that its signature matches.
    """

    def __call__(self, i: int, doc: BaseNode) -> str:
        """Return an id string for the given index and node.

        Args:
            i: Integer index supplied by the caller.
            doc: Node supplied by the caller.

        Returns:
            The generated id.
        """
        ...
|
|
|
|
|
|
def default_id_func(i: int, doc: BaseNode) -> str:
    """Return a newly generated random UUID (version 4) as a string.

    Both arguments are ignored; they are accepted only so this function
    has the ``(i, doc) -> str`` shape of ``IdFuncCallable``.

    Args:
        i: Unused.
        doc: Unused.

    Returns:
        The string form of a fresh ``uuid.uuid4()`` value.
    """
    fresh_id = uuid.uuid4()
    return str(fresh_id)
|
|
|
|
|
|
def build_nodes_from_splits(
    text_splits: List[str],
    document: BaseNode,
    ref_doc: Optional[BaseNode] = None,
    id_func: Optional[IdFuncCallable] = None,
) -> List[TextNode]:
    """Create one node per text split, copying settings from ``document``.

    Args:
        text_splits: Text chunks; each becomes the ``text`` of one node.
        document: Node whose embedding, metadata-key exclusions, metadata
            separator and templates are copied onto every new node. An
            ``ImageDocument`` yields ``ImageNode`` objects (which also
            receive its ``image``, ``image_path`` and ``image_url``); a
            ``Document`` or ``TextNode`` yields ``TextNode`` objects.
        ref_doc: Node recorded as the ``SOURCE`` relationship of each new
            node. Falls back to ``document`` when omitted (or falsy).
        id_func: Called as ``id_func(i, document)`` to produce each node
            id, where ``i`` is the chunk's position in ``text_splits``.
            Falls back to ``default_id_func`` when omitted.

    Returns:
        The new nodes, in the same order as ``text_splits``.

    Raises:
        ValueError: If ``document`` is not one of the supported types.
            The check runs once per chunk, so it never fires when
            ``text_splits`` is empty.
    """
    if not ref_doc:
        ref_doc = document
    if not id_func:
        id_func = default_id_func

    nodes: List[TextNode] = []
    for index, chunk in enumerate(text_splits):
        logger.debug(f"> Adding chunk: {truncate_text(chunk, 50)}")

        # ImageDocument must be tested before the text types so that image
        # documents always take this branch (same precedence as before).
        if isinstance(document, ImageDocument):
            image_node = ImageNode(
                id_=id_func(index, document),
                text=chunk,
                embedding=document.embedding,
                image=document.image,
                image_path=document.image_path,
                image_url=document.image_url,
                excluded_embed_metadata_keys=document.excluded_embed_metadata_keys,
                excluded_llm_metadata_keys=document.excluded_llm_metadata_keys,
                metadata_seperator=document.metadata_seperator,
                metadata_template=document.metadata_template,
                text_template=document.text_template,
                relationships={NodeRelationship.SOURCE: ref_doc.as_related_node_info()},
            )
            nodes.append(image_node)  # type: ignore
            continue

        if not isinstance(document, (Document, TextNode)):
            raise ValueError(f"Unknown document type: {type(document)}")

        # Document and TextNode inputs are handled identically: both
        # produce a plain TextNode carrying the same copied fields.
        text_node = TextNode(
            id_=id_func(index, document),
            text=chunk,
            embedding=document.embedding,
            excluded_embed_metadata_keys=document.excluded_embed_metadata_keys,
            excluded_llm_metadata_keys=document.excluded_llm_metadata_keys,
            metadata_seperator=document.metadata_seperator,
            metadata_template=document.metadata_template,
            text_template=document.text_template,
            relationships={NodeRelationship.SOURCE: ref_doc.as_related_node_info()},
        )
        nodes.append(text_node)

    return nodes
|