144 lines
4.7 KiB
Python
144 lines
4.7 KiB
Python
"""Summary index.

A simple data structure where LlamaIndex iterates through document chunks
in sequence in order to answer a given query.

"""
|
|
|
|
from enum import Enum
|
|
from typing import Any, Dict, Optional, Sequence, Union
|
|
|
|
from llama_index.core.base_retriever import BaseRetriever
|
|
from llama_index.data_structs.data_structs import IndexList
|
|
from llama_index.indices.base import BaseIndex
|
|
from llama_index.schema import BaseNode, IndexNode
|
|
from llama_index.service_context import ServiceContext
|
|
from llama_index.storage.docstore.types import RefDocInfo
|
|
from llama_index.utils import get_tqdm_iterable
|
|
|
|
|
|
class ListRetrieverMode(str, Enum):
    """Retrieval modes accepted by ``SummaryIndex.as_retriever``.

    The ``str`` mixin makes every member compare equal to its plain string
    value, so callers may pass either the enum member or the raw string
    (for example ``"embedding"``).
    """

    # Selects SummaryIndexRetriever in ``as_retriever``.
    DEFAULT = "default"
    # Selects SummaryIndexEmbeddingRetriever in ``as_retriever``.
    EMBEDDING = "embedding"
    # Selects SummaryIndexLLMRetriever in ``as_retriever``.
    LLM = "llm"
|
|
|
|
|
|
class SummaryIndex(BaseIndex[IndexList]):
    """Summary Index.

    The summary index is a simple data structure where nodes are stored in
    a sequence. During index construction, the document texts are
    chunked up, converted to nodes, and stored in a list.

    During query time, the summary index iterates through the nodes
    with some optional filter parameters, and synthesizes an
    answer from all the nodes.

    Args:
        nodes (Optional[Sequence[BaseNode]]): Forwarded to the base index
            constructor.
        objects (Optional[Sequence[IndexNode]]): Forwarded to the base index
            constructor.
        index_struct (Optional[IndexList]): Forwarded to the base index
            constructor.
        service_context (Optional[ServiceContext]): Forwarded to the base
            index constructor.
        text_qa_template (Optional[BasePromptTemplate]): A Question-Answer Prompt
            (see :ref:`Prompt-Templates`).
            NOTE: this is a deprecated field. It is not an explicit parameter
            of ``__init__``; if supplied it travels through ``**kwargs``.
        show_progress (bool): Whether to show tqdm progress bars. Defaults to False.

    """

    index_struct_cls = IndexList

    def __init__(
        self,
        nodes: Optional[Sequence[BaseNode]] = None,
        objects: Optional[Sequence[IndexNode]] = None,
        index_struct: Optional[IndexList] = None,
        service_context: Optional[ServiceContext] = None,
        show_progress: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize params.

        Every argument, including any extra ``kwargs``, is passed unchanged to
        ``BaseIndex.__init__``.
        """
        super().__init__(
            nodes=nodes,
            index_struct=index_struct,
            service_context=service_context,
            show_progress=show_progress,
            objects=objects,
            **kwargs,
        )

    def as_retriever(
        self,
        retriever_mode: Union[str, ListRetrieverMode] = ListRetrieverMode.DEFAULT,
        **kwargs: Any,
    ) -> BaseRetriever:
        """Create a retriever over this index for the requested mode.

        Args:
            retriever_mode (Union[str, ListRetrieverMode]): Which retriever to
                build. A plain string is accepted because ``ListRetrieverMode``
                members compare equal to their string values.
            **kwargs: Extra keyword arguments forwarded to the retriever
                constructor.

        Returns:
            BaseRetriever: ``SummaryIndexRetriever`` for ``DEFAULT``,
            ``SummaryIndexEmbeddingRetriever`` for ``EMBEDDING`` and
            ``SummaryIndexLLMRetriever`` for ``LLM``.

        Raises:
            ValueError: If ``retriever_mode`` matches none of the known modes.
        """
        # Imported at call time rather than module import time; presumably to
        # avoid an import cycle with the retrievers module (verify before
        # hoisting this to the top of the file).
        from llama_index.indices.list.retrievers import (
            SummaryIndexEmbeddingRetriever,
            SummaryIndexLLMRetriever,
            SummaryIndexRetriever,
        )

        if retriever_mode == ListRetrieverMode.DEFAULT:
            return SummaryIndexRetriever(self, object_map=self._object_map, **kwargs)
        elif retriever_mode == ListRetrieverMode.EMBEDDING:
            return SummaryIndexEmbeddingRetriever(
                self, object_map=self._object_map, **kwargs
            )
        elif retriever_mode == ListRetrieverMode.LLM:
            return SummaryIndexLLMRetriever(self, object_map=self._object_map, **kwargs)
        else:
            raise ValueError(f"Unknown retriever mode: {retriever_mode}")

    def _build_index_from_nodes(
        self, nodes: Sequence[BaseNode], show_progress: bool = False
    ) -> IndexList:
        """Build the index structure from nodes.

        Args:
            nodes (Sequence[BaseNode]): Nodes to register, in order.
            show_progress (bool): Passed to ``get_tqdm_iterable``; presumably
                enables a tqdm progress bar while nodes are processed.

        Returns:
            IndexList: The created summary index.
        """
        index_struct = IndexList()
        nodes_with_progress = get_tqdm_iterable(
            nodes, show_progress, "Processing nodes"
        )
        for n in nodes_with_progress:
            index_struct.add_node(n)
        return index_struct

    def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
        """Append the given nodes to the existing index structure.

        ``insert_kwargs`` is accepted for interface compatibility and is not
        used here.
        """
        for n in nodes:
            self._index_struct.add_node(n)

    def _delete_node(self, node_id: str, **delete_kwargs: Any) -> None:
        """Remove ``node_id`` from the index structure.

        The index structure already stores node ids, so the list is filtered
        directly instead of loading every node from the docstore just to read
        its id back. This skips one docstore read per stored node and no
        longer fails when an unrelated node is missing from the docstore.
        The order of the remaining ids is preserved. ``delete_kwargs`` is not
        used here.
        """
        self._index_struct.nodes = [
            cur_id for cur_id in self._index_struct.nodes if cur_id != node_id
        ]

    @property
    def ref_doc_info(self) -> Dict[str, RefDocInfo]:
        """Retrieve a dict mapping of ingested documents and their nodes+metadata.

        Returns:
            Dict[str, RefDocInfo]: Keyed by the id of each node's source node.
            Nodes without a source node, and source ids for which the docstore
            returns no info, are skipped.
        """
        node_doc_ids = self._index_struct.nodes
        nodes = self.docstore.get_nodes(node_doc_ids)

        all_ref_doc_info: Dict[str, RefDocInfo] = {}
        for node in nodes:
            ref_node = node.source_node
            if not ref_node:
                continue

            ref_doc_id = ref_node.node_id
            # Several nodes may share one source document; look it up once.
            if ref_doc_id in all_ref_doc_info:
                continue

            ref_doc_info = self.docstore.get_ref_doc_info(ref_doc_id)
            if not ref_doc_info:
                continue

            all_ref_doc_info[ref_doc_id] = ref_doc_info
        return all_ref_doc_info
|
|
|
|
|
|
# Legacy alias, kept so existing imports of ``GPTListIndex`` keep working.
GPTListIndex = SummaryIndex

# Second alias for the same class. NOTE(review): this was labelled "New name",
# but the class itself is defined as ``SummaryIndex`` -- confirm which name is
# meant to be canonical.
ListIndex = SummaryIndex
|