faiss_rag_enterprise/llama_index/indices/managed/colbert_index/base.py

import os
import shutil
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence
from llama_index.core.base_retriever import BaseRetriever
from llama_index.data_structs.data_structs import IndexDict
from llama_index.indices.base import BaseIndex, IndexNode
from llama_index.schema import BaseNode, NodeWithScore
from llama_index.service_context import ServiceContext
from llama_index.storage.docstore.types import RefDocInfo
from llama_index.storage.storage_context import StorageContext
# TODO(jon-chuang):
# 1. Add support for updating index (inserts/deletes)
# 2. Add proper support for storage (managing/loading from the index files)
# 3. Normalize scores (not sure what the best practice is here)
class ColbertIndex(BaseIndex[IndexDict]):
"""
Store for ColBERT v2 with PLAID indexing.
ColBERT is a neural retrieval method that tends to work
well in a zero-shot setting on out of domain datasets, due
to it's use of token-level encodings (rather than sentence or
chunk level)
Parameters:
index_path: directory containing PLAID index files.
model_name: ColBERT hugging face model name.
Default: "colbert-ir/colbertv2.0".
show_progress: whether to show progress bar when building index.
Default: False. noop for ColBERT for now.
nbits: number of bits to quantize the residual vectors. Default: 2.
kmeans_niters: number of kmeans clustering iterations. Default: 1.
gpus: number of GPUs to use for indexing. Default: 0.
rank: number of ranks to use for indexing. Default: 1.
doc_maxlen: max document length. Default: 120.
query_maxlen: max query length. Default: 60.
kmeans_niters: number of kmeans iterations. Default: 4.
"""
def __init__(
self,
nodes: Optional[Sequence[BaseNode]] = None,
objects: Optional[Sequence[IndexNode]] = None,
index_struct: Optional[IndexDict] = None,
service_context: Optional[ServiceContext] = None,
storage_context: Optional[StorageContext] = None,
model_name: str = "colbert-ir/colbertv2.0",
index_name: str = "",
show_progress: bool = False,
nbits: int = 2,
gpus: int = 0,
ranks: int = 1,
doc_maxlen: int = 120,
query_maxlen: int = 60,
kmeans_niters: int = 4,
**kwargs: Any,
) -> None:
self.model_name = model_name
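        # NOTE: the PLAID index location is fixed for now; see TODO #2 above
        # about proper storage support.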
self.index_path = "storage/colbert_index"
self.index_name = index_name
self.nbits = nbits
self.gpus = gpus
self.ranks = ranks
self.doc_maxlen = doc_maxlen
self.query_maxlen = query_maxlen
self.kmeans_niters = kmeans_niters
self._docs_pos_to_node_id: Dict[int, str] = {}
        try:
            import colbert  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "Please install colbert to use this feature from the repo: "
                "https://github.com/stanford-futuredata/ColBERT"
            ) from exc
super().__init__(
nodes=nodes,
index_struct=index_struct,
index_name=index_name,
service_context=service_context,
storage_context=storage_context,
show_progress=show_progress,
objects=objects,
**kwargs,
)
def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
raise NotImplementedError("ColbertStoreIndex does not support insertion yet.")
def _delete_node(self, node_id: str, **delete_kwargs: Any) -> None:
raise NotImplementedError("ColbertStoreIndex does not support deletion yet.")
def as_retriever(self, **kwargs: Any) -> BaseRetriever:
from .retriever import ColbertRetriever
return ColbertRetriever(index=self, object_map=self._object_map, **kwargs)
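
    # For example (hypothetical usage; `similarity_top_k` is the standard
    # retriever kwarg in llama_index):
    #
    #   retriever = index.as_retriever(similarity_top_k=5)
    #   nodes = retriever.retrieve("query text")
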
@property
def ref_doc_info(self) -> Dict[str, RefDocInfo]:
raise NotImplementedError("ColbertStoreIndex does not support ref_doc_info.")
def _build_index_from_nodes(self, nodes: Sequence[BaseNode]) -> IndexDict:
"""Generate a PLAID index from the ColBERT checkpoint via its hugging face
model_name.
"""
from colbert import Indexer, Searcher
from colbert.infra import ColBERTConfig, Run, RunConfig
index_struct = IndexDict()
docs_list = []
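        # Record each node's position in the collection so the PLAID passage
        # ids returned at query time can be mapped back to llama_index node ids.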
for i, node in enumerate(nodes):
docs_list.append(node.get_content())
self._docs_pos_to_node_id[i] = node.node_id
index_struct.add_node(node, text_id=str(i))
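        # Run()/RunConfig set up the ColBERT execution context (index root,
        # number of ranks, GPUs) for the indexing job below.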
with Run().context(
RunConfig(index_root=self.index_path, nranks=self.ranks, gpus=self.gpus)
):
config = ColBERTConfig(
doc_maxlen=self.doc_maxlen,
query_maxlen=self.query_maxlen,
nbits=self.nbits,
kmeans_niters=self.kmeans_niters,
)
indexer = Indexer(checkpoint=self.model_name, config=config)
indexer.index(name=self.index_name, collection=docs_list, overwrite=True)
self.store = Searcher(
index=self.index_name, collection=docs_list, checkpoint=self.model_name
)
return index_struct
# @staticmethod
# def _normalize_scores(docs: List[Document]) -> None:
# "Normalizing the MaxSim scores using softmax."
# Z = sum(math.exp(doc.score) for doc in docs)
# for doc in docs:
# doc.score = math.exp(doc.score) / Z
def persist(self, persist_dir: str) -> None:
# Check if the destination directory exists
if os.path.exists(persist_dir):
# Remove the existing destination directory
shutil.rmtree(persist_dir)
# Copy PLAID vectors
shutil.copytree(
Path(self.index_path) / self.index_name, Path(persist_dir) / self.index_name
)
self._storage_context.persist(persist_dir=persist_dir)
@classmethod
def load_from_disk(cls, persist_dir: str, index_name: str = "") -> "ColbertIndex":
from colbert import Searcher
from colbert.infra import ColBERTConfig
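        # Rebuild the PLAID Searcher from the persisted index files, then
        # restore the docstore and index struct from the StorageContext.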
colbert_config = ColBERTConfig.load_from_index(Path(persist_dir) / index_name)
searcher = Searcher(
index=index_name, index_root=persist_dir, config=colbert_config
)
sc = StorageContext.from_defaults(persist_dir=persist_dir)
colbert_index = ColbertIndex(
index_struct=sc.index_store.index_structs()[0], storage_context=sc
)
docs_pos_to_node_id = {
int(k): v for k, v in colbert_index.index_struct.nodes_dict.items()
}
colbert_index._docs_pos_to_node_id = docs_pos_to_node_id
colbert_index.store = searcher
return colbert_index
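
    # Persist/load round-trip sketch (hypothetical paths; `index_name` must
    # match the name the index was built with):
    #
    #   index.persist("./persisted_colbert")
    #   restored = ColbertIndex.load_from_disk(
    #       "./persisted_colbert", index_name="my_index"
    #   )
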
    def query(self, query_str: str, top_k: int = 10) -> List[NodeWithScore]:
        """
        Query the ColBERT v2 + PLAID store.

        Returns: list of NodeWithScore.
        """
        doc_ids, _, scores = self.store.search(text=query_str, k=top_k)
        node_doc_ids = [self._docs_pos_to_node_id[doc_pos] for doc_pos in doc_ids]
        nodes = self.docstore.get_nodes(node_doc_ids)
        return [
            NodeWithScore(node=node, score=score)
            for node, score in zip(nodes, scores)
        ]
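
    # Iterating over results (hypothetical):
    #
    #   for result in index.query("neural retrieval", top_k=3):
    #       print(result.node.node_id, result.score)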