# faiss_rag_enterprise/llama_index/indices/managed/vectara/base.py
"""Managed index.
A managed Index - where the index is accessible via some API that
interfaces a managed service.
"""
import json
import logging
import os
from concurrent.futures import ThreadPoolExecutor
from hashlib import blake2b
from typing import Any, Dict, List, Optional, Sequence, Type
import requests
from llama_index.core.base_query_engine import BaseQueryEngine
from llama_index.core.base_retriever import BaseRetriever
from llama_index.data_structs.data_structs import IndexDict, IndexStructType
from llama_index.indices.managed.base import BaseManagedIndex, IndexType
from llama_index.schema import BaseNode, Document, MetadataMode, TextNode
from llama_index.service_context import ServiceContext
from llama_index.storage.storage_context import StorageContext
_logger = logging.getLogger(__name__)


class VectaraIndexStruct(IndexDict):
    """Vectara Index Struct."""

    @classmethod
    def get_type(cls) -> IndexStructType:
        """Get index struct type."""
        return IndexStructType.VECTARA


class VectaraIndex(BaseManagedIndex):
    """Vectara Index.

    The Vectara index implements a managed index that uses Vectara as the backend.
    Vectara performs many of the functions of a traditional index in its backend:
    - breaks down a document into chunks (nodes)
    - creates an embedding for each chunk (node)
    - performs the search for the top-k nodes most similar to a query
    - optionally summarizes the top-k nodes

    Args:
        show_progress (bool): Whether to show tqdm progress bars.
            Defaults to False.
        nodes (Optional[Sequence[BaseNode]]): Nodes to index, each as its own
            document.
        vectara_customer_id (Optional[str]): Vectara customer ID; falls back to
            the VECTARA_CUSTOMER_ID environment variable.
        vectara_corpus_id (Optional[str]): Vectara corpus ID; falls back to
            the VECTARA_CORPUS_ID environment variable.
        vectara_api_key (Optional[str]): Vectara API key; falls back to
            the VECTARA_API_KEY environment variable.
        use_core_api (bool): Whether to ingest via Vectara's core indexing API
            (pre-chunked parts). Defaults to False.
        parallelize_ingest (bool): Whether to index documents concurrently.
            Defaults to False.
    """
def __init__(
self,
show_progress: bool = False,
nodes: Optional[Sequence[BaseNode]] = None,
vectara_customer_id: Optional[str] = None,
vectara_corpus_id: Optional[str] = None,
vectara_api_key: Optional[str] = None,
use_core_api: bool = False,
parallelize_ingest: bool = False,
**kwargs: Any,
) -> None:
"""Initialize the Vectara API."""
self.parallelize_ingest = parallelize_ingest
index_struct = VectaraIndexStruct(
index_id=str(vectara_corpus_id),
summary="Vectara Index",
)
super().__init__(
show_progress=show_progress,
index_struct=index_struct,
service_context=ServiceContext.from_defaults(
llm=None, llm_predictor=None, embed_model=None
),
**kwargs,
)
self._vectara_customer_id = vectara_customer_id or os.environ.get(
"VECTARA_CUSTOMER_ID"
)
self._vectara_corpus_id = vectara_corpus_id or os.environ.get(
"VECTARA_CORPUS_ID"
)
self._vectara_api_key = vectara_api_key or os.environ.get("VECTARA_API_KEY")
        if (
            self._vectara_customer_id is None
            or self._vectara_corpus_id is None
            or self._vectara_api_key is None
        ):
            _logger.warning(
                "Can't find Vectara credentials: please provide "
                "vectara_customer_id, vectara_corpus_id, and vectara_api_key, "
                "or set the corresponding environment variables."
            )
            raise ValueError("Missing Vectara credentials")
        else:
            _logger.debug(f"Using corpus id {self._vectara_corpus_id}")
# setup requests session with max 3 retries and 90s timeout
# for calling Vectara API
self._session = requests.Session() # to reuse connections
adapter = requests.adapters.HTTPAdapter(max_retries=3)
self._session.mount("https://", adapter)
self.vectara_api_timeout = 90
self.use_core_api = use_core_api
self.doc_ids: List[str] = []
# if nodes is specified, consider each node as a single document
# and use _build_index_from_nodes() to add them to the index
if nodes is not None:
self._build_index_from_nodes(nodes, use_core_api)

    def _build_index_from_nodes(
self, nodes: Sequence[BaseNode], use_core_api: bool = False
) -> IndexDict:
docs = [
Document(
text=node.get_content(metadata_mode=MetadataMode.NONE),
metadata=node.metadata, # type: ignore
id_=node.id_, # type: ignore
)
for node in nodes
]
self.add_documents(docs, use_core_api)
return self.index_struct

    def _get_post_headers(self) -> dict:
"""Returns headers that should be attached to each post request."""
return {
"x-api-key": self._vectara_api_key,
"customer-id": self._vectara_customer_id,
"Content-Type": "application/json",
"X-Source": "llama_index",
}

    def _delete_doc(self, doc_id: str) -> bool:
        """
        Delete a document from the Vectara corpus.

        Args:
            doc_id (str): ID of the document to delete.

        Returns:
            bool: True if deletion was successful, False otherwise.
        """
body = {
"customerId": self._vectara_customer_id,
"corpusId": self._vectara_corpus_id,
"documentId": doc_id,
}
response = self._session.post(
"https://api.vectara.io/v1/delete-doc",
data=json.dumps(body),
verify=True,
headers=self._get_post_headers(),
timeout=self.vectara_api_timeout,
)
if response.status_code != 200:
_logger.error(
f"Delete request failed for doc_id = {doc_id} with status code "
f"{response.status_code}, reason {response.reason}, text "
f"{response.text}"
)
return False
return True
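
    # Call sketch (hypothetical doc id): failures surface through the
    # returned bool and an error log rather than an exception:
    #
    #     ok = index._delete_doc("stale-doc-id")
    #     if not ok:
    #         ...  # decide whether to retry or surface the failure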

    def _index_doc(self, doc: dict) -> str:
        """Index a single document via the Vectara indexing API.

        Returns "E_SUCCEEDED" on success, or an error code string otherwise.
        """
        request: Dict[str, Any] = {}
        request["customerId"] = self._vectara_customer_id
        request["corpusId"] = self._vectara_corpus_id
        request["document"] = doc
        # pre-chunked documents (with "parts") go through the core indexing
        # API; everything else goes through the standard indexing API
        if "parts" in doc:
            api_url = "https://api.vectara.io/v1/core/index"
        else:
            api_url = "https://api.vectara.io/v1/index"
response = self._session.post(
headers=self._get_post_headers(),
url=api_url,
data=json.dumps(request),
timeout=self.vectara_api_timeout,
verify=True,
)
        status_code = response.status_code
        result = response.json()
        status_str = result["status"]["code"] if "status" in result else None
        if status_code == 409 or status_str == "ALREADY_EXISTS":
            return "E_ALREADY_EXISTS"
        elif status_str == "INVALID_ARGUMENT":
            return "E_INVALID_ARGUMENT"
        elif status_str == "FORBIDDEN":
            return "E_NO_PERMISSIONS"
        else:
            return "E_SUCCEEDED"
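
    # Payload sketch (illustrative values): the standard API expects a
    # "section" list, while the core API expects pre-chunked "parts":
    #
    #     standard_doc = {
    #         "documentId": "doc-1",
    #         "metadataJson": '{"framework": "llama_index"}',
    #         "section": [{"text": "full document text"}],
    #     }
    #     core_doc = {
    #         "documentId": "doc-2",
    #         "metadataJson": "{}",
    #         "parts": [{"text": "chunk 1"}, {"text": "chunk 2"}],
    #     }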

    def _insert(
self,
nodes: Sequence[BaseNode],
use_core_api: bool = False,
**insert_kwargs: Any,
) -> None:
"""Insert a set of documents (each a node)."""
def gen_hash(s: str) -> str:
hash_object = blake2b()
hash_object.update(s.encode("utf-8"))
return hash_object.hexdigest()
docs = []
for node in nodes:
metadata = node.metadata.copy()
metadata["framework"] = "llama_index"
section_key = "parts" if use_core_api else "section"
text = node.get_content(metadata_mode=MetadataMode.NONE)
doc_id = gen_hash(text)
doc = {
"documentId": doc_id,
"metadataJson": json.dumps(node.metadata),
section_key: [{"text": text}],
}
docs.append(doc)
        if self.parallelize_ingest:
            with ThreadPoolExecutor() as executor:
                futures = [executor.submit(self._index_doc, doc) for doc in docs]
                for future in futures:
                    ecode = future.result()
                    if ecode != "E_SUCCEEDED":
                        _logger.error(
                            f"Error indexing document in Vectara with error code {ecode}"
                        )
        else:
            for doc in docs:
                ecode = self._index_doc(doc)
                if ecode != "E_SUCCEEDED":
                    _logger.error(
                        f"Error indexing document in Vectara with error code {ecode}"
                    )
        # record every submitted document id, in both ingest modes
        self.doc_ids.extend(doc["documentId"] for doc in docs)
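
    # Note (behavioral sketch): because ids are content hashes, re-inserting
    # identical text produces the same id, and Vectara answers ALREADY_EXISTS
    # (surfaced here as E_ALREADY_EXISTS) instead of creating a duplicate:
    #
    #     from hashlib import blake2b
    #     h1, h2 = blake2b(), blake2b()
    #     h1.update(b"same text"); h2.update(b"same text")
    #     assert h1.hexdigest() == h2.hexdigest()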

    def add_documents(
        self,
        docs: Sequence[Document],
        use_core_api: bool = False,
        allow_update: bool = True,
    ) -> None:
        """Add a sequence of documents, indexing each as its own Vectara document."""
        nodes = [
            TextNode(text=doc.get_content(), metadata=doc.metadata)  # type: ignore
            for doc in docs
        ]
        self._insert(nodes, use_core_api)

    def insert_file(
self,
file_path: str,
metadata: Optional[dict] = None,
**insert_kwargs: Any,
) -> Optional[str]:
"""Vectara provides a way to add files (binary or text) directly via our API
where pre-processing and chunking occurs internally in an optimal way
This method provides a way to use that API in Llama_index.
# ruff: noqa: E501
Full API Docs: https://docs.vectara.com/docs/api-reference/indexing-apis/
file-upload/file-upload-filetypes
Args:
file_path: local file path
Files could be text, HTML, PDF, markdown, doc/docx, ppt/pptx, etc.
see API docs for full list
metadata: Optional list of metadata associated with the file
Returns:
List of ids associated with each of the files indexed
"""
if not os.path.exists(file_path):
_logger.error(f"File {file_path} does not exist")
return None
        metadata = metadata or {}
        metadata["framework"] = "llama_index"
        headers = self._get_post_headers()
        # drop Content-Type so requests can set the multipart boundary itself
        headers.pop("Content-Type")
        with open(file_path, "rb") as f:
            files: dict = {
                "file": (file_path, f),
                "doc_metadata": json.dumps(metadata),
            }
            response = self._session.post(
                f"https://api.vectara.io/upload?c={self._vectara_customer_id}&o={self._vectara_corpus_id}&d=True",
                files=files,
                verify=True,
                headers=headers,
                timeout=self.vectara_api_timeout,
            )
if response.status_code == 409:
doc_id = response.json()["document"]["documentId"]
_logger.info(
f"File {file_path} already exists on Vectara "
f"(doc_id={doc_id}), skipping"
)
return None
elif response.status_code == 200:
return response.json()["document"]["documentId"]
        else:
            _logger.error(f"Error indexing file {file_path}: {response.json()}")
            return None
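
    # Usage sketch (hypothetical path and metadata):
    #
    #     doc_id = index.insert_file("docs/report.pdf", metadata={"team": "search"})
    #     if doc_id is not None:
    #         print(f"indexed as {doc_id}")  # id assigned by Vectara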

    def delete_ref_doc(
        self, ref_doc_id: str, delete_from_docstore: bool = False, **delete_kwargs: Any
    ) -> None:
        raise NotImplementedError(
            "Vectara does not support deleting a reference document"
        )

    def update_ref_doc(self, document: Document, **update_kwargs: Any) -> None:
        raise NotImplementedError(
            "Vectara does not support updating a reference document"
        )

    def as_retriever(self, **kwargs: Any) -> BaseRetriever:
        """Return a Retriever for this managed index."""
        from llama_index.indices.managed.vectara.retriever import VectaraRetriever

        return VectaraRetriever(self, **kwargs)

    def as_query_engine(self, **kwargs: Any) -> BaseQueryEngine:
        """Return a query engine; Vectara summarization is enabled by default."""
        if kwargs.get("summary_enabled", True):
            from llama_index.indices.managed.vectara.query import VectaraQueryEngine

            kwargs["summary_enabled"] = True
            retriever = self.as_retriever(**kwargs)
            return VectaraQueryEngine.from_args(retriever, **kwargs)  # type: ignore
        else:
            from llama_index.query_engine.retriever_query_engine import (
                RetrieverQueryEngine,
            )

            kwargs["retriever"] = self.as_retriever(**kwargs)
            return RetrieverQueryEngine.from_args(**kwargs)
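
    # Mode sketch: summary_enabled=True (the default) asks Vectara to
    # summarize the top-k results; summary_enabled=False returns a plain
    # retriever-backed engine over the raw nodes:
    #
    #     summarizing_engine = index.as_query_engine()
    #     plain_engine = index.as_query_engine(summary_enabled=False)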

    @classmethod
    def from_documents(
        cls: Type[IndexType],
        documents: Sequence[Document],
        storage_context: Optional[StorageContext] = None,
        service_context: Optional[ServiceContext] = None,
        show_progress: bool = False,
        **kwargs: Any,
    ) -> IndexType:
        """Build a Vectara index from a sequence of documents."""
        nodes = [
            TextNode(text=document.get_content(), metadata=document.metadata)  # type: ignore
            for document in documents
        ]
        return cls(
            nodes=nodes,
            show_progress=show_progress,
            **kwargs,
        )
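

# Usage sketch (assumes VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID, and
# VECTARA_API_KEY are set in the environment; the retriever parameter is
# hedged and illustrative):
#
#     from llama_index.schema import Document
#
#     index = VectaraIndex.from_documents(
#         [Document(text="Vectara is a managed retrieval platform.")]
#     )
#     retriever = index.as_retriever(similarity_top_k=5)
#     nodes = retriever.retrieve("What is Vectara?")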