# faiss_rag_enterprise/llama_index/indices/managed/zilliz/base.py
"""Managed index.
A managed Index - where the index is accessible via some API that
interfaces a managed service.
"""
import logging
from typing import Any, Dict, Optional, Sequence, Type
import requests
from llama_index.core.base_retriever import BaseRetriever
from llama_index.data_structs.data_structs import IndexDict, IndexStructType
from llama_index.indices.managed.base import BaseManagedIndex, IndexType
from llama_index.schema import BaseNode, Document
from llama_index.service_context import ServiceContext
from llama_index.storage.storage_context import StorageContext
logger = logging.getLogger(__name__)
PIPELINE_TYPES = ["INGESTION", "SEARCH", "DELETION"]
def get_zcp_type(value: Any) -> str:
    """Map a Python metadata value to the corresponding Zilliz Cloud field type."""
    if isinstance(value, str):
        return "VarChar"
    elif isinstance(value, bool):
        return "Bool"
    elif isinstance(value, int):
        return "Int64"
    elif isinstance(value, float):
        return "Double"
    else:
        raise TypeError(
            "Invalid data type of metadata: must be str, bool, int, or float."
        )
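# Illustrative mapping (note that the bool check precedes the int check,
# because bool is a subclass of int in Python):
#   get_zcp_type("zilliz")  -> "VarChar"
#   get_zcp_type(True)      -> "Bool"
#   get_zcp_type(2023)      -> "Int64"
#   get_zcp_type(0.5)       -> "Double"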
class ZillizCloudPipelineIndexStruct(IndexDict):
"""Zilliz Cloud Pipeline's Index Struct."""
@classmethod
def get_type(cls) -> IndexStructType:
"""Get index struct type."""
return IndexStructType.ZILLIZ_CLOUD_PIPELINE
class ZillizCloudPipelineIndex(BaseManagedIndex):
"""Zilliz Cloud Pipeline's Index.
    This index implements a managed index that uses Zilliz Cloud Pipelines as the backend.
Args:
project_id (str): Zilliz Cloud's project ID.
cluster_id (str): Zilliz Cloud's cluster ID.
token (str): Zilliz Cloud's token.
        cloud_region (str): The region of Zilliz Cloud's cluster. Defaults to 'gcp-us-west1'.
        pipeline_ids (dict): A dictionary of pipeline ids for INGESTION, SEARCH, and DELETION pipelines. Defaults to None.
        collection_name (str): A collection name. Defaults to 'zcp_llamalection'. If no pipeline_ids is given, pipelines are looked up by collection_name.
        show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
"""
def __init__(
self,
project_id: str,
cluster_id: str,
token: str,
cloud_region: str = "gcp-us-west1",
pipeline_ids: Optional[Dict] = None,
collection_name: str = "zcp_llamalection",
show_progress: bool = False,
**kwargs: Any,
) -> None:
self.project_id = project_id
self.cluster_id = cluster_id
self.token = token
self.cloud_region = cloud_region
self.collection_name = collection_name
self.domain = (
f"https://controller.api.{cloud_region}.zillizcloud.com/v1/pipelines"
)
self.headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/json",
"Content-Type": "application/json",
}
self.pipeline_ids = pipeline_ids or self.get_pipeline_ids()
index_struct = ZillizCloudPipelineIndexStruct(
index_id=collection_name,
summary="Zilliz Cloud Pipeline Index",
)
super().__init__(
show_progress=show_progress, index_struct=index_struct, **kwargs
)
        if len(self.pipeline_ids) == 0:
            logger.warning("No available pipelines. Please create pipelines first.")
        else:
            assert set(PIPELINE_TYPES).issubset(
                set(self.pipeline_ids.keys())
            ), f"Missing pipeline(s): {set(PIPELINE_TYPES) - set(self.pipeline_ids.keys())}"
    def insert_doc_url(self, url: str, metadata: Optional[Dict] = None) -> dict:
        """Insert a document from a URL into an initialized index.

        Returns the `data` field of the pipeline run response.

        Example:
>>> from llama_index.indices import ZillizCloudPipelineIndex
>>> index = ZillizCloudPipelineIndex(
>>> project_id='YOUR_ZILLIZ_CLOUD_PROJECT_ID',
>>> cluster_id='YOUR_ZILLIZ_CLOUD_CLUSTER_ID',
>>> token='YOUR_ZILLIZ_CLOUD_API_KEY',
>>> collection_name='your_collection_name'
>>> )
>>> index.insert_doc_url(
>>> url='https://oss_bucket.test_doc.ext',
>>> metadata={'year': 2023, 'author': 'zilliz'} # only required when the Index was created with metadata schemas
>>> )
"""
ingest_pipe_id = self.pipeline_ids.get("INGESTION")
ingestion_url = f"{self.domain}/{ingest_pipe_id}/run"
if metadata is None:
metadata = {}
params = {"data": {"doc_url": url}}
params["data"].update(metadata)
response = requests.post(ingestion_url, headers=self.headers, json=params)
if response.status_code != 200:
raise RuntimeError(response.text)
response_dict = response.json()
if response_dict["code"] != 200:
raise RuntimeError(response_dict)
return response_dict["data"]
    def delete_by_doc_name(self, doc_name: str) -> int:
        """Delete a document's chunks from the index by document name."""
deletion_pipe_id = self.pipeline_ids.get("DELETION")
deletion_url = f"{self.domain}/{deletion_pipe_id}/run"
params = {"data": {"doc_name": doc_name}}
response = requests.post(deletion_url, headers=self.headers, json=params)
if response.status_code != 200:
raise RuntimeError(response.text)
response_dict = response.json()
if response_dict["code"] != 200:
raise RuntimeError(response_dict)
        try:
            return response_dict["data"]
        except KeyError as e:
            raise RuntimeError(f"Run Zilliz Cloud Pipelines failed: {e}") from e
def as_retriever(self, **kwargs: Any) -> BaseRetriever:
"""Return a retriever."""
from llama_index.indices.managed.zilliz.retriever import (
ZillizCloudPipelineRetriever,
)
return ZillizCloudPipelineRetriever(self, **kwargs)
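    # Usage sketch; `retrieve` is inherited from BaseRetriever, and the query
    # string is illustrative:
    #   retriever = index.as_retriever()
    #   nodes = retriever.retrieve("What does Zilliz Cloud Pipelines do?")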
    def get_pipeline_ids(self) -> dict:
        """Get the ids of the pipelines associated with self.cluster_id and self.collection_name."""
url = f"{self.domain}?projectId={self.project_id}"
# Get pipelines
response = requests.get(url, headers=self.headers)
if response.status_code != 200:
raise RuntimeError(response.text)
response_dict = response.json()
if response_dict["code"] != 200:
raise RuntimeError(response_dict)
data = response_dict["data"]
pipeline_ids = {}
for pipe_info in data:
pipe_id = pipe_info["pipelineId"]
pipe_type = pipe_info["type"]
if pipe_type == "SEARCH":
pipe_clusters = [x["clusterId"] for x in pipe_info["functions"]]
pipe_collections = [x["collectionName"] for x in pipe_info["functions"]]
if (
self.cluster_id in pipe_clusters
and self.collection_name in pipe_collections
):
pipeline_ids[pipe_type] = pipe_id
elif pipe_type == "INGESTION":
if (
self.cluster_id == pipe_info["clusterId"]
and self.collection_name == pipe_info["newCollectionName"]
):
pipeline_ids[pipe_type] = pipe_id
elif pipe_type == "DELETION":
if (
self.cluster_id == pipe_info["clusterId"]
and self.collection_name == pipe_info["collectionName"]
):
pipeline_ids[pipe_type] = pipe_id
return pipeline_ids
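    # The returned mapping is keyed by pipeline type, e.g. (illustrative ids):
    #   {"INGESTION": "pipe-123", "SEARCH": "pipe-456", "DELETION": "pipe-789"}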
    def create_pipelines(
        self, metadata_schema: Optional[Dict] = None, **kwargs: Any
    ) -> dict:
"""Create INGESTION, SEARCH, DELETION pipelines using self.collection_name.
Args:
            metadata_schema (Dict): A dictionary of metadata schema. Defaults to None. Use the metadata name as key and the corresponding data type as value: {'field_name': 'field_type'}.
                Only the following field types are supported: 'Bool', 'Int8', 'Int16', 'Int32', 'Int64', 'Float', 'Double', 'VarChar'.
            kwargs: Optional parameters for creating the ingestion pipeline:
                - chunkSize: An integer in the range [20, 500] to customize the chunk size.
                - language: The language of the documents. Available options: "ENGLISH", "CHINESE".
Returns:
A dictionary of pipeline ids for INGESTION, SEARCH, and DELETION pipelines.
Example:
>>> from llama_index.indices import ZillizCloudPipelineIndex
>>> index = ZillizCloudPipelineIndex(
>>> project_id='YOUR_ZILLIZ_CLOUD_PROJECT_ID',
>>> cluster_id='YOUR_ZILLIZ_CLOUD_CLUSTER_ID',
>>> token='YOUR_ZILLIZ_CLOUD_API_KEY',
>>> collection_name='your_new_collection_name'
>>> )
>>> pipeline_ids = index.create_pipelines(
>>> metadata_schema={'year': 'Int32', 'author': 'VarChar'} # optional, defaults to None
>>> )
"""
if len(self.pipeline_ids) > 0:
raise RuntimeError(
f"Pipelines already exist for collection {self.collection_name}: {self.pipeline_ids}"
)
params_dict = {}
index_doc_func = {
"name": "index_my_doc",
"action": "INDEX_DOC",
"inputField": "doc_url",
"language": "ENGLISH",
}
index_doc_func.update(kwargs)
functions = [index_doc_func]
if metadata_schema:
for k, v in metadata_schema.items():
preserve_func = {
"name": f"keep_{k}",
"action": "PRESERVE",
"inputField": k,
"outputField": k,
"fieldType": v,
}
functions.append(preserve_func)
params_dict["INGESTION"] = {
"name": f"{self.collection_name}_ingestion",
"projectId": self.project_id,
"clusterId": self.cluster_id,
"newCollectionName": self.collection_name,
"type": "INGESTION",
"functions": functions,
}
params_dict["SEARCH"] = {
"name": f"{self.collection_name}_search",
"projectId": self.project_id,
"type": "SEARCH",
"functions": [
{
"name": "search_chunk_text",
"action": "SEARCH_DOC_CHUNK",
"inputField": "query_text",
"clusterId": self.cluster_id,
"collectionName": self.collection_name,
}
],
}
params_dict["DELETION"] = {
"name": f"{self.collection_name}_deletion",
"type": "DELETION",
"functions": [
{
"name": "purge_chunks_by_doc_name",
"action": "PURGE_DOC_INDEX",
"inputField": "doc_name",
}
],
"projectId": self.project_id,
"clusterId": self.cluster_id,
"collectionName": self.collection_name,
}
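        # Each payload above is POSTed to the pipelines endpoint. An
        # illustrative INGESTION payload with one metadata field:
        #   {"name": "my_docs_ingestion", "projectId": "...", "clusterId": "...",
        #    "newCollectionName": "my_docs", "type": "INGESTION",
        #    "functions": [
        #        {"name": "index_my_doc", "action": "INDEX_DOC",
        #         "inputField": "doc_url", "language": "ENGLISH"},
        #        {"name": "keep_year", "action": "PRESERVE", "inputField": "year",
        #         "outputField": "year", "fieldType": "Int32"}]}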
for k, v in params_dict.items():
response = requests.post(self.domain, headers=self.headers, json=v)
if response.status_code != 200:
raise RuntimeError(response.text)
response_dict = response.json()
if response_dict["code"] != 200:
raise RuntimeError(response_dict)
self.pipeline_ids[k] = response_dict["data"]["pipelineId"]
return self.pipeline_ids
@classmethod
def from_document_url(
cls,
url: str,
project_id: str,
cluster_id: str,
token: str,
cloud_region: str = "gcp-us-west1",
pipeline_ids: Optional[Dict] = None,
collection_name: str = "zcp_llamalection",
metadata: Optional[Dict] = None,
show_progress: bool = False,
**kwargs: Any,
) -> BaseManagedIndex:
"""Zilliz Cloud Pipeline loads document from a signed url and then builds auto index for it.
Args:
url: a gcs or s3 signed url.
project_id (str): Zilliz Cloud's project ID.
cluster_id (str): Zilliz Cloud's cluster ID.
token (str): Zilliz Cloud's token.
            cloud_region (str): The region of Zilliz Cloud's cluster. Defaults to 'gcp-us-west1'.
            pipeline_ids (dict): A dictionary of pipeline ids for INGESTION, SEARCH, and DELETION pipelines. Defaults to None.
            collection_name (str): A collection name. Defaults to 'zcp_llamalection'. If no pipeline_ids is given, pipelines are retrieved or created using collection_name.
            metadata (Dict): A dictionary of metadata. Defaults to None. The key must be a string, and the value must be a string, float, integer, or boolean.
show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
Returns:
An initialized ZillizCloudPipelineIndex
Example:
>>> from llama_index.indices import ZillizCloudPipelineIndex
>>> index = ZillizCloudPipelineIndex.from_document_url(
>>> url='https://oss_bucket.test_doc.ext',
>>> project_id='YOUR_ZILLIZ_CLOUD_PROJECT_ID',
>>> cluster_id='YOUR_ZILLIZ_CLOUD_CLUSTER_ID',
>>> token='YOUR_ZILLIZ_CLOUD_API_KEY',
>>> collection_name='your_collection_name'
>>> )
"""
metadata = metadata or {}
index = cls(
project_id=project_id,
cluster_id=cluster_id,
token=token,
cloud_region=cloud_region,
pipeline_ids=pipeline_ids,
collection_name=collection_name,
show_progress=show_progress,
**kwargs,
)
if len(index.pipeline_ids) == 0:
index.pipeline_ids = index.create_pipelines(
metadata_schema={k: get_zcp_type(v) for k, v in metadata.items()}
)
print("Pipelines are automatically created.")
try:
index.insert_doc_url(url=url, metadata=metadata)
except Exception as e:
logger.error(
"Failed to build managed index given document url (%s):\n%s", url, e
)
return index
def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
raise NotImplementedError(
"Inserting nodes is not yet supported with Zilliz Cloud Pipeline."
)
def delete_ref_doc(
self, ref_doc_id: str, delete_from_docstore: bool = False, **delete_kwargs: Any
) -> None:
raise NotImplementedError(
"Deleting a reference document is not yet supported with Zilliz Cloud Pipeline."
)
def update_ref_doc(self, document: Document, **update_kwargs: Any) -> None:
raise NotImplementedError(
"Updating referenced document is not yet supported with Zilliz Cloud Pipeline."
)
@classmethod
def from_documents(
cls: Type[IndexType],
documents: Sequence[Document],
storage_context: Optional[StorageContext] = None,
service_context: Optional[ServiceContext] = None,
show_progress: bool = False,
**kwargs: Any,
) -> IndexType:
"""Build a Zilliz Cloud Pipeline index from a sequence of documents."""
raise NotImplementedError(
"Loading from document texts is not yet supported with Zilliz Cloud Pipeline."
)
def _build_index_from_nodes(self, nodes: Sequence[BaseNode]) -> IndexDict:
raise NotImplementedError(
"Building index from nodes is not yet supported with Zilliz Cloud Pipeline."
)
def _delete_node(self, node_id: str, **delete_kwargs: Any) -> None:
raise NotImplementedError(
"Deleting nodes is not yet supported with Zilliz Cloud Pipeline."
)