"""Managed index.

A managed Index - where the index is accessible via some API that
interfaces a managed service.

"""
import logging
from typing import Any, Dict, Optional, Sequence, Type

import requests

from llama_index.core.base_retriever import BaseRetriever
from llama_index.data_structs.data_structs import IndexDict, IndexStructType
from llama_index.indices.managed.base import BaseManagedIndex, IndexType
from llama_index.schema import BaseNode, Document
from llama_index.service_context import ServiceContext
from llama_index.storage.storage_context import StorageContext

logger = logging.getLogger(__name__)

PIPELINE_TYPES = ["INGESTION", "SEARCH", "DELETION"]


def get_zcp_type(value: Any) -> str:
    """Return the Zilliz Cloud Pipelines field type name for a metadata value.

    Args:
        value: A metadata value; must be a str, bool, int, or float.

    Returns:
        One of 'VarChar', 'Bool', 'Int64', 'Double'.

    Raises:
        TypeError: If the value is of any other type.
    """
    if isinstance(value, str):
        return "VarChar"
    # bool is tested before int because bool is a subclass of int.
    elif isinstance(value, bool):
        return "Bool"
    elif isinstance(value, int):
        return "Int64"
    elif isinstance(value, float):
        return "Double"
    else:
        raise TypeError(
            "Invalid data type of metadata: must be str, bool, int, or float."
        )


def _parse_response(response: Any) -> Dict:
    """Validate a pipelines API response and return its decoded JSON body.

    Args:
        response: The object returned by ``requests.get`` / ``requests.post``.

    Returns:
        The decoded JSON body (a dictionary containing at least "code").

    Raises:
        RuntimeError: If the HTTP status code is not 200 (carries the response
            text), or if the "code" field of the JSON body is not 200 (carries
            the decoded body).
    """
    if response.status_code != 200:
        raise RuntimeError(response.text)
    response_dict = response.json()
    if response_dict["code"] != 200:
        raise RuntimeError(response_dict)
    return response_dict


class ZillizCloudPipelineIndexStruct(IndexDict):
    """Zilliz Cloud Pipeline's Index Struct."""

    @classmethod
    def get_type(cls) -> IndexStructType:
        """Get index struct type."""
        return IndexStructType.ZILLIZ_CLOUD_PIPELINE


class ZillizCloudPipelineIndex(BaseManagedIndex):
    """Zilliz Cloud Pipeline's Index.

    The Zilliz Cloud Pipeline's index implements a managed index that uses
    Zilliz Cloud Pipelines as the backend.

    Args:
        project_id (str): Zilliz Cloud's project ID.
        cluster_id (str): Zilliz Cloud's cluster ID.
        token (str): Zilliz Cloud's token.
        cloud_region (str='gcp-us-west1'): The region of Zilliz Cloud's cluster.
            Defaults to 'gcp-us-west1'.
        pipeline_ids (dict=None): A dictionary of pipeline ids for INGESTION,
            SEARCH, DELETION. Defaults to None.
        collection_name (str='zcp_llamalection'): A collection name, defaults to
            'zcp_llamalection'. If no pipeline_ids is given, get pipelines with
            collection_name.
        show_progress (bool): Whether to show tqdm progress bars.
            Defaults to False.
    """

    def __init__(
        self,
        project_id: str,
        cluster_id: str,
        token: str,
        cloud_region: str = "gcp-us-west1",
        pipeline_ids: Optional[Dict] = None,
        collection_name: str = "zcp_llamalection",
        show_progress: bool = False,
        **kwargs: Any,
    ) -> None:
        self.project_id = project_id
        self.cluster_id = cluster_id
        self.token = token
        self.cloud_region = cloud_region
        self.collection_name = collection_name
        self.domain = (
            f"https://controller.api.{cloud_region}.zillizcloud.com/v1/pipelines"
        )
        self.headers = {
            "Authorization": f"Bearer {token}",
            "Accept": "application/json",
            "Content-Type": "application/json",
        }
        # When no (or an empty) mapping is given, look the pipelines up remotely
        # by cluster id and collection name.
        self.pipeline_ids = pipeline_ids or self.get_pipeline_ids()

        index_struct = ZillizCloudPipelineIndexStruct(
            index_id=collection_name,
            summary="Zilliz Cloud Pipeline Index",
        )

        super().__init__(
            show_progress=show_progress, index_struct=index_struct, **kwargs
        )

        if len(self.pipeline_ids) == 0:
            print("No available pipelines. Please create pipelines first.")
        else:
            # NOTE(review): `assert` is skipped under `python -O`; it is kept so
            # that callers still observe AssertionError for a partial mapping.
            assert set(PIPELINE_TYPES).issubset(
                set(self.pipeline_ids.keys())
            ), f"Missing pipeline(s): {set(PIPELINE_TYPES) - set(self.pipeline_ids.keys())}"

    def _get_pipeline_id(self, pipeline_type: str) -> Any:
        """Return the id stored for ``pipeline_type`` or fail with a clear error.

        Raises:
            RuntimeError: If ``self.pipeline_ids`` has no entry for the type.
        """
        pipeline_id = self.pipeline_ids.get(pipeline_type)
        if pipeline_id is None:
            # Fail before a request is sent to ".../None/run".
            raise RuntimeError(
                f"No {pipeline_type} pipeline is available for collection "
                f"{self.collection_name}. Please create pipelines first."
            )
        return pipeline_id

    def insert_doc_url(self, url: str, metadata: Optional[Dict] = None) -> Any:
        """Insert doc from url with an initialized index.

        Args:
            url: URL of the document, sent as the "doc_url" input field.
            metadata: Optional extra fields merged into the request data next
                to "doc_url". Defaults to None (no extra fields).

        Returns:
            The "data" field of the ingestion pipeline's JSON response.

        Raises:
            RuntimeError: If no INGESTION pipeline id is known, if the HTTP
                status is not 200, or if the response "code" is not 200.

        Example:
        >>> from llama_index.indices import ZillizCloudPipelineIndex
        >>> index = ZillizCloudPipelineIndex(
        >>>     project_id='YOUR_ZILLIZ_CLOUD_PROJECT_ID',
        >>>     cluster_id='YOUR_ZILLIZ_CLOUD_CLUSTER_ID',
        >>>     token='YOUR_ZILLIZ_CLOUD_API_KEY',
        >>>     collection_name='your_collection_name'
        >>> )
        >>> index.insert_doc_url(
        >>>     url='https://oss_bucket.test_doc.ext',
        >>>     metadata={'year': 2023, 'author': 'zilliz'}  # only required when the Index was created with metadata schemas
        >>> )
        """
        ingest_pipe_id = self._get_pipeline_id("INGESTION")
        ingestion_url = f"{self.domain}/{ingest_pipe_id}/run"

        if metadata is None:
            metadata = {}
        # Build a fresh dict so the caller's metadata is never mutated.
        params = {"data": {"doc_url": url}}
        params["data"].update(metadata)
        response = requests.post(ingestion_url, headers=self.headers, json=params)
        response_dict = _parse_response(response)
        return response_dict["data"]

    def delete_by_doc_name(self, doc_name: str) -> int:
        """Run the DELETION pipeline for the document named ``doc_name``.

        Args:
            doc_name: Document name, sent as the "doc_name" input field.

        Returns:
            The "data" field of the deletion pipeline's JSON response.

        Raises:
            RuntimeError: If no DELETION pipeline id is known, if the HTTP
                status is not 200, if the response "code" is not 200, or if
                the response has no "data" field.
        """
        deletion_pipe_id = self._get_pipeline_id("DELETION")
        deletion_url = f"{self.domain}/{deletion_pipe_id}/run"

        params = {"data": {"doc_name": doc_name}}
        response = requests.post(deletion_url, headers=self.headers, json=params)
        response_dict = _parse_response(response)
        try:
            return response_dict["data"]
        except KeyError as e:
            raise RuntimeError(f"Run Zilliz Cloud Pipelines failed: {e}") from e

    def as_retriever(self, **kwargs: Any) -> BaseRetriever:
        """Return a retriever."""
        # Imported lazily, as in the original implementation.
        from llama_index.indices.managed.zilliz.retriever import (
            ZillizCloudPipelineRetriever,
        )

        return ZillizCloudPipelineRetriever(self, **kwargs)

    def get_pipeline_ids(self) -> dict:
        """Get pipeline ids.

        Lists the pipelines of ``self.project_id`` and keeps those whose
        cluster id and collection name equal ``self.cluster_id`` and
        ``self.collection_name``.

        Returns:
            A dictionary mapping pipeline type ('INGESTION', 'SEARCH',
            'DELETION') to pipeline id; empty when nothing matches. If several
            pipelines of one type match, the last one listed wins.

        Raises:
            RuntimeError: If the HTTP status is not 200 or the response "code"
                is not 200.
        """
        url = f"{self.domain}?projectId={self.project_id}"

        # Get pipelines
        response = requests.get(url, headers=self.headers)
        response_dict = _parse_response(response)
        data = response_dict["data"]

        pipeline_ids = {}
        for pipe_info in data:
            pipe_id = pipe_info["pipelineId"]
            pipe_type = pipe_info["type"]

            if pipe_type == "SEARCH":
                # Cluster and collection must match on the SAME function; testing
                # them independently could pair values from different functions.
                if any(
                    func["clusterId"] == self.cluster_id
                    and func["collectionName"] == self.collection_name
                    for func in pipe_info["functions"]
                ):
                    pipeline_ids[pipe_type] = pipe_id
            elif pipe_type == "INGESTION":
                if (
                    self.cluster_id == pipe_info["clusterId"]
                    and self.collection_name == pipe_info["newCollectionName"]
                ):
                    pipeline_ids[pipe_type] = pipe_id
            elif pipe_type == "DELETION":
                if (
                    self.cluster_id == pipe_info["clusterId"]
                    and self.collection_name == pipe_info["collectionName"]
                ):
                    pipeline_ids[pipe_type] = pipe_id
        return pipeline_ids

    def create_pipelines(
        self, metadata_schema: Optional[Dict] = None, **kwargs: Any
    ) -> dict:
        """Create INGESTION, SEARCH, DELETION pipelines using self.collection_name.

        Args:
            metadata_schema (Dict=None): A dictionary of metadata schema, defaults to None.
                Use metadata name as key and the corresponding data type as value: {'field_name': 'field_type'}.
                Only support the following values as the field type: 'Bool', 'Int8', 'Int16', 'Int32', 'Int64', 'Float', 'Double', 'VarChar'.
            kwargs: optional parameters to create ingestion pipeline
                - chunkSize: An integer within range [20, 500] to customize chunk size.
                - language: The language of documents. Available options: "ENGLISH", "CHINESE".

        Returns:
            A dictionary of pipeline ids for INGESTION, SEARCH, and DELETION pipelines.

        Raises:
            RuntimeError: If pipelines are already registered on this index, if
                an HTTP status is not 200, or if a response "code" is not 200.

        Example:
        >>> from llama_index.indices import ZillizCloudPipelineIndex
        >>> index = ZillizCloudPipelineIndex(
        >>>     project_id='YOUR_ZILLIZ_CLOUD_PROJECT_ID',
        >>>     cluster_id='YOUR_ZILLIZ_CLOUD_CLUSTER_ID',
        >>>     token='YOUR_ZILLIZ_CLOUD_API_KEY',
        >>>     collection_name='your_new_collection_name'
        >>> )
        >>> pipeline_ids = index.create_pipelines(
        >>>     metadata_schema={'year': 'Int32', 'author': 'VarChar'}  # optional, defaults to None
        >>> )
        """
        if len(self.pipeline_ids) > 0:
            raise RuntimeError(
                f"Pipelines already exist for collection {self.collection_name}: {self.pipeline_ids}"
            )

        params_dict = {}
        index_doc_func = {
            "name": "index_my_doc",
            "action": "INDEX_DOC",
            "inputField": "doc_url",
            "language": "ENGLISH",
        }
        # Caller-supplied options (e.g. chunkSize, language) override the defaults.
        index_doc_func.update(kwargs)
        functions = [index_doc_func]
        if metadata_schema:
            # One PRESERVE function per metadata field.
            for k, v in metadata_schema.items():
                preserve_func = {
                    "name": f"keep_{k}",
                    "action": "PRESERVE",
                    "inputField": k,
                    "outputField": k,
                    "fieldType": v,
                }
                functions.append(preserve_func)
        params_dict["INGESTION"] = {
            "name": f"{self.collection_name}_ingestion",
            "projectId": self.project_id,
            "clusterId": self.cluster_id,
            "newCollectionName": self.collection_name,
            "type": "INGESTION",
            "functions": functions,
        }

        params_dict["SEARCH"] = {
            "name": f"{self.collection_name}_search",
            "projectId": self.project_id,
            "type": "SEARCH",
            "functions": [
                {
                    "name": "search_chunk_text",
                    "action": "SEARCH_DOC_CHUNK",
                    "inputField": "query_text",
                    "clusterId": self.cluster_id,
                    "collectionName": self.collection_name,
                }
            ],
        }

        params_dict["DELETION"] = {
            "name": f"{self.collection_name}_deletion",
            "type": "DELETION",
            "functions": [
                {
                    "name": "purge_chunks_by_doc_name",
                    "action": "PURGE_DOC_INDEX",
                    "inputField": "doc_name",
                }
            ],
            "projectId": self.project_id,
            "clusterId": self.cluster_id,
            "collectionName": self.collection_name,
        }

        # Pipelines are created in insertion order: INGESTION, SEARCH, DELETION.
        for k, v in params_dict.items():
            response = requests.post(self.domain, headers=self.headers, json=v)
            response_dict = _parse_response(response)
            self.pipeline_ids[k] = response_dict["data"]["pipelineId"]

        return self.pipeline_ids

    @classmethod
    def from_document_url(
        cls,
        url: str,
        project_id: str,
        cluster_id: str,
        token: str,
        cloud_region: str = "gcp-us-west1",
        pipeline_ids: Optional[Dict] = None,
        collection_name: str = "zcp_llamalection",
        metadata: Optional[Dict] = None,
        show_progress: bool = False,
        **kwargs: Any,
    ) -> BaseManagedIndex:
        """Zilliz Cloud Pipeline loads document from a signed url and then builds auto index for it.

        Args:
            url: a gcs or s3 signed url.
            project_id (str): Zilliz Cloud's project ID.
            cluster_id (str): Zilliz Cloud's cluster ID.
            token (str): Zilliz Cloud's token.
            cloud_region (str='gcp-us-west1'): The region of Zilliz Cloud's cluster. Defaults to 'gcp-us-west1'.
            pipeline_ids (dict=None): A dictionary of pipeline ids for INGESTION, SEARCH, DELETION. Defaults to None.
            collection_name (str='zcp_llamalection'): A collection name, defaults to 'zcp_llamalection'. If no pipeline_ids is given, get or create pipelines with collection_name.
            metadata (Dict=None): A dictionary of metadata. Defaults to None. The key must be string and the value must be a string, float, integer, or boolean.
            show_progress (bool): Whether to show tqdm progress bars. Defaults to False.

        Returns:
            An initialized ZillizCloudPipelineIndex. A failure while inserting
            the document is logged, not raised, and the index is still returned.

        Example:
        >>> from llama_index.indices import ZillizCloudPipelineIndex
        >>> index = ZillizCloudPipelineIndex.from_document_url(
        >>>     url='https://oss_bucket.test_doc.ext',
        >>>     project_id='YOUR_ZILLIZ_CLOUD_PROJECT_ID',
        >>>     cluster_id='YOUR_ZILLIZ_CLOUD_CLUSTER_ID',
        >>>     token='YOUR_ZILLIZ_CLOUD_API_KEY',
        >>>     collection_name='your_collection_name'
        >>> )
        """
        metadata = metadata or {}
        index = cls(
            project_id=project_id,
            cluster_id=cluster_id,
            token=token,
            cloud_region=cloud_region,
            pipeline_ids=pipeline_ids,
            collection_name=collection_name,
            show_progress=show_progress,
            **kwargs,
        )
        if len(index.pipeline_ids) == 0:
            # Derive the metadata schema from the Python types of the values.
            index.pipeline_ids = index.create_pipelines(
                metadata_schema={k: get_zcp_type(v) for k, v in metadata.items()}
            )
            print("Pipelines are automatically created.")

        # Deliberate best-effort: an ingestion failure is logged and the
        # (possibly empty) index is returned to the caller anyway.
        try:
            index.insert_doc_url(url=url, metadata=metadata)
        except Exception as e:
            logger.error(
                "Failed to build managed index given document url (%s):\n%s", url, e
            )
        return index

    def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
        """Not supported: always raises NotImplementedError."""
        raise NotImplementedError(
            "Inserting nodes is not yet supported with Zilliz Cloud Pipeline."
        )

    def delete_ref_doc(
        self, ref_doc_id: str, delete_from_docstore: bool = False, **delete_kwargs: Any
    ) -> None:
        """Not supported: always raises NotImplementedError."""
        raise NotImplementedError(
            "Deleting a reference document is not yet supported with Zilliz Cloud Pipeline."
        )

    def update_ref_doc(self, document: Document, **update_kwargs: Any) -> None:
        """Not supported: always raises NotImplementedError."""
        raise NotImplementedError(
            "Updating referenced document is not yet supported with Zilliz Cloud Pipeline."
        )

    @classmethod
    def from_documents(
        cls: Type[IndexType],
        documents: Sequence[Document],
        storage_context: Optional[StorageContext] = None,
        service_context: Optional[ServiceContext] = None,
        show_progress: bool = False,
        **kwargs: Any,
    ) -> IndexType:
        """Build a Zilliz Cloud Pipeline index from a sequence of documents."""
        raise NotImplementedError(
            "Loading from document texts is not yet supported with Zilliz Cloud Pipeline."
        )

    def _build_index_from_nodes(self, nodes: Sequence[BaseNode]) -> IndexDict:
        """Not supported: always raises NotImplementedError."""
        raise NotImplementedError(
            "Building index from nodes is not yet supported with Zilliz Cloud Pipeline."
        )

    def _delete_node(self, node_id: str, **delete_kwargs: Any) -> None:
        """Not supported: always raises NotImplementedError."""
        raise NotImplementedError(
            "Deleting nodes is not yet supported with Zilliz Cloud Pipeline."
        )