"""Elasticsearch/Opensearch vector store.""" import json import uuid from typing import Any, Dict, Iterable, List, Optional, Union, cast from llama_index.schema import BaseNode, MetadataMode, TextNode from llama_index.vector_stores.types import ( MetadataFilters, VectorStore, VectorStoreQuery, VectorStoreQueryMode, VectorStoreQueryResult, ) from llama_index.vector_stores.utils import metadata_dict_to_node, node_to_metadata_dict IMPORT_OPENSEARCH_PY_ERROR = ( "Could not import OpenSearch. Please install it with `pip install opensearch-py`." ) INVALID_HYBRID_QUERY_ERROR = ( "Please specify the lexical_query and search_pipeline for hybrid search." ) MATCH_ALL_QUERY = {"match_all": {}} # type: Dict def _import_opensearch() -> Any: """Import OpenSearch if available, otherwise raise error.""" try: from opensearchpy import OpenSearch except ImportError: raise ValueError(IMPORT_OPENSEARCH_PY_ERROR) return OpenSearch def _import_bulk() -> Any: """Import bulk if available, otherwise raise error.""" try: from opensearchpy.helpers import bulk except ImportError: raise ValueError(IMPORT_OPENSEARCH_PY_ERROR) return bulk def _import_not_found_error() -> Any: """Import not found error if available, otherwise raise error.""" try: from opensearchpy.exceptions import NotFoundError except ImportError: raise ValueError(IMPORT_OPENSEARCH_PY_ERROR) return NotFoundError def _get_opensearch_client(opensearch_url: str, **kwargs: Any) -> Any: """Get OpenSearch client from the opensearch_url, otherwise raise error.""" try: opensearch = _import_opensearch() client = opensearch(opensearch_url, **kwargs) except ValueError as e: raise ValueError( f"OpenSearch client string provided is not in proper format. " f"Got error: {e} " ) return client def _bulk_ingest_embeddings( client: Any, index_name: str, embeddings: List[List[float]], texts: Iterable[str], metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, vector_field: str = "embedding", text_field: str = "content", mapping: Optional[Dict] = None, max_chunk_bytes: Optional[int] = 1 * 1024 * 1024, is_aoss: bool = False, ) -> List[str]: """Bulk Ingest Embeddings into given index.""" if not mapping: mapping = {} bulk = _import_bulk() not_found_error = _import_not_found_error() requests = [] return_ids = [] mapping = mapping try: client.indices.get(index=index_name) except not_found_error: client.indices.create(index=index_name, body=mapping) for i, text in enumerate(texts): metadata = metadatas[i] if metadatas else {} _id = ids[i] if ids else str(uuid.uuid4()) request = { "_op_type": "index", "_index": index_name, vector_field: embeddings[i], text_field: text, "metadata": metadata, } if is_aoss: request["id"] = _id else: request["_id"] = _id requests.append(request) return_ids.append(_id) bulk(client, requests, max_chunk_bytes=max_chunk_bytes) if not is_aoss: client.indices.refresh(index=index_name) return return_ids def _default_approximate_search_query( query_vector: List[float], k: int = 4, vector_field: str = "embedding", ) -> Dict: """For Approximate k-NN Search, this is the default query.""" return { "size": k, "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}}, } def _parse_filters(filters: Optional[MetadataFilters]) -> Any: pre_filter = [] if filters is not None: for f in filters.legacy_filters(): pre_filter.append({f.key: json.loads(str(f.value))}) return pre_filter def _knn_search_query( embedding_field: str, query_embedding: List[float], k: int, filters: Optional[MetadataFilters] = None, ) -> Dict: """Do knn search. If there are no filters do approx-knn search. If there are (pre)-filters, do an exhaustive exact knn search using 'painless scripting'. Note that approximate knn search does not support pre-filtering. Args: query_embedding: Vector embedding to query. k: Maximum number of results. filters: Optional filters to apply before the search. Supports filter-context queries documented at https://opensearch.org/docs/latest/query-dsl/query-filter-context/ Returns: Up to k docs closest to query_embedding """ if filters is None: search_query = _default_approximate_search_query( query_embedding, k, vector_field=embedding_field ) else: pre_filter = _parse_filters(filters) # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/ search_query = _default_painless_scripting_query( query_embedding, k, space_type="l2Squared", pre_filter={"bool": {"filter": pre_filter}}, vector_field=embedding_field, ) return search_query def _hybrid_search_query( text_field: str, query_str: str, embedding_field: str, query_embedding: List[float], k: int, filters: Optional[MetadataFilters] = None, ) -> Dict: knn_query = _knn_search_query(embedding_field, query_embedding, k, filters)["query"] lexical_query = {"must": {"match": {text_field: {"query": query_str}}}} parsed_filters = _parse_filters(filters) if len(parsed_filters) > 0: lexical_query["filter"] = parsed_filters return { "size": k, "query": {"hybrid": {"queries": [{"bool": lexical_query}, knn_query]}}, } def __get_painless_scripting_source( space_type: str, vector_field: str = "embedding" ) -> str: """For Painless Scripting, it returns the script source based on space type.""" source_value = f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))" if space_type == "cosineSimilarity": return source_value else: return f"1/{source_value}" def _default_painless_scripting_query( query_vector: List[float], k: int = 4, space_type: str = "l2Squared", pre_filter: Optional[Union[Dict, List]] = None, vector_field: str = "embedding", ) -> Dict: """For Painless Scripting Search, this is the default query.""" if not pre_filter: pre_filter = MATCH_ALL_QUERY source = __get_painless_scripting_source(space_type, vector_field) return { "size": k, "query": { "script_score": { "query": pre_filter, "script": { "source": source, "params": { "field": vector_field, "query_value": query_vector, }, }, } }, } def _is_aoss_enabled(http_auth: Any) -> bool: """Check if the service is http_auth is set as `aoss`.""" if ( http_auth is not None and hasattr(http_auth, "service") and http_auth.service == "aoss" ): return True return False class OpensearchVectorClient: """Object encapsulating an Opensearch index that has vector search enabled. If the index does not yet exist, it is created during init. Therefore, the underlying index is assumed to either: 1) not exist yet or 2) be created due to previous usage of this class. Args: endpoint (str): URL (http/https) of elasticsearch endpoint index (str): Name of the elasticsearch index dim (int): Dimension of the vector embedding_field (str): Name of the field in the index to store embedding array in. text_field (str): Name of the field to grab text from method (Optional[dict]): Opensearch "method" JSON obj for configuring the KNN index. This includes engine, metric, and other config params. Defaults to: {"name": "hnsw", "space_type": "l2", "engine": "faiss", "parameters": {"ef_construction": 256, "m": 48}} **kwargs: Optional arguments passed to the OpenSearch client from opensearch-py. """ def __init__( self, endpoint: str, index: str, dim: int, embedding_field: str = "embedding", text_field: str = "content", method: Optional[dict] = None, max_chunk_bytes: int = 1 * 1024 * 1024, search_pipeline: Optional[str] = None, **kwargs: Any, ): """Init params.""" if method is None: method = { "name": "hnsw", "space_type": "l2", "engine": "nmslib", "parameters": {"ef_construction": 256, "m": 48}, } if embedding_field is None: embedding_field = "embedding" self._embedding_field = embedding_field self._endpoint = endpoint self._dim = dim self._index = index self._text_field = text_field self._max_chunk_bytes = max_chunk_bytes self._search_pipeline = search_pipeline http_auth = kwargs.get("http_auth") self.is_aoss = _is_aoss_enabled(http_auth=http_auth) # initialize mapping idx_conf = { "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}}, "mappings": { "properties": { embedding_field: { "type": "knn_vector", "dimension": dim, "method": method, }, } }, } self._os_client = _get_opensearch_client(self._endpoint, **kwargs) not_found_error = _import_not_found_error() try: self._os_client.indices.get(index=self._index) except not_found_error: self._os_client.indices.create(index=self._index, body=idx_conf) self._os_client.indices.refresh(index=self._index) def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]: """Store results in the index.""" embeddings: List[List[float]] = [] texts: List[str] = [] metadatas: List[dict] = [] ids: List[str] = [] for node in nodes: ids.append(node.node_id) embeddings.append(node.get_embedding()) texts.append(node.get_content(metadata_mode=MetadataMode.NONE)) metadatas.append(node_to_metadata_dict(node, remove_text=True)) return _bulk_ingest_embeddings( self._os_client, self._index, embeddings, texts, metadatas=metadatas, ids=ids, vector_field=self._embedding_field, text_field=self._text_field, mapping=None, max_chunk_bytes=self._max_chunk_bytes, is_aoss=self.is_aoss, ) def delete_doc_id(self, doc_id: str) -> None: """Delete a document. Args: doc_id (str): document id """ self._os_client.delete(index=self._index, id=doc_id) def query( self, query_mode: VectorStoreQueryMode, query_str: Optional[str], query_embedding: List[float], k: int, filters: Optional[MetadataFilters] = None, ) -> VectorStoreQueryResult: if query_mode == VectorStoreQueryMode.HYBRID: if query_str is None or self._search_pipeline is None: raise ValueError(INVALID_HYBRID_QUERY_ERROR) search_query = _hybrid_search_query( self._text_field, query_str, self._embedding_field, query_embedding, k, filters=filters, ) params = {"search_pipeline": self._search_pipeline} else: search_query = _knn_search_query( self._embedding_field, query_embedding, k, filters=filters ) params = None res = self._os_client.search( index=self._index, body=search_query, params=params ) nodes = [] ids = [] scores = [] for hit in res["hits"]["hits"]: source = hit["_source"] node_id = hit["_id"] text = source[self._text_field] metadata = source.get("metadata", None) try: node = metadata_dict_to_node(metadata) node.text = text except Exception: # TODO: Legacy support for old nodes node_info = source.get("node_info") relationships = source.get("relationships") or {} start_char_idx = None end_char_idx = None if isinstance(node_info, dict): start_char_idx = node_info.get("start", None) end_char_idx = node_info.get("end", None) node = TextNode( text=text, metadata=metadata, id_=node_id, start_char_idx=start_char_idx, end_char_idx=end_char_idx, relationships=relationships, extra_info=source, ) ids.append(node_id) nodes.append(node) scores.append(hit["_score"]) return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores) class OpensearchVectorStore(VectorStore): """Elasticsearch/Opensearch vector store. Args: client (OpensearchVectorClient): Vector index client to use for data insertion/querying. """ stores_text: bool = True def __init__( self, client: OpensearchVectorClient, ) -> None: """Initialize params.""" self._client = client @property def client(self) -> Any: """Get client.""" return self._client def add( self, nodes: List[BaseNode], **add_kwargs: Any, ) -> List[str]: """Add nodes to index. Args: nodes: List[BaseNode]: list of nodes with embeddings. """ self._client.index_results(nodes) return [result.node_id for result in nodes] def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None: """ Delete nodes using with ref_doc_id. Args: ref_doc_id (str): The doc_id of the document to delete. """ self._client.delete_doc_id(ref_doc_id) def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: """Query index for top k most similar nodes. Args: query (VectorStoreQuery): Store query object. """ query_embedding = cast(List[float], query.query_embedding) return self._client.query( query.mode, query.query_str, query_embedding, query.similarity_top_k, filters=query.filters, )