"""
Pinecone Vector store index.

An index that is built on top of an existing vector store.

"""

import logging
from collections import Counter
from functools import partial
from typing import Any, Callable, Dict, List, Optional, cast

from llama_index.bridge.pydantic import PrivateAttr
from llama_index.schema import BaseNode, MetadataMode, TextNode
from llama_index.vector_stores.pinecone_utils import _import_pinecone, _is_pinecone_v3
from llama_index.vector_stores.types import (
    BasePydanticVectorStore,
    MetadataFilters,
    VectorStoreQuery,
    VectorStoreQueryMode,
    VectorStoreQueryResult,
)
from llama_index.vector_stores.utils import (
    DEFAULT_TEXT_KEY,
    legacy_metadata_dict_to_node,
    metadata_dict_to_node,
    node_to_metadata_dict,
)

# Keys of the per-vector dictionaries sent to `upsert`.
ID_KEY = "id"
VECTOR_KEY = "values"
SPARSE_VECTOR_KEY = "sparse_values"
METADATA_KEY = "metadata"

DEFAULT_BATCH_SIZE = 100

_logger = logging.getLogger(__name__)

# Standard filter condition -> Pinecone filter key.
_PINECONE_FILTER_CONDITIONS = {
    "and": "$and",
    "or": "$or",
}

# Standard filter operator -> Pinecone filter operator.
_PINECONE_FILTER_OPERATORS = {
    "!=": "$ne",
    "==": "$eq",
    ">": "$gt",
    "<": "$lt",
    ">=": "$gte",
    "<=": "$lte",
    "in": "$in",
    "nin": "$nin",
}


def _transform_pinecone_filter_condition(condition: str) -> str:
    """Translate standard metadata filter op to Pinecone specific spec.

    Args:
        condition: ``"and"`` or ``"or"``.

    Returns:
        The matching Pinecone key (``"$and"`` / ``"$or"``).

    Raises:
        ValueError: if the condition is not one of the supported values.
    """
    try:
        return _PINECONE_FILTER_CONDITIONS[condition]
    except (KeyError, TypeError):
        raise ValueError(f"Filter condition {condition} not supported") from None


def _transform_pinecone_filter_operator(operator: str) -> str:
    """Translate standard metadata filter operator to Pinecone specific spec.

    Args:
        operator: one of ``!=``, ``==``, ``>``, ``<``, ``>=``, ``<=``,
            ``in``, ``nin``.

    Returns:
        The matching Pinecone operator (e.g. ``"$eq"``).

    Raises:
        ValueError: if the operator is not one of the supported values.
    """
    try:
        return _PINECONE_FILTER_OPERATORS[operator]
    except (KeyError, TypeError):
        raise ValueError(f"Filter operator {operator} not supported") from None


def build_dict(input_batch: List[List[int]]) -> List[Dict[str, Any]]:
    """Build a list of sparse dictionaries from a batch of input_ids.

    Each element of the batch becomes ``{"indices": [...], "values": [...]}``
    where ``indices`` are the distinct token ids (in first-seen order) and
    ``values`` are their occurrence counts as floats.

    NOTE: taken from https://www.pinecone.io/learn/hybrid-search-intro/.

    """
    sparse_emb = []
    for token_ids in input_batch:
        # Counter keeps first-insertion order, so indices and values line up.
        frequencies = Counter(token_ids)
        sparse_emb.append(
            {
                "indices": list(frequencies),
                "values": [float(count) for count in frequencies.values()],
            }
        )
    return sparse_emb


def generate_sparse_vectors(
    context_batch: List[str], tokenizer: Callable
) -> List[Dict[str, Any]]:
    """Generate sparse vectors from a batch of contexts.

    The tokenizer is called with the whole batch and its ``"input_ids"``
    entry is converted with :func:`build_dict`.

    NOTE: taken from https://www.pinecone.io/learn/hybrid-search-intro/.

    """
    # create batch of input_ids
    inputs = tokenizer(context_batch)["input_ids"]
    # create sparse dictionaries
    return build_dict(inputs)


def get_default_tokenizer() -> Callable:
    """Get default tokenizer.

    Returns a ``bert-base-uncased`` fast tokenizer with padding, truncation
    and ``max_length=512`` pre-bound, so it can be called with just a list
    of strings.

    NOTE: taken from https://www.pinecone.io/learn/hybrid-search-intro/.

    """
    # Imported lazily so `transformers` is only needed when sparse vectors
    # are actually used.
    from transformers import BertTokenizerFast

    orig_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    # set some default arguments, so input is just a list of strings
    return partial(
        orig_tokenizer,
        padding=True,
        truncation=True,
        max_length=512,
    )


def _to_pinecone_filter(standard_filters: MetadataFilters) -> dict:
    """Convert from standard dataclass to pinecone filter dict.

    Returns:
        ``{}`` when there are no filters, the single filter dict when there
        is exactly one, otherwise ``{"$and"|"$or": [filter, ...]}``.

    Raises:
        ValueError: if the condition or any operator is unsupported.
    """
    # Validated up front, even if it ends up unused (zero or one filter).
    condition = _transform_pinecone_filter_condition(
        standard_filters.condition or "and"
    )

    filters_list = []
    for metadata_filter in standard_filters.filters or []:
        if metadata_filter.operator:
            pinecone_operator = _transform_pinecone_filter_operator(
                metadata_filter.operator
            )
            filters_list.append(
                {metadata_filter.key: {pinecone_operator: metadata_filter.value}}
            )
        else:
            # No operator: plain key/value match.
            filters_list.append({metadata_filter.key: metadata_filter.value})

    if len(filters_list) == 1:
        # If there is only one filter, return it directly
        return filters_list[0]
    if filters_list:
        return {condition: filters_list}
    return {}


import_err_msg = (
    "`pinecone` package not found, please run `pip install pinecone-client`"
)


class PineconeVectorStore(BasePydanticVectorStore):
    """Pinecone Vector Store.

    In this vector store, embeddings and docs are stored within a
    Pinecone index.

    During query time, the index uses Pinecone to query for the top
    k most similar nodes.

    Args:
        pinecone_index (Optional[Union[pinecone.Pinecone.Index, pinecone.Index]]):
            Pinecone index instance, pinecone.Pinecone.Index for
            clients >= 3.0.0; pinecone.Index for older clients. When omitted,
            an index handle is created from ``api_key`` / ``index_name`` /
            ``environment``.
        api_key (Optional[str]): Pinecone API key, used when no index
            instance is supplied.
        index_name (Optional[str]): name of the Pinecone index; required when
            no index instance is supplied.
        environment (Optional[str]): Pinecone environment; required for
            clients < 3.0.0 when no index instance is supplied.
        namespace (Optional[str]): namespace passed to upsert/query/delete.
        insert_kwargs (Optional[Dict]): insert kwargs during `upsert` call.
        add_sparse_vector (bool): whether to add sparse vector to index.
        tokenizer (Optional[Callable]): tokenizer to use to generate sparse
            vectors. Defaults to the tokenizer from
            ``get_default_tokenizer()`` when ``add_sparse_vector`` is True.
        text_key (str): metadata key holding the node text, used by the
            legacy metadata parsing fallback in ``query``.
        batch_size (int): batch size passed to `upsert`.
        remove_text_from_metadata (bool): whether to strip the text from the
            metadata dict stored alongside each vector.
        default_empty_query_vector (Optional[List[float]]): default empty
            query vector. Defaults to None. NOTE: this argument is accepted
            for interface compatibility but is not read by this
            implementation.

    """

    stores_text: bool = True
    flat_metadata: bool = False

    api_key: Optional[str]
    index_name: Optional[str]
    environment: Optional[str]
    namespace: Optional[str]
    insert_kwargs: Optional[Dict]
    add_sparse_vector: bool
    text_key: str
    batch_size: int
    remove_text_from_metadata: bool

    _pinecone_index: Any = PrivateAttr()
    _tokenizer: Optional[Callable] = PrivateAttr()

    def __init__(
        self,
        pinecone_index: Optional[
            Any
        ] = None,  # Dynamic import prevents specific type hinting here
        api_key: Optional[str] = None,
        index_name: Optional[str] = None,
        environment: Optional[str] = None,
        namespace: Optional[str] = None,
        insert_kwargs: Optional[Dict] = None,
        add_sparse_vector: bool = False,
        tokenizer: Optional[Callable] = None,
        text_key: str = DEFAULT_TEXT_KEY,
        batch_size: int = DEFAULT_BATCH_SIZE,
        remove_text_from_metadata: bool = False,
        default_empty_query_vector: Optional[List[float]] = None,
        **kwargs: Any,
    ) -> None:
        """Create the store, building a Pinecone index handle if none is given.

        Raises:
            ValueError: if ``pinecone_index`` is a string, or if no index is
                given and the parameters needed to create one are missing.
        """
        insert_kwargs = insert_kwargs or {}

        if tokenizer is None and add_sparse_vector:
            tokenizer = get_default_tokenizer()

        super().__init__(
            index_name=index_name,
            environment=environment,
            api_key=api_key,
            namespace=namespace,
            insert_kwargs=insert_kwargs,
            add_sparse_vector=add_sparse_vector,
            text_key=text_key,
            batch_size=batch_size,
            remove_text_from_metadata=remove_text_from_metadata,
        )

        # Private attributes are assigned after the model is initialised so
        # this does not rely on them being writable on a half-built model.
        self._tokenizer = tokenizer

        # TODO: Make following instance check stronger -- check if
        # pinecone_index is not pinecone.Index, else raise ValueError
        if isinstance(pinecone_index, str):
            raise ValueError(
                "`pinecone_index` cannot be of type `str`; "
                "should be an instance of pinecone.Index."
            )

        # Explicit None check: a caller-supplied index must never be replaced
        # just because the object happens to evaluate as falsy.
        if pinecone_index is None:
            pinecone_index = self._initialize_pinecone_client(
                api_key, index_name, environment, **kwargs
            )
        self._pinecone_index = pinecone_index

    @classmethod
    def _initialize_pinecone_client(
        cls,
        api_key: Optional[str],
        index_name: Optional[str],
        environment: Optional[str],
        **kwargs: Any,
    ) -> Any:
        """
        Initialize Pinecone client based on version.

        If client version <3.0.0, use pods-based initialization; else, use
        serverless initialization. Extra ``kwargs`` are accepted but unused.

        Raises:
            ValueError: if ``index_name`` is missing, or if ``environment``
                is missing for a client < 3.0.0.
        """
        if not index_name:
            raise ValueError(
                "`index_name` is required for Pinecone client initialization"
            )

        pinecone = _import_pinecone()

        # Version bifurcation is temporary.
        if not _is_pinecone_v3():
            # Old version of the Pinecone client (pods-based).
            if not environment:
                raise ValueError("environment is required for Pinecone client < 3.0.0")
            pinecone.init(api_key=api_key, environment=environment)
            return pinecone.Index(index_name)

        # New version of the Pinecone client (serverless).
        pinecone_instance = pinecone.Pinecone(api_key=api_key)
        return pinecone_instance.Index(index_name)

    @classmethod
    def from_params(
        cls,
        api_key: Optional[str] = None,
        index_name: Optional[str] = None,
        environment: Optional[str] = None,
        namespace: Optional[str] = None,
        insert_kwargs: Optional[Dict] = None,
        add_sparse_vector: bool = False,
        tokenizer: Optional[Callable] = None,
        text_key: str = DEFAULT_TEXT_KEY,
        batch_size: int = DEFAULT_BATCH_SIZE,
        remove_text_from_metadata: bool = False,
        default_empty_query_vector: Optional[List[float]] = None,
        **kwargs: Any,
    ) -> "PineconeVectorStore":
        """Build a store from connection parameters instead of an index object.

        The index handle is created first, then forwarded together with all
        other arguments to the constructor.
        """
        pinecone_index = cls._initialize_pinecone_client(
            api_key, index_name, environment, **kwargs
        )

        return cls(
            pinecone_index=pinecone_index,
            api_key=api_key,
            index_name=index_name,
            environment=environment,
            namespace=namespace,
            insert_kwargs=insert_kwargs,
            add_sparse_vector=add_sparse_vector,
            tokenizer=tokenizer,
            text_key=text_key,
            batch_size=batch_size,
            remove_text_from_metadata=remove_text_from_metadata,
            default_empty_query_vector=default_empty_query_vector,
            **kwargs,
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the class identifier string."""
        # NOTE(review): the spelling "Pincone" is kept as-is; this string may
        # be a persisted identifier -- confirm before correcting it.
        return "PinconeVectorStore"

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings
            **add_kwargs: accepted for interface compatibility; not used.

        Returns:
            The ids of the upserted nodes, in input order.

        """
        ids = []
        entries = []
        for node in nodes:
            node_id = node.node_id

            metadata = node_to_metadata_dict(
                node,
                remove_text=self.remove_text_from_metadata,
                flat_metadata=self.flat_metadata,
            )

            entry = {
                ID_KEY: node_id,
                VECTOR_KEY: node.get_embedding(),
                METADATA_KEY: metadata,
            }
            if self.add_sparse_vector and self._tokenizer is not None:
                # One-element batch in, so take the single resulting vector.
                sparse_vector = generate_sparse_vectors(
                    [node.get_content(metadata_mode=MetadataMode.EMBED)],
                    self._tokenizer,
                )[0]
                entry[SPARSE_VECTOR_KEY] = sparse_vector

            ids.append(node_id)
            entries.append(entry)

        self._pinecone_index.upsert(
            entries,
            namespace=self.namespace,
            batch_size=self.batch_size,
            **self.insert_kwargs,
        )
        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes with the given ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.
            **delete_kwargs: forwarded to the index `delete` call.

        """
        # delete by filtering on the doc_id metadata
        self._pinecone_index.delete(
            filter={"doc_id": {"$eq": ref_doc_id}},
            namespace=self.namespace,
            **delete_kwargs,
        )

    @property
    def client(self) -> Any:
        """Return Pinecone client."""
        return self._pinecone_index

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): the query; this method reads its
                ``mode``, ``query_str``, ``query_embedding``, ``alpha``,
                ``filters`` and ``similarity_top_k``.
            **kwargs: extra arguments forwarded to the index `query` call.
                A Pinecone-native filter may be given as ``filter`` or
                ``pinecone_query_filters``, but only when ``query.filters``
                is None.

        Returns:
            VectorStoreQueryResult with the matched nodes, scores and ids.

        Raises:
            ValueError: if a sparse vector is required but ``query_str`` is
                None, or if filters are given both on the query and in kwargs.

        """
        sparse_vector = None
        if (
            query.mode in (VectorStoreQueryMode.SPARSE, VectorStoreQueryMode.HYBRID)
            and self._tokenizer is not None
        ):
            if query.query_str is None:
                raise ValueError(
                    "query_str must be specified if mode is SPARSE or HYBRID."
                )
            sparse_vector = generate_sparse_vectors(
                [query.query_str], self._tokenizer
            )[0]
            if query.alpha is not None:
                # Sparse side gets weight (1 - alpha), dense side gets alpha.
                sparse_vector = {
                    "indices": sparse_vector["indices"],
                    "values": [v * (1 - query.alpha) for v in sparse_vector["values"]],
                }

        query_embedding = None
        if query.mode in (VectorStoreQueryMode.DEFAULT, VectorStoreQueryMode.HYBRID):
            query_embedding = cast(List[float], query.query_embedding)
            # Only scale when an embedding is present; a missing embedding is
            # passed through as None exactly as in the no-alpha case.
            if query.alpha is not None and query_embedding is not None:
                query_embedding = [v * query.alpha for v in query_embedding]

        if query.filters is not None:
            if "filter" in kwargs or "pinecone_query_filters" in kwargs:
                raise ValueError(
                    "Cannot specify filter via both query and kwargs. "
                    "Use kwargs only for pinecone specific items that are "
                    "not supported via the generic query interface."
                )
            pinecone_filter = _to_pinecone_filter(query.filters)
        elif "pinecone_query_filters" in kwargs:
            pinecone_filter = kwargs.pop("pinecone_query_filters")
        else:
            pinecone_filter = kwargs.pop("filter", {})

        response = self._pinecone_index.query(
            vector=query_embedding,
            sparse_vector=sparse_vector,
            top_k=query.similarity_top_k,
            include_values=True,
            include_metadata=True,
            namespace=self.namespace,
            filter=pinecone_filter,
            **kwargs,
        )

        top_k_nodes = []
        top_k_ids = []
        top_k_scores = []
        for match in response.matches:
            try:
                node = metadata_dict_to_node(match.metadata)
                node.embedding = match.values
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                _logger.debug(
                    "Failed to parse Node metadata, fallback to legacy logic."
                )
                metadata, node_info, relationships = legacy_metadata_dict_to_node(
                    match.metadata, text_key=self.text_key
                )

                node = TextNode(
                    text=match.metadata[self.text_key],
                    id_=match.id,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships=relationships,
                )
            top_k_ids.append(match.id)
            top_k_nodes.append(node)
            top_k_scores.append(match.score)

        return VectorStoreQueryResult(
            nodes=top_k_nodes, similarities=top_k_scores, ids=top_k_ids
        )