faiss_rag_enterprise/llama_index/vector_stores/awadb.py

"""AwaDB vector store index.

An index that is built on top of an existing vector store.

"""
import logging
import uuid
from typing import Any, List, Optional, Set

from llama_index.schema import BaseNode, MetadataMode, TextNode
from llama_index.vector_stores.types import (
    VectorStore,
    VectorStoreQuery,
    VectorStoreQueryResult,
)
from llama_index.vector_stores.utils import (
    legacy_metadata_dict_to_node,
    metadata_dict_to_node,
    node_to_metadata_dict,
)

logger = logging.getLogger(__name__)


class AwaDBVectorStore(VectorStore):
    """AwaDB vector store.

    In this vector store, embeddings are stored within an AwaDB table.

    During query time, the index uses AwaDB to query for the top
    k most similar nodes.

    Args:
        table_name (str): Name of the AwaDB table to create. When left at
            ``DEFAULT_TABLE_NAME``, a suffix taken from a random UUID is
            appended so each default-named store gets its own table.
        log_and_data_dir (Optional[str]): Root directory of log and data,
            passed to ``awadb.Client``. When ``None`` the client is
            constructed without arguments.
        **kwargs: Accepted for future extension; currently unused.

    """

    flat_metadata: bool = True
    stores_text: bool = True
    DEFAULT_TABLE_NAME = "llamaindex_awadb"

    @property
    def client(self) -> Any:
        """Get AwaDB client."""
        return self.awadb_client

    def __init__(
        self,
        table_name: str = DEFAULT_TABLE_NAME,
        log_and_data_dir: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with AwaDB client.

        If table_name is left at ``DEFAULT_TABLE_NAME``, a table named
        ``DEFAULT_TABLE_NAME + "_" + last segment of a uuid4`` is created
        instead.

        Args:
            table_name: Name of the table created, default DEFAULT_TABLE_NAME.
            log_and_data_dir: Optional root directory of log and data.
            kwargs: Any possible extend parameters in the future.

        Returns:
            None.

        Raises:
            ImportError: If the `awadb` package is not installed.
        """
        import_err_msg = "`awadb` package not found, please run `pip install awadb`"
        try:
            import awadb
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(import_err_msg) from err

        if log_and_data_dir is not None:
            self.awadb_client = awadb.Client(log_and_data_dir)
        else:
            self.awadb_client = awadb.Client()

        if table_name == self.DEFAULT_TABLE_NAME:
            # Make the default name unique by appending a random uuid segment.
            unique_suffix = str(uuid.uuid4()).split("-")[-1]
            table_name = f"{table_name}_{unique_suffix}"

        self.awadb_client.Create(table_name)

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to AwaDB.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        Returns:
            Added node ids (an empty list when ``nodes`` is empty).

        Raises:
            ValueError: If the AwaDB client is not initialized.
        """
        if not self.awadb_client:
            raise ValueError("AwaDB client not initialized")

        # Nothing to insert: skip the client call entirely.
        if not nodes:
            return []

        embeddings = []
        metadatas = []
        ids = []
        texts = []
        for node in nodes:
            embeddings.append(node.get_embedding())
            # The text is stored in its own field, so it is stripped from
            # the serialized metadata.
            metadatas.append(
                node_to_metadata_dict(
                    node, remove_text=True, flat_metadata=self.flat_metadata
                )
            )
            ids.append(node.node_id)
            texts.append(node.get_content(metadata_mode=MetadataMode.NONE) or "")

        # "embedding_text" / "text_embedding" are the field names that
        # query() later reads back / excludes from the search results.
        self.awadb_client.AddTexts(
            "embedding_text",
            "text_embedding",
            texts,
            embeddings,
            metadatas,
            is_duplicate_texts=False,
            ids=ids,
        )

        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete nodes using with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        Returns:
            None
        """
        # Empty (or missing) id: nothing to delete.
        if not ref_doc_id:
            return
        # NOTE(review): add() stores rows under node ids, while this passes
        # ref_doc_id straight to Delete -- confirm that AwaDB's Delete keys
        # match the ids callers provide here.
        self.awadb_client.Delete([ref_doc_id])

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query : vector store query

        Returns:
            VectorStoreQueryResult: Query results. Empty lists are returned
            when the search yields no result set.
        """
        meta_filters = {}
        if query.filters is not None:
            for metadata_filter in query.filters.legacy_filters():
                meta_filters[metadata_filter.key] = metadata_filter.value

        # The stored embedding vector is not needed to rebuild nodes.
        not_include_fields: Set[str] = {"text_embedding"}
        results = self.awadb_client.Search(
            query=query.query_embedding,
            topn=query.similarity_top_k,
            meta_filter=meta_filters,
            not_include_fields=not_include_fields,
        )

        nodes = []
        similarities = []
        ids = []

        # Guard against an empty result set instead of failing on results[0].
        if not results:
            return VectorStoreQueryResult(
                nodes=nodes, similarities=similarities, ids=ids
            )

        for item_detail in results[0]["ResultItems"]:
            content = ""
            meta_data = {}
            node_id = ""
            # Split each hit into text, id, score and remaining metadata.
            for item_key in item_detail:
                item_value = item_detail[item_key]
                if item_key == "embedding_text":
                    content = item_value
                elif item_key == "_id":
                    node_id = item_value
                    ids.append(node_id)
                elif item_key == "score":
                    similarities.append(item_value)
                else:
                    meta_data[item_key] = item_value

            try:
                node = metadata_dict_to_node(meta_data)
                node.set_content(content)
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                metadata, node_info, relationships = legacy_metadata_dict_to_node(
                    meta_data
                )

                node = TextNode(
                    text=content,
                    id_=node_id,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships=relationships,
                )

            nodes.append(node)

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)