faiss_rag_enterprise/llama_index/indices/query/embedding_utils.py


"""Embedding utils for queries."""
import heapq
import math
from typing import Any, Callable, List, Optional, Tuple
import numpy as np
from llama_index.core.embeddings.base import similarity as default_similarity_fn
from llama_index.vector_stores.types import VectorStoreQueryMode

def get_top_k_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    similarity_fn: Optional[Callable[..., float]] = None,
    similarity_top_k: Optional[int] = None,
    embedding_ids: Optional[List] = None,
    similarity_cutoff: Optional[float] = None,
) -> Tuple[List[float], List]:
    """Get top nodes by similarity to the query."""
    if embedding_ids is None:
        embedding_ids = list(range(len(embeddings)))

    similarity_fn = similarity_fn or default_similarity_fn

    embeddings_np = np.array(embeddings)
    query_embedding_np = np.array(query_embedding)

    # Keep a min-heap of at most similarity_top_k (similarity, id) pairs;
    # the lowest-similarity entry is evicted whenever the heap overflows.
    similarity_heap: List[Tuple[float, Any]] = []
    for i, emb in enumerate(embeddings_np):
        similarity = similarity_fn(query_embedding_np, emb)
        if similarity_cutoff is None or similarity > similarity_cutoff:
            heapq.heappush(similarity_heap, (similarity, embedding_ids[i]))
            if similarity_top_k and len(similarity_heap) > similarity_top_k:
                heapq.heappop(similarity_heap)

    # Return results sorted by descending similarity.
    result_tups = sorted(similarity_heap, key=lambda x: x[0], reverse=True)
    result_similarities = [s for s, _ in result_tups]
    result_ids = [n for _, n in result_tups]
    return result_similarities, result_ids
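
# Example usage (a minimal sketch; the 3-d vectors below are made up, and the
# default similarity function is assumed to be cosine-like):
#
#     sims, ids = get_top_k_embeddings(
#         query_embedding=[1.0, 0.0, 0.0],
#         embeddings=[[0.9, 0.1, 0.0], [0.0, 1.0, 0.0], [0.7, 0.7, 0.0]],
#         similarity_top_k=2,
#     )
#     # sims is sorted descending (e.g. ~[0.99, 0.71]); ids are the positions
#     # of the two closest embeddings (e.g. [0, 2]).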

def get_top_k_embeddings_learner(
    query_embedding: List[float],
    embeddings: List[List[float]],
    similarity_top_k: Optional[int] = None,
    embedding_ids: Optional[List] = None,
    query_mode: VectorStoreQueryMode = VectorStoreQueryMode.SVM,
) -> Tuple[List[float], List]:
    """Get top embeddings by fitting a learner against the query.

    Inspired by Karpathy's SVM demo:
    https://github.com/karpathy/randomfun/blob/master/knn_vs_svm.ipynb

    Can fit an SVM, linear regression, and more.
    """
    try:
        from sklearn import linear_model, svm
    except ImportError:
        raise ImportError("Please install scikit-learn to use this feature.")

    if embedding_ids is None:
        embedding_ids = list(range(len(embeddings)))

    query_embedding_np = np.array(query_embedding)
    embeddings_np = np.array(embeddings)

    # Create a dataset where the query is the sole positive example (label 1)
    # and every candidate embedding is a negative example (label 0).
    dataset_len = len(embeddings) + 1
    dataset = np.concatenate([query_embedding_np[None, ...], embeddings_np])
    y = np.zeros(dataset_len)
    y[0] = 1

    if query_mode == VectorStoreQueryMode.SVM:
        # train our SVM
        # TODO: make params configurable
        clf = svm.LinearSVC(
            class_weight="balanced", verbose=False, max_iter=10000, tol=1e-6, C=0.1
        )
    elif query_mode == VectorStoreQueryMode.LINEAR_REGRESSION:
        clf = linear_model.LinearRegression()
    elif query_mode == VectorStoreQueryMode.LOGISTIC_REGRESSION:
        clf = linear_model.LogisticRegression(class_weight="balanced")
    else:
        raise ValueError(f"Unknown query mode: {query_mode}")

    clf.fit(dataset, y)  # train

    # Score the candidates (everything except the query itself). LinearRegression
    # has no decision_function, so fall back to its raw predictions there.
    if hasattr(clf, "decision_function"):
        similarities = clf.decision_function(dataset[1:])
    else:
        similarities = clf.predict(dataset[1:])

    # Sort by descending score and keep the top k; convert the scores to a
    # plain list to match the declared return type.
    sorted_ix = np.argsort(-similarities)
    top_sorted_ix = sorted_ix[:similarity_top_k]
    result_similarities = similarities[top_sorted_ix].tolist()
    result_ids = [embedding_ids[ix] for ix in top_sorted_ix]
    return result_similarities, result_ids
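
# Example usage (a minimal sketch; requires scikit-learn, and the 2-d vectors
# are made up purely for illustration):
#
#     sims, ids = get_top_k_embeddings_learner(
#         query_embedding=[1.0, 0.0],
#         embeddings=[[0.9, 0.1], [0.0, 1.0], [0.8, 0.3]],
#         similarity_top_k=2,
#         query_mode=VectorStoreQueryMode.SVM,
#     )
#     # ids orders candidates by the trained SVM's decision margin, which here
#     # tends to favor the vectors closest to the query (e.g. [0, 2]).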

def get_top_k_mmr_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    similarity_fn: Optional[Callable[..., float]] = None,
    similarity_top_k: Optional[int] = None,
    embedding_ids: Optional[List] = None,
    similarity_cutoff: Optional[float] = None,
    mmr_threshold: Optional[float] = None,
) -> Tuple[List[float], List]:
    """Get top nodes by similarity to the query, discounted by their
    similarity to previously selected results (maximal marginal relevance).

    An mmr_threshold of 0 strongly penalizes similarity to previous results;
    an mmr_threshold of 1 ranks purely by similarity to the query and
    ignores previous results.
    """
    # Use an explicit None check so a caller-supplied threshold of 0 is honored.
    threshold = 0.5 if mmr_threshold is None else mmr_threshold
    similarity_fn = similarity_fn or default_similarity_fn
    # NOTE: similarity_cutoff is accepted for API parity but is not applied here.

    if embedding_ids is None or embedding_ids == []:
        embedding_ids = list(range(len(embeddings)))

    # Map each embedding id to its index; embed_map shrinks as results are
    # selected, while full_embed_map keeps every id for index lookups.
    full_embed_map = dict(zip(embedding_ids, range(len(embedding_ids))))
    embed_map = full_embed_map.copy()
    embed_similarity = {}
    score: float = -math.inf
    high_score_id = None

    # First pass: score every candidate against the query and find the leader.
    for i, emb in enumerate(embeddings):
        similarity = similarity_fn(query_embedding, emb)
        embed_similarity[embedding_ids[i]] = similarity
        if similarity * threshold > score:
            high_score_id = embedding_ids[i]
            score = similarity * threshold

    results: List[Tuple[Any, Any]] = []

    embedding_length = len(embeddings or [])
    similarity_top_k_count = similarity_top_k or embedding_length
    while len(results) < min(similarity_top_k_count, embedding_length):
        # Record the current leading candidate.
        results.append((score, high_score_id))

        # Remove the selected candidate so a new leader can be found.
        del embed_map[high_score_id]
        recent_embedding_id = high_score_id
        score = -math.inf

        # Re-score the remaining candidates: query similarity weighted by the
        # threshold, minus the (1 - threshold)-weighted overlap with the most
        # recently selected result.
        for embed_id in embed_map:
            overlap_with_recent = similarity_fn(
                embeddings[embed_map[embed_id]],
                embeddings[full_embed_map[recent_embedding_id]],
            )
            if (
                threshold * embed_similarity[embed_id]
                - ((1 - threshold) * overlap_with_recent)
                > score
            ):
                score = threshold * embed_similarity[embed_id] - (
                    (1 - threshold) * overlap_with_recent
                )
                high_score_id = embed_id

    result_similarities = [s for s, _ in results]
    result_ids = [n for _, n in results]
    return result_similarities, result_ids
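
if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch, not part of the library API).
    # It contrasts plain top-k with MMR on made-up 3-d vectors: the first two
    # candidates are near-duplicates, the third is diverse, assuming the
    # default similarity function is cosine-like.
    _query = [1.0, 0.0, 0.0]
    _candidates = [
        [0.9, 0.1, 0.0],    # close to the query
        [0.89, 0.12, 0.0],  # near-duplicate of the first candidate
        [0.5, 0.0, 0.5],    # further from the query, but diverse
    ]
    _, top_ids = get_top_k_embeddings(_query, _candidates, similarity_top_k=2)
    _, mmr_ids = get_top_k_mmr_embeddings(
        _query, _candidates, similarity_top_k=2, mmr_threshold=0.5
    )
    # Plain top-k keeps both near-duplicates ([0, 1]); MMR's overlap penalty
    # swaps in the diverse candidate for the second slot ([0, 2]).
    print("top-k ids:", top_ids, "mmr ids:", mmr_ids)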