from typing import Any, Callable, Optional, Sequence

from llama_index.core.embeddings.base import SimilarityMode, similarity
from llama_index.evaluation.base import BaseEvaluator, EvaluationResult
from llama_index.prompts.mixin import PromptDictType
from llama_index.service_context import ServiceContext


class SemanticSimilarityEvaluator(BaseEvaluator):
    """Embedding similarity evaluator.

    Evaluate the quality of a question answering system by
    comparing the similarity between embeddings of the generated answer
    and the reference answer.

    Inspired by this paper:
    - Semantic Answer Similarity for Evaluating Question Answering Models
        https://arxiv.org/pdf/2108.06130.pdf

    Args:
        service_context (Optional[ServiceContext]): Service context.
        similarity_fn (Optional[Callable]): Function that maps two embedding
            vectors to a similarity score. Mutually exclusive with
            ``similarity_mode``.
        similarity_mode (Optional[SimilarityMode]): Built-in similarity mode
            used when ``similarity_fn`` is not given. Defaults to
            ``SimilarityMode.DEFAULT``.
        similarity_threshold (float): Embedding similarity threshold for "passing".
            Defaults to 0.8.
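
    Example:
        Illustrative usage sketch (assumes an embedding model can be resolved
        from ``ServiceContext.from_defaults()``)::

            evaluator = SemanticSimilarityEvaluator(similarity_threshold=0.8)
            result = await evaluator.aevaluate(
                response="The sky appears blue due to Rayleigh scattering.",
                reference="Rayleigh scattering makes the sky look blue.",
            )
            print(result.score, result.passing)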
"""

    def __init__(
        self,
        service_context: Optional[ServiceContext] = None,
        similarity_fn: Optional[Callable[..., float]] = None,
        similarity_mode: Optional[SimilarityMode] = None,
        similarity_threshold: float = 0.8,
    ) -> None:
        self._service_context = service_context or ServiceContext.from_defaults()
        if similarity_fn is None:
            # No custom function supplied: use the library's built-in
            # similarity() with the requested (or default) mode.
            similarity_mode = similarity_mode or SimilarityMode.DEFAULT
            self._similarity_fn = lambda x, y: similarity(x, y, mode=similarity_mode)
        else:
            if similarity_mode is not None:
                raise ValueError(
                    "Cannot specify both similarity_fn and similarity_mode"
                )
            self._similarity_fn = similarity_fn

        self._similarity_threshold = similarity_threshold

    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        return {}

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        del query, contexts, kwargs  # Unused

        if response is None or reference is None:
            raise ValueError("Must specify both response and reference")

        # Embed the generated answer and the reference answer with the
        # configured embedding model, then score them with the similarity
        # function chosen at construction time.
        embed_model = self._service_context.embed_model
        response_embedding = await embed_model.aget_text_embedding(response)
        reference_embedding = await embed_model.aget_text_embedding(reference)

        similarity_score = self._similarity_fn(response_embedding, reference_embedding)
        passing = similarity_score >= self._similarity_threshold
        return EvaluationResult(
            score=similarity_score,
            passing=passing,
            feedback=f"Similarity score: {similarity_score}",
        )
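

# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the library module. It assumes the
# default ServiceContext can resolve an embedding model (e.g. an OpenAI API
# key is configured); names below such as `custom_similarity` and `_demo` are
# examples, not library API.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        # Built-in similarity with the default 0.8 passing threshold.
        evaluator = SemanticSimilarityEvaluator(similarity_threshold=0.8)
        result = await evaluator.aevaluate(
            response="The Eiffel Tower is located in Paris.",
            reference="Paris is home to the Eiffel Tower.",
        )
        print(result.score, result.passing, result.feedback)

        # Any callable mapping two embedding vectors to a float can be used
        # instead of a SimilarityMode (here: plain cosine similarity).
        def custom_similarity(x, y) -> float:
            dot = sum(a * b for a, b in zip(x, y))
            norm_x = sum(a * a for a in x) ** 0.5
            norm_y = sum(b * b for b in y) ** 0.5
            return dot / (norm_x * norm_y)

        custom_evaluator = SemanticSimilarityEvaluator(
            similarity_fn=custom_similarity,
            similarity_threshold=0.9,
        )
        custom_result = await custom_evaluator.aevaluate(
            response="Water boils at 100 degrees Celsius at sea level.",
            reference="At sea level, water boils at 100 °C.",
        )
        print(custom_result.score, custom_result.passing)

    asyncio.run(_demo())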