faiss_rag_enterprise/llama_index/evaluation/base.py

"""Evaluator."""
import asyncio
from abc import abstractmethod
from typing import Any, Optional, Sequence
from llama_index.bridge.pydantic import BaseModel, Field
from llama_index.core.response.schema import Response
from llama_index.prompts.mixin import PromptMixin, PromptMixinType


class EvaluationResult(BaseModel):
    """Evaluation result.

    Output of a BaseEvaluator.
    """

    query: Optional[str] = Field(None, description="Query string")
    contexts: Optional[Sequence[str]] = Field(None, description="Context strings")
    response: Optional[str] = Field(None, description="Response string")
    passing: Optional[bool] = Field(
        None, description="Binary evaluation result (passing or not)"
    )
    feedback: Optional[str] = Field(
        None, description="Feedback or reasoning for the response"
    )
    score: Optional[float] = Field(None, description="Score for the response")
    pairwise_source: Optional[str] = Field(
        None,
        description=(
            "Used only for pairwise evaluation; indicates whether the result comes"
            " from the original order of the presented answers or the flipped order"
        ),
    )
    invalid_result: bool = Field(
        default=False, description="Whether the evaluation result is an invalid one."
    )
    invalid_reason: Optional[str] = Field(
        default=None, description="Reason for invalid evaluation."
    )
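
# Illustrative sketch (not part of the original module): how a concrete
# evaluator might populate an EvaluationResult. All field values below are
# invented for demonstration only.
#
#     result = EvaluationResult(
#         query="What backs the retriever?",
#         contexts=["Embeddings are stored in a FAISS IndexFlatL2."],
#         response="A FAISS flat index.",
#         passing=True,
#         score=1.0,
#         feedback="The response is grounded in the retrieved context.",
#     )
#     assert result.passing and not result.invalid_result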


class BaseEvaluator(PromptMixin):
    """Base Evaluator class."""

    def _get_prompt_modules(self) -> PromptMixinType:
        """Get prompt modules."""
        return {}

    def evaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Run evaluation with query string, retrieved contexts,
        and generated response string.

        Subclasses can override this method to provide custom evaluation logic and
        take in additional arguments.
        """
        # Synchronous wrapper around `aevaluate`. Note that `asyncio.run`
        # raises a RuntimeError when called from an already-running event loop
        # (e.g. inside a Jupyter notebook); use `aevaluate` directly there.
        return asyncio.run(
            self.aevaluate(
                query=query,
                response=response,
                contexts=contexts,
                **kwargs,
            )
        )

    @abstractmethod
    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Run evaluation with query string, retrieved contexts,
        and generated response string.

        Subclasses must implement this method to provide their evaluation
        logic; they may also accept additional keyword arguments.
        """
        raise NotImplementedError

    def evaluate_response(
        self,
        query: Optional[str] = None,
        response: Optional[Response] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Run evaluation with query string and generated Response object.

        Subclasses can override this method to provide custom evaluation logic and
        take in additional arguments.
        """
        return asyncio.run(
            self.aevaluate_response(query=query, response=response, **kwargs)
        )

    async def aevaluate_response(
        self,
        query: Optional[str] = None,
        response: Optional[Response] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        """Run evaluation with query string and generated Response object.

        Subclasses can override this method to provide custom evaluation logic and
        take in additional arguments.
        """
        # Unpack the Response object into a plain response string and the
        # text content of its source nodes, then delegate to `aevaluate`.
        response_str: Optional[str] = None
        contexts: Optional[Sequence[str]] = None
        if response is not None:
            response_str = response.response
            contexts = [node.get_content() for node in response.source_nodes]

        return await self.aevaluate(
            query=query, response=response_str, contexts=contexts, **kwargs
        )


# legacy: backward compatibility
Evaluation = EvaluationResult
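

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a minimal concrete
# evaluator. `KeywordPresenceEvaluator` is a hypothetical name invented here;
# it merely checks whether any retrieved context mentions a keyword, whereas
# real evaluators typically call an LLM inside `aevaluate`. This assumes
# PromptMixin requires the `_get_prompts` / `_update_prompts` hooks.
# ---------------------------------------------------------------------------
if __name__ == "__main__":  # pragma: no cover - demonstration only

    class KeywordPresenceEvaluator(BaseEvaluator):
        """Hypothetical evaluator: passes if any context mentions a keyword."""

        def __init__(self, keyword: str) -> None:
            self._keyword = keyword.lower()

        # This toy evaluator has no prompts, so the PromptMixin hooks are no-ops.
        def _get_prompts(self) -> dict:
            return {}

        def _update_prompts(self, prompts_dict: dict) -> None:
            pass

        async def aevaluate(
            self,
            query: Optional[str] = None,
            response: Optional[str] = None,
            contexts: Optional[Sequence[str]] = None,
            **kwargs: Any,
        ) -> EvaluationResult:
            contexts = contexts or []
            passing = any(self._keyword in c.lower() for c in contexts)
            return EvaluationResult(
                query=query,
                response=response,
                contexts=contexts,
                passing=passing,
                score=1.0 if passing else 0.0,
                feedback=f"keyword {self._keyword!r} found in contexts: {passing}",
            )

    # The sync `evaluate` wrapper drives the async implementation via asyncio.run.
    result = KeywordPresenceEvaluator("faiss").evaluate(
        query="Which index backs the retriever?",
        response="A FAISS flat index.",
        contexts=["Embeddings are stored in a FAISS IndexFlatL2."],
    )
    print(result.passing, result.score)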