faiss_rag_enterprise/llama_index/llama_dataset/evaluator_evaluation.py

"""Labelled Evaluation Class."""
import asyncio
import time
from typing import List, Optional
from pandas import DataFrame as PandasDataFrame
from llama_index.bridge.pydantic import Field
from llama_index.evaluation import (
    BaseEvaluator,
    EvaluationResult,
)
from llama_index.evaluation.pairwise import EvaluationSource
from llama_index.llama_dataset.base import (
    BaseLlamaDataExample,
    BaseLlamaDataset,
    BaseLlamaExamplePrediction,
    BaseLlamaPredictionDataset,
    CreatedBy,
)


class EvaluatorExamplePrediction(BaseLlamaExamplePrediction):
    """Evaluation example prediction class.

    Args:
        feedback (Optional[str]): The evaluator's feedback.
        score (Optional[float]): The evaluator's score.
    """

    feedback: str = Field(
        default_factory=str,
        description="The evaluator's predicted feedback, which can be compared to the reference (ground-truth) feedback.",
    )
    score: Optional[float] = Field(
        default=None,
        description="The evaluator's predicted score, which can be compared to the reference (ground-truth) score.",
    )
    invalid_prediction: bool = Field(
        default=False, description="Whether or not the prediction is a valid one."
    )
    invalid_reason: Optional[str] = Field(
        default=None, description="Reason as to why the prediction is invalid."
    )

    @property
    def class_name(self) -> str:
        """Data example class name."""
        return "EvaluatorExamplePrediction"


class LabelledEvaluatorDataExample(BaseLlamaDataExample):
    """Evaluation example class.

    This data class contains the ingredients to perform a new "prediction", i.e.,
    an evaluation. Here an evaluator is meant to evaluate a response against an
    associated query as well as, optionally, contexts.

    Args:
        query (str): The user query.
        query_by (Optional[CreatedBy]): Query generated by human or AI (model name).
        contexts (Optional[List[str]]): The contexts used for the response.
        answer (str): Answer to the query that is to be evaluated.
        answer_by (Optional[CreatedBy]): What generated the answer (human or AI model name).
        ground_truth_answer (Optional[str]): The ground-truth answer used to evaluate `answer`.
        ground_truth_answer_by (Optional[CreatedBy]): What generated the ground-truth answer.
        reference_feedback (Optional[str]): The reference feedback evaluation.
        reference_score (float): The reference score evaluation.
        reference_evaluation_by (Optional[CreatedBy]): Evaluation generated by human or AI (model name).
    """

    query: str = Field(
        default_factory=str, description="The user query for the example."
    )
    query_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the query."
    )
    contexts: Optional[List[str]] = Field(
        default=None,
        description="The contexts used to generate the answer.",
    )
    answer: str = Field(
        default_factory=str,
        description="The provided answer to the example that is to be evaluated.",
    )
    answer_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the answer."
    )
    ground_truth_answer: Optional[str] = Field(
        default=None,
        description="The ground truth answer to the example that is used to evaluate the provided `answer`.",
    )
    ground_truth_answer_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the ground-truth answer."
    )
    reference_feedback: Optional[str] = Field(
        default=None,
        description="The reference feedback (ground-truth).",
    )
    reference_score: float = Field(
        default_factory=float, description="The reference score (ground-truth)."
    )
    reference_evaluation_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the evaluation (feedback and score)."
    )

    @property
    def class_name(self) -> str:
        """Data example class name."""
        return "LabelledEvaluatorDataExample"


class EvaluatorPredictionDataset(BaseLlamaPredictionDataset):
    """Evaluation Prediction Dataset Class."""

    _prediction_type = EvaluatorExamplePrediction

    def to_pandas(self) -> PandasDataFrame:
        """Create pandas dataframe."""
        data = {}
        if self.predictions:
            data = {
                "feedback": [t.feedback for t in self.predictions],
                "score": [t.score for t in self.predictions],
            }
        return PandasDataFrame(data)

    @property
    def class_name(self) -> str:
        """Class name."""
        return "EvaluatorPredictionDataset"


class LabelledEvaluatorDataset(BaseLlamaDataset[BaseEvaluator]):
    """LabelledEvaluatorDataset class."""

    _example_type = LabelledEvaluatorDataExample

    def to_pandas(self) -> PandasDataFrame:
        """Create pandas dataframe."""
        data = {
            "query": [t.query for t in self.examples],
            "answer": [t.answer for t in self.examples],
            "contexts": [t.contexts for t in self.examples],
            "ground_truth_answer": [t.ground_truth_answer for t in self.examples],
            "query_by": [str(t.query_by) for t in self.examples],
            "answer_by": [str(t.answer_by) for t in self.examples],
            "ground_truth_answer_by": [
                str(t.ground_truth_answer_by) for t in self.examples
            ],
            "reference_feedback": [t.reference_feedback for t in self.examples],
            "reference_score": [t.reference_score for t in self.examples],
            "reference_evaluation_by": [
                t.reference_evaluation_by for t in self.examples
            ],
        }
        return PandasDataFrame(data)

    async def _apredict_example(
        self,
        predictor: BaseEvaluator,
        example: LabelledEvaluatorDataExample,
        sleep_time_in_seconds: int,
    ) -> EvaluatorExamplePrediction:
        """Async predict evaluation example with an evaluator."""
        await asyncio.sleep(sleep_time_in_seconds)
        try:
            eval_result: EvaluationResult = await predictor.aevaluate(
                query=example.query,
                response=example.answer,
                contexts=example.contexts,
                reference=example.ground_truth_answer,
                sleep_time_in_seconds=sleep_time_in_seconds,
            )
        except Exception as err:
            # TODO: raise warning here as well
            return EvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=f"Caught error {err!s}"
            )

        if not eval_result.invalid_result:
            return EvaluatorExamplePrediction(
                feedback=eval_result.feedback, score=eval_result.score
            )
        else:
            return EvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=eval_result.invalid_reason
            )

    def _predict_example(
        self,
        predictor: BaseEvaluator,
        example: LabelledEvaluatorDataExample,
        sleep_time_in_seconds: int = 0,
    ) -> EvaluatorExamplePrediction:
        """Predict evaluation example with an evaluator."""
        time.sleep(sleep_time_in_seconds)
        try:
            eval_result: EvaluationResult = predictor.evaluate(
                query=example.query,
                response=example.answer,
                contexts=example.contexts,
                reference=example.ground_truth_answer,
                sleep_time_in_seconds=sleep_time_in_seconds,
            )
        except Exception as err:
            # TODO: raise warning here as well
            return EvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=f"Caught error {err!s}"
            )

        if not eval_result.invalid_result:
            return EvaluatorExamplePrediction(
                feedback=eval_result.feedback, score=eval_result.score
            )
        else:
            return EvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=eval_result.invalid_reason
            )

    def _construct_prediction_dataset(
        self, predictions: List[EvaluatorExamplePrediction]
    ) -> EvaluatorPredictionDataset:
        """Construct prediction dataset."""
        return EvaluatorPredictionDataset(predictions=predictions)

    @property
    def class_name(self) -> str:
        """Class name."""
        return "LabelledEvaluatorDataset"


class PairwiseEvaluatorExamplePrediction(BaseLlamaExamplePrediction):
    """Pairwise evaluation example prediction class.

    Args:
        feedback (Optional[str]): The evaluator's feedback.
        score (Optional[float]): The evaluator's score.
        evaluation_source (EvaluationSource): Whether the evaluation came from the
            original ordering, the flipped ordering, or was inconclusive.
    """

    feedback: str = Field(
        default_factory=str,
        description="The evaluator's predicted feedback, which can be compared to the reference (ground-truth) feedback.",
    )
    score: Optional[float] = Field(
        default=None,
        description="The evaluator's predicted score, which can be compared to the reference (ground-truth) score.",
    )
    evaluation_source: Optional[EvaluationSource] = Field(
        default=None,
        description=(
            "Whether the evaluation comes from the original or the flipped ordering."
            " Can also be neither, indicating an inconclusive judgement."
        ),
    )
    invalid_prediction: bool = Field(
        default=False, description="Whether or not the prediction is a valid one."
    )
    invalid_reason: Optional[str] = Field(
        default=None, description="Reason as to why the prediction is invalid."
    )

    @property
    def class_name(self) -> str:
        """Data example class name."""
        return "PairwiseEvaluatorExamplePrediction"


class PairwiseEvaluatorPredictionDataset(BaseLlamaPredictionDataset):
    """Pairwise evaluation predictions dataset class."""

    _prediction_type = PairwiseEvaluatorExamplePrediction

    def to_pandas(self) -> PandasDataFrame:
        """Create pandas dataframe."""
        data = {}
        if self.predictions:
            data = {
                "feedback": [t.feedback for t in self.predictions],
                "score": [t.score for t in self.predictions],
                "ordering": [
                    # invalid predictions carry no evaluation_source
                    t.evaluation_source.value if t.evaluation_source else None
                    for t in self.predictions
                ],
            }
        return PandasDataFrame(data)

    @property
    def class_name(self) -> str:
        """Class name."""
        return "PairwiseEvaluatorPredictionDataset"


class LabelledPairwiseEvaluatorDataExample(LabelledEvaluatorDataExample):
    """Labelled pairwise evaluation data example class."""

    second_answer: str = Field(
        default_factory=str,
        description="The second answer to the example, which is evaluated against `answer`.",
    )
    second_answer_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the second answer."
    )

    @property
    def class_name(self) -> str:
        """Data example class name."""
        return "LabelledPairwiseEvaluatorDataExample"


class LabelledPairwiseEvaluatorDataset(BaseLlamaDataset[BaseEvaluator]):
    """Labelled pairwise evaluation dataset.

    For evaluating the evaluator in performing pairwise evaluations.
    """

    _example_type = LabelledPairwiseEvaluatorDataExample

    def to_pandas(self) -> PandasDataFrame:
        """Create pandas dataframe."""
        data = {
            "query": [t.query for t in self.examples],
            "answer": [t.answer for t in self.examples],
            "second_answer": [t.second_answer for t in self.examples],
            "contexts": [t.contexts for t in self.examples],
            "ground_truth_answer": [t.ground_truth_answer for t in self.examples],
            "query_by": [str(t.query_by) for t in self.examples],
            "answer_by": [str(t.answer_by) for t in self.examples],
            "second_answer_by": [str(t.second_answer_by) for t in self.examples],
            "ground_truth_answer_by": [
                str(t.ground_truth_answer_by) for t in self.examples
            ],
            "reference_feedback": [t.reference_feedback for t in self.examples],
            "reference_score": [t.reference_score for t in self.examples],
            "reference_evaluation_by": [
                t.reference_evaluation_by for t in self.examples
            ],
        }
        return PandasDataFrame(data)

    async def _apredict_example(
        self,
        predictor: BaseEvaluator,
        example: LabelledPairwiseEvaluatorDataExample,
        sleep_time_in_seconds: int,
    ) -> PairwiseEvaluatorExamplePrediction:
        """Async predict pairwise evaluation example with an evaluator."""
        await asyncio.sleep(sleep_time_in_seconds)
        try:
            eval_result: EvaluationResult = await predictor.aevaluate(
                query=example.query,
                response=example.answer,
                second_response=example.second_answer,
                contexts=example.contexts,
                reference=example.ground_truth_answer,
                sleep_time_in_seconds=sleep_time_in_seconds,
            )
        except Exception as err:
            # TODO: raise warning here as well
            return PairwiseEvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=f"Caught error {err!s}"
            )

        if not eval_result.invalid_result:
            return PairwiseEvaluatorExamplePrediction(
                feedback=eval_result.feedback,
                score=eval_result.score,
                evaluation_source=eval_result.pairwise_source,
            )
        else:
            return PairwiseEvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=eval_result.invalid_reason
            )

    def _predict_example(
        self,
        predictor: BaseEvaluator,
        example: LabelledPairwiseEvaluatorDataExample,
        sleep_time_in_seconds: int = 0,
    ) -> PairwiseEvaluatorExamplePrediction:
        """Predict pairwise evaluation example with an evaluator."""
        time.sleep(sleep_time_in_seconds)
        try:
            eval_result: EvaluationResult = predictor.evaluate(
                query=example.query,
                response=example.answer,
                second_response=example.second_answer,
                contexts=example.contexts,
                reference=example.ground_truth_answer,
                sleep_time_in_seconds=sleep_time_in_seconds,
            )
        except Exception as err:
            # TODO: raise warning here as well
            return PairwiseEvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=f"Caught error {err!s}"
            )

        if not eval_result.invalid_result:
            return PairwiseEvaluatorExamplePrediction(
                feedback=eval_result.feedback,
                score=eval_result.score,
                evaluation_source=eval_result.pairwise_source,
            )
        else:
            return PairwiseEvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=eval_result.invalid_reason
            )

    def _construct_prediction_dataset(
        self, predictions: List[PairwiseEvaluatorExamplePrediction]
    ) -> PairwiseEvaluatorPredictionDataset:
        """Construct prediction dataset."""
        return PairwiseEvaluatorPredictionDataset(predictions=predictions)

    @property
    def class_name(self) -> str:
        """Class name."""
        return "LabelledPairwiseEvaluatorDataset"


# British English and American English aliases.
LabeledEvaluatorDataExample = LabelledEvaluatorDataExample
LabeledEvaluatorDataset = LabelledEvaluatorDataset
LabeledPairwiseEvaluatorDataExample = LabelledPairwiseEvaluatorDataExample
LabeledPairwiseEvaluatorDataset = LabelledPairwiseEvaluatorDataset
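

# A minimal end-to-end sketch, not library-confirmed usage: run a pairwise comparison
# evaluator over a labelled pairwise dataset and inspect which ordering (original vs.
# flipped) each judgement came from. `PairwiseComparisonEvaluator` and the
# `make_predictions_with` helper on `BaseLlamaDataset` are assumed to exist as in
# recent llama_index releases; the example data is hypothetical.
def _example_labelled_pairwise_evaluator_dataset_sketch() -> None:
    """Evaluate a pairwise evaluator against reference pairwise labels."""
    from llama_index.evaluation import PairwiseComparisonEvaluator  # assumed import path

    dataset = LabelledPairwiseEvaluatorDataset(
        examples=[_example_labelled_pairwise_data_example_sketch()]
    )
    evaluator = PairwiseComparisonEvaluator()  # assumed to use default LLM/service context
    predictions: PairwiseEvaluatorPredictionDataset = dataset.make_predictions_with(
        evaluator
    )

    # The "ordering" column records whether each judgement came from the original
    # or flipped answer order (or neither, when inconclusive).
    print(predictions.to_pandas())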