faiss_rag_enterprise/llama_index/evaluation/eval_utils.py

79 lines
2.1 KiB
Python

"""Get evaluation utils.
NOTE: These are beta functions, might change.
"""
import asyncio
from collections import defaultdict
from typing import Any, List, Optional, Tuple
import numpy as np
import pandas as pd
from llama_index.async_utils import asyncio_module
from llama_index.core.base_query_engine import BaseQueryEngine
from llama_index.evaluation.base import EvaluationResult
async def aget_responses(
questions: List[str], query_engine: BaseQueryEngine, show_progress: bool = False
) -> List[str]:
"""Get responses."""
tasks = []
for question in questions:
tasks.append(query_engine.aquery(question))
asyncio_mod = asyncio_module(show_progress=show_progress)
return await asyncio_mod.gather(*tasks)
def get_responses(
*args: Any,
**kwargs: Any,
) -> List[str]:
"""Get responses.
Sync version of aget_responses.
"""
return asyncio.run(aget_responses(*args, **kwargs))
def get_results_df(
eval_results_list: List[EvaluationResult], names: List[str], metric_keys: List[str]
) -> pd.DataFrame:
"""Get results df.
Args:
eval_results_list (List[EvaluationResult]):
List of evaluation results.
names (List[str]):
Names of the evaluation results.
metric_keys (List[str]):
List of metric keys to get.
"""
metric_dict = defaultdict(list)
metric_dict["names"] = names
for metric_key in metric_keys:
for eval_results in eval_results_list:
mean_score = np.array([r.score for r in eval_results[metric_key]]).mean()
metric_dict[metric_key].append(mean_score)
return pd.DataFrame(metric_dict)
def default_parser(eval_response: str) -> Tuple[Optional[float], Optional[str]]:
"""
Default parser function for evaluation response.
Args:
eval_response (str): The response string from the evaluation.
Returns:
Tuple[float, str]: A tuple containing the score as a float and the reasoning as a string.
"""
score_str, reasoning_str = eval_response.split("\n", 1)
score = float(score_str)
reasoning = reasoning_str.lstrip("\n")
return score, reasoning