329 lines
11 KiB
Python
329 lines
11 KiB
Python
import asyncio
|
|
from typing import Any, Dict, List, Optional, Sequence, Tuple, cast
|
|
|
|
from llama_index.async_utils import asyncio_module
|
|
from llama_index.core.base_query_engine import BaseQueryEngine
|
|
from llama_index.core.response.schema import RESPONSE_TYPE, Response
|
|
from llama_index.evaluation.base import BaseEvaluator, EvaluationResult
|
|
|
|
|
|
async def eval_response_worker(
    semaphore: asyncio.Semaphore,
    evaluator: BaseEvaluator,
    evaluator_name: str,
    query: Optional[str] = None,
    response: Optional[Response] = None,
    eval_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[str, EvaluationResult]:
    """Run ``evaluator.aevaluate_response`` while holding the semaphore.

    Args:
        semaphore (asyncio.Semaphore): Acquired for the duration of the
            evaluator call.
        evaluator (BaseEvaluator): Evaluator whose ``aevaluate_response``
            is awaited.
        evaluator_name (str): Label returned alongside the result.
        query (Optional[str]): Forwarded as ``query``. Defaults to None.
        response (Optional[Response]): Forwarded as ``response``.
            Defaults to None.
        eval_kwargs (Optional[Dict[str, Any]]): Extra keyword arguments
            forwarded to the evaluator. A falsy value means no extras.

    Returns:
        Tuple[str, EvaluationResult]: ``(evaluator_name, result)``.

    """
    extra_kwargs = eval_kwargs if eval_kwargs else {}
    async with semaphore:
        result = await evaluator.aevaluate_response(
            query=query, response=response, **extra_kwargs
        )
    return evaluator_name, result
|
|
|
|
|
|
async def eval_worker(
    semaphore: asyncio.Semaphore,
    evaluator: BaseEvaluator,
    evaluator_name: str,
    query: Optional[str] = None,
    response_str: Optional[str] = None,
    contexts: Optional[Sequence[str]] = None,
    eval_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[str, EvaluationResult]:
    """Run ``evaluator.aevaluate`` on string inputs while holding the semaphore.

    Args:
        semaphore (asyncio.Semaphore): Acquired for the duration of the
            evaluator call.
        evaluator (BaseEvaluator): Evaluator whose ``aevaluate`` is awaited.
        evaluator_name (str): Label returned alongside the result.
        query (Optional[str]): Forwarded as ``query``. Defaults to None.
        response_str (Optional[str]): Forwarded as ``response``.
            Defaults to None.
        contexts (Optional[Sequence[str]]): Forwarded as ``contexts``.
            Defaults to None.
        eval_kwargs (Optional[Dict[str, Any]]): Extra keyword arguments
            forwarded to the evaluator. A falsy value means no extras.

    Returns:
        Tuple[str, EvaluationResult]: ``(evaluator_name, result)``.

    """
    extra_kwargs = eval_kwargs if eval_kwargs else {}
    async with semaphore:
        result = await evaluator.aevaluate(
            query=query,
            response=response_str,
            contexts=contexts,
            **extra_kwargs,
        )
    return evaluator_name, result
|
|
|
|
|
|
async def response_worker(
    semaphore: asyncio.Semaphore,
    query_engine: BaseQueryEngine,
    query: str,
) -> RESPONSE_TYPE:
    """Run ``query_engine.aquery(query)`` while holding the semaphore.

    Args:
        semaphore (asyncio.Semaphore): Acquired for the duration of the
            query-engine call.
        query_engine (BaseQueryEngine): Engine whose ``aquery`` is awaited.
        query (str): Query string passed to the engine.

    Returns:
        RESPONSE_TYPE: Whatever ``aquery`` returned.

    """
    async with semaphore:
        answer = await query_engine.aquery(query)
    return answer
|
|
|
|
|
|
class BatchEvalRunner:
    """Batch evaluation runner.

    Runs every configured evaluator over a batch of inputs. The number of
    evaluator / query-engine calls in flight at once is capped by a
    semaphore sized by ``workers``.

    Args:
        evaluators (Dict[str, BaseEvaluator]): Dictionary of evaluators,
            keyed by the name used in the returned results.
        workers (int): Number of workers to use for parallelization.
            Defaults to 2.
        show_progress (bool): Whether to show progress bars. Defaults to False.

    """

    def __init__(
        self,
        evaluators: Dict[str, BaseEvaluator],
        workers: int = 2,
        show_progress: bool = False,
    ):
        self.evaluators = evaluators
        self.workers = workers
        # Public attribute kept for backward compatibility. The async entry
        # points obtain their semaphore through ``_get_semaphore``, which
        # replaces this object whenever the running event loop changes.
        self.semaphore = asyncio.Semaphore(self.workers)
        self._semaphore_loop: Optional[asyncio.AbstractEventLoop] = None
        self.show_progress = show_progress
        self.asyncio_mod = asyncio_module(show_progress=self.show_progress)

    def _get_semaphore(self) -> asyncio.Semaphore:
        """Return a semaphore that is safe to use on the running event loop.

        An ``asyncio.Semaphore`` is tied to a single event loop, whereas the
        sync wrappers of this class start a new loop on every call through
        ``asyncio.run``. Reusing one semaphore across those calls raises
        ``RuntimeError`` as soon as a task has to wait on it. A new semaphore
        is therefore created the first time each loop is seen; calls made on
        the same loop keep sharing one semaphore.

        Must be called from inside a running event loop.
        """
        loop = asyncio.get_running_loop()
        # ``getattr`` tolerates instances whose ``__init__`` was bypassed.
        # The strong reference to the loop also guarantees that a later loop
        # can never be mistaken for this one.
        if getattr(self, "_semaphore_loop", None) is not loop:
            self.semaphore = asyncio.Semaphore(self.workers)
            self._semaphore_loop = loop
        return self.semaphore

    def _format_results(
        self, results: List[Tuple[str, EvaluationResult]]
    ) -> Dict[str, List[EvaluationResult]]:
        """Group ``(evaluator_name, result)`` pairs by evaluator name.

        Every configured evaluator gets an entry, even when it produced no
        results. Results keep the order in which they appear in ``results``.
        """
        results_dict: Dict[str, List[EvaluationResult]] = {
            name: [] for name in self.evaluators
        }
        for name, result in results:
            results_dict[name].append(result)

        return results_dict

    def _validate_and_clean_inputs(
        self,
        *inputs_list: Any,
    ) -> List[Any]:
        """Validate and clean input lists.

        Enforce that at least one of the inputs is not None.
        Make sure that all inputs have the same length.
        Make sure that None inputs are replaced with [None] * len(inputs).

        Raises:
            ValueError: If every input is None, or if the non-None inputs
                differ in length.

        """
        assert len(inputs_list) > 0
        # The first non-None input fixes the expected length for all others.
        input_len: Optional[int] = next(
            (len(inputs) for inputs in inputs_list if inputs is not None), None
        )
        if input_len is None:
            raise ValueError("At least one item in inputs_list must be provided.")

        new_inputs_list = []
        for inputs in inputs_list:
            if inputs is None:
                new_inputs_list.append([None] * input_len)
            else:
                if len(inputs) != input_len:
                    raise ValueError("All inputs must have the same length.")
                new_inputs_list.append(inputs)
        return new_inputs_list

    def _validate_eval_kwargs_lists(
        self, eval_kwargs_lists: Dict[str, Any], expected_len: int
    ) -> Dict[str, List]:
        """Check that every extra kwarg is a list with one entry per input.

        Args:
            eval_kwargs_lists (Dict[str, Any]): Keyword name -> list of
                per-input values.
            expected_len (int): Number of inputs being evaluated.

        Returns:
            Dict[str, List]: A new dict holding the same lists.

        Raises:
            ValueError: If a value is not a list, or if its length differs
                from ``expected_len``.

        """
        validated: Dict[str, List] = {}
        for k, v in eval_kwargs_lists.items():
            if not isinstance(v, list):
                raise ValueError(
                    f"Each value in eval_kwargs must be a list. Got {k}: {v}"
                )
            # A shorter list would fail later with an IndexError and a longer
            # one would be silently truncated, so reject both up front.
            if len(v) != expected_len:
                raise ValueError(
                    "Each list in eval_kwargs must have one entry per input "
                    f"(expected {expected_len}). Got {len(v)} for {k}."
                )
            validated[k] = v
        return validated

    def _get_eval_kwargs(
        self, eval_kwargs_lists: Dict[str, Any], idx: int
    ) -> Dict[str, Any]:
        """Get eval kwargs from eval_kwargs_lists at a given idx.

        Since eval_kwargs_lists is a dict of lists, we need to get the
        value at idx for each key.

        """
        return {k: v[idx] for k, v in eval_kwargs_lists.items()}

    async def aevaluate_response_strs(
        self,
        queries: Optional[List[str]] = None,
        response_strs: Optional[List[str]] = None,
        contexts_list: Optional[List[List[str]]] = None,
        **eval_kwargs_lists: List,
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate query, response pairs.

        This evaluates queries, responses, contexts as string inputs.
        Can supply additional kwargs to the evaluator in eval_kwargs_lists.

        Args:
            queries (Optional[List[str]]): List of query strings. Defaults to None.
            response_strs (Optional[List[str]]): List of response strings.
                Defaults to None.
            contexts_list (Optional[List[List[str]]]): List of context lists.
                Defaults to None.
            **eval_kwargs_lists (List): Extra evaluator kwargs. Each keyword
                maps to a list with one value per input; the value at
                position ``i`` is passed to every evaluator for input ``i``.

        Returns:
            Dict[str, List[EvaluationResult]]: Results grouped by evaluator
            name.

        Raises:
            ValueError: If all of the inputs are None, if input lengths
                differ, or if an extra kwarg is not a list of matching length.

        """
        queries, response_strs, contexts_list = self._validate_and_clean_inputs(
            queries, response_strs, contexts_list
        )
        kwargs_lists = self._validate_eval_kwargs_lists(
            eval_kwargs_lists, len(cast(List, queries))
        )
        semaphore = self._get_semaphore()

        # One job per (input, evaluator) pair.
        eval_jobs = []
        for idx, query in enumerate(cast(List[str], queries)):
            response_str = cast(List, response_strs)[idx]
            contexts = cast(List, contexts_list)[idx]
            eval_kwargs = self._get_eval_kwargs(kwargs_lists, idx)
            for name, evaluator in self.evaluators.items():
                eval_jobs.append(
                    eval_worker(
                        semaphore,
                        evaluator,
                        name,
                        query=query,
                        response_str=response_str,
                        contexts=contexts,
                        eval_kwargs=eval_kwargs,
                    )
                )
        results = await self.asyncio_mod.gather(*eval_jobs)

        return self._format_results(results)

    async def aevaluate_responses(
        self,
        queries: Optional[List[str]] = None,
        responses: Optional[List[Response]] = None,
        **eval_kwargs_lists: List,
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate query, response pairs.

        This evaluates queries and response objects.

        Args:
            queries (Optional[List[str]]): List of query strings. Defaults to None.
            responses (Optional[List[Response]]): List of response objects.
                Defaults to None.
            **eval_kwargs_lists (List): Extra evaluator kwargs. Each keyword
                maps to a list with one value per input; the value at
                position ``i`` is passed to every evaluator for input ``i``.

        Returns:
            Dict[str, List[EvaluationResult]]: Results grouped by evaluator
            name.

        Raises:
            ValueError: If both inputs are None, if input lengths differ, or
                if an extra kwarg is not a list of matching length.

        """
        queries, responses = self._validate_and_clean_inputs(queries, responses)
        kwargs_lists = self._validate_eval_kwargs_lists(
            eval_kwargs_lists, len(cast(List, queries))
        )
        semaphore = self._get_semaphore()

        # One job per (input, evaluator) pair.
        eval_jobs = []
        for idx, query in enumerate(cast(List[str], queries)):
            response = cast(List, responses)[idx]
            eval_kwargs = self._get_eval_kwargs(kwargs_lists, idx)
            for name, evaluator in self.evaluators.items():
                eval_jobs.append(
                    eval_response_worker(
                        semaphore,
                        evaluator,
                        name,
                        query=query,
                        response=response,
                        eval_kwargs=eval_kwargs,
                    )
                )
        results = await self.asyncio_mod.gather(*eval_jobs)

        return self._format_results(results)

    async def aevaluate_queries(
        self,
        query_engine: BaseQueryEngine,
        queries: Optional[List[str]] = None,
        **eval_kwargs_lists: List,
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate queries.

        Runs every query through ``query_engine`` and then evaluates the
        resulting responses with ``aevaluate_responses``.

        Args:
            query_engine (BaseQueryEngine): Query engine.
            queries (Optional[List[str]]): List of query strings. Required
                despite the None default.
            **eval_kwargs_lists (List): Extra evaluator kwargs. Each keyword
                maps to a list with one value per query.

        Returns:
            Dict[str, List[EvaluationResult]]: Results grouped by evaluator
            name.

        Raises:
            ValueError: If ``queries`` is None, or if an extra kwarg is not a
                list with one entry per query.

        """
        if queries is None:
            raise ValueError("`queries` must be provided")

        # Fail fast on malformed kwargs before spending query-engine calls.
        self._validate_eval_kwargs_lists(eval_kwargs_lists, len(queries))

        # gather responses
        semaphore = self._get_semaphore()
        response_jobs = [
            response_worker(semaphore, query_engine, query) for query in queries
        ]
        responses = await self.asyncio_mod.gather(*response_jobs)

        return await self.aevaluate_responses(
            queries=queries,
            responses=responses,
            **eval_kwargs_lists,
        )

    def evaluate_response_strs(
        self,
        queries: Optional[List[str]] = None,
        response_strs: Optional[List[str]] = None,
        contexts_list: Optional[List[List[str]]] = None,
        **eval_kwargs_lists: List,
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate query, response pairs.

        Sync version of aevaluate_response_strs. Uses ``asyncio.run``, so it
        cannot be called from inside an already running event loop.

        """
        return asyncio.run(
            self.aevaluate_response_strs(
                queries=queries,
                response_strs=response_strs,
                contexts_list=contexts_list,
                **eval_kwargs_lists,
            )
        )

    def evaluate_responses(
        self,
        queries: Optional[List[str]] = None,
        responses: Optional[List[Response]] = None,
        **eval_kwargs_lists: List,
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate query, response objs.

        Sync version of aevaluate_responses. Uses ``asyncio.run``, so it
        cannot be called from inside an already running event loop.

        """
        return asyncio.run(
            self.aevaluate_responses(
                queries=queries,
                responses=responses,
                **eval_kwargs_lists,
            )
        )

    def evaluate_queries(
        self,
        query_engine: BaseQueryEngine,
        queries: Optional[List[str]] = None,
        **eval_kwargs_lists: List,
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate queries.

        Sync version of aevaluate_queries. Uses ``asyncio.run``, so it
        cannot be called from inside an already running event loop.

        """
        return asyncio.run(
            self.aevaluate_queries(
                query_engine=query_engine,
                queries=queries,
                **eval_kwargs_lists,
            )
        )
|