faiss_rag_enterprise/llama_index/evaluation/batch_runner.py

import asyncio
from typing import Any, Dict, List, Optional, Sequence, Tuple, cast

from llama_index.async_utils import asyncio_module
from llama_index.core.base_query_engine import BaseQueryEngine
from llama_index.core.response.schema import RESPONSE_TYPE, Response
from llama_index.evaluation.base import BaseEvaluator, EvaluationResult


async def eval_response_worker(
    semaphore: asyncio.Semaphore,
    evaluator: BaseEvaluator,
    evaluator_name: str,
    query: Optional[str] = None,
    response: Optional[Response] = None,
    eval_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[str, EvaluationResult]:
    """Get aevaluate_response tasks with semaphore."""
    eval_kwargs = eval_kwargs or {}
    async with semaphore:
        return (
            evaluator_name,
            await evaluator.aevaluate_response(
                query=query, response=response, **eval_kwargs
            ),
        )


async def eval_worker(
    semaphore: asyncio.Semaphore,
    evaluator: BaseEvaluator,
    evaluator_name: str,
    query: Optional[str] = None,
    response_str: Optional[str] = None,
    contexts: Optional[Sequence[str]] = None,
    eval_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[str, EvaluationResult]:
    """Get aevaluate tasks with semaphore."""
    eval_kwargs = eval_kwargs or {}
    async with semaphore:
        return (
            evaluator_name,
            await evaluator.aevaluate(
                query=query, response=response_str, contexts=contexts, **eval_kwargs
            ),
        )


async def response_worker(
    semaphore: asyncio.Semaphore,
    query_engine: BaseQueryEngine,
    query: str,
) -> RESPONSE_TYPE:
    """Get aquery tasks with semaphore."""
    async with semaphore:
        return await query_engine.aquery(query)


class BatchEvalRunner:
    """Batch evaluation runner.

    Args:
        evaluators (Dict[str, BaseEvaluator]): Dictionary of evaluators.
        workers (int): Number of workers to use for parallelization.
            Defaults to 2.
        show_progress (bool): Whether to show progress bars. Defaults to False.

    """

    def __init__(
        self,
        evaluators: Dict[str, BaseEvaluator],
        workers: int = 2,
        show_progress: bool = False,
    ):
        self.evaluators = evaluators
        self.workers = workers
        self.semaphore = asyncio.Semaphore(self.workers)
        self.show_progress = show_progress
        self.asyncio_mod = asyncio_module(show_progress=self.show_progress)

    def _format_results(
        self, results: List[EvaluationResult]
    ) -> Dict[str, List[EvaluationResult]]:
        """Format results."""
        # Format results
        results_dict: Dict[str, List[EvaluationResult]] = {
            name: [] for name in self.evaluators
        }
        for name, result in results:
            results_dict[name].append(result)

        return results_dict

    def _validate_and_clean_inputs(
        self,
        *inputs_list: Any,
    ) -> List[Any]:
        """Validate and clean input lists.

        Enforce that at least one of the inputs is not None.
        Make sure that all inputs have the same length.
        Make sure that None inputs are replaced with [None] * len(inputs).

        """
        assert len(inputs_list) > 0
        # first, make sure at least one of queries or response_strs is not None
        input_len: Optional[int] = None
        for inputs in inputs_list:
            if inputs is not None:
                input_len = len(inputs)
                break
        if input_len is None:
            raise ValueError("At least one item in inputs_list must be provided.")

        new_inputs_list = []
        for inputs in inputs_list:
            if inputs is None:
                new_inputs_list.append([None] * input_len)
            else:
                if len(inputs) != input_len:
                    raise ValueError("All inputs must have the same length.")
                new_inputs_list.append(inputs)
        return new_inputs_list

    def _get_eval_kwargs(
        self, eval_kwargs_lists: Dict[str, Any], idx: int
    ) -> Dict[str, Any]:
        """Get eval kwargs from eval_kwargs_lists at a given idx.

        Since eval_kwargs_lists is a dict of lists, we need to get the
        value at idx for each key.

        """
        return {k: v[idx] for k, v in eval_kwargs_lists.items()}

    async def aevaluate_response_strs(
        self,
        queries: Optional[List[str]] = None,
        response_strs: Optional[List[str]] = None,
        contexts_list: Optional[List[List[str]]] = None,
        **eval_kwargs_lists: List,
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate query, response pairs.

        This evaluates queries, responses, contexts as string inputs.
        Can supply additional kwargs to the evaluator in eval_kwargs_lists.

        Args:
            queries (Optional[List[str]]): List of query strings. Defaults to None.
            response_strs (Optional[List[str]]): List of response strings.
                Defaults to None.
            contexts_list (Optional[List[List[str]]]): List of context lists.
                Defaults to None.
            **eval_kwargs_lists (Dict[str, Any]): Dict of lists of kwargs to
                pass to evaluator. Defaults to None.

        """
        queries, response_strs, contexts_list = self._validate_and_clean_inputs(
            queries, response_strs, contexts_list
        )
        for k in eval_kwargs_lists:
            v = eval_kwargs_lists[k]
            if not isinstance(v, list):
                raise ValueError(
                    f"Each value in eval_kwargs must be a list. Got {k}: {v}"
                )
            eval_kwargs_lists[k] = self._validate_and_clean_inputs(v)[0]

        # run evaluations
        eval_jobs = []
        for idx, query in enumerate(cast(List[str], queries)):
            response_str = cast(List, response_strs)[idx]
            contexts = cast(List, contexts_list)[idx]
            eval_kwargs = self._get_eval_kwargs(eval_kwargs_lists, idx)
            for name, evaluator in self.evaluators.items():
                eval_jobs.append(
                    eval_worker(
                        self.semaphore,
                        evaluator,
                        name,
                        query=query,
                        response_str=response_str,
                        contexts=contexts,
                        eval_kwargs=eval_kwargs,
                    )
                )
        results = await self.asyncio_mod.gather(*eval_jobs)

        # Format results
        return self._format_results(results)

    async def aevaluate_responses(
        self,
        queries: Optional[List[str]] = None,
        responses: Optional[List[Response]] = None,
        **eval_kwargs_lists: Dict[str, Any],
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate query, response pairs.

        This evaluates queries and response objects.

        Args:
            queries (Optional[List[str]]): List of query strings. Defaults to None.
            responses (Optional[List[Response]]): List of response objects.
                Defaults to None.
            **eval_kwargs_lists (Dict[str, Any]): Dict of lists of kwargs to
                pass to evaluator. Defaults to None.

        """
        queries, responses = self._validate_and_clean_inputs(queries, responses)
        for k in eval_kwargs_lists:
            v = eval_kwargs_lists[k]
            if not isinstance(v, list):
                raise ValueError(
                    f"Each value in eval_kwargs must be a list. Got {k}: {v}"
                )
            eval_kwargs_lists[k] = self._validate_and_clean_inputs(v)[0]

        # run evaluations
        eval_jobs = []
        for idx, query in enumerate(cast(List[str], queries)):
            response = cast(List, responses)[idx]
            eval_kwargs = self._get_eval_kwargs(eval_kwargs_lists, idx)
            for name, evaluator in self.evaluators.items():
                eval_jobs.append(
                    eval_response_worker(
                        self.semaphore,
                        evaluator,
                        name,
                        query=query,
                        response=response,
                        eval_kwargs=eval_kwargs,
                    )
                )
        results = await self.asyncio_mod.gather(*eval_jobs)

        # Format results
        return self._format_results(results)

    async def aevaluate_queries(
        self,
        query_engine: BaseQueryEngine,
        queries: Optional[List[str]] = None,
        **eval_kwargs_lists: Dict[str, Any],
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate queries.

        Args:
            query_engine (BaseQueryEngine): Query engine.
            queries (Optional[List[str]]): List of query strings. Defaults to None.
            **eval_kwargs_lists (Dict[str, Any]): Dict of lists of kwargs to
                pass to evaluator. Defaults to None.

        """
        if queries is None:
            raise ValueError("`queries` must be provided")

        # gather responses
        response_jobs = []
        for query in queries:
            response_jobs.append(response_worker(self.semaphore, query_engine, query))
        responses = await self.asyncio_mod.gather(*response_jobs)

        return await self.aevaluate_responses(
            queries=queries,
            responses=responses,
            **eval_kwargs_lists,
        )

    def evaluate_response_strs(
        self,
        queries: Optional[List[str]] = None,
        response_strs: Optional[List[str]] = None,
        contexts_list: Optional[List[List[str]]] = None,
        **eval_kwargs_lists: List,
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate query, response pairs.

        Sync version of aevaluate_response_strs.

        """
        return asyncio.run(
            self.aevaluate_response_strs(
                queries=queries,
                response_strs=response_strs,
                contexts_list=contexts_list,
                **eval_kwargs_lists,
            )
        )

    def evaluate_responses(
        self,
        queries: Optional[List[str]] = None,
        responses: Optional[List[Response]] = None,
        **eval_kwargs_lists: Dict[str, Any],
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate query, response objs.

        Sync version of aevaluate_responses.

        """
        return asyncio.run(
            self.aevaluate_responses(
                queries=queries,
                responses=responses,
                **eval_kwargs_lists,
            )
        )

    def evaluate_queries(
        self,
        query_engine: BaseQueryEngine,
        queries: Optional[List[str]] = None,
        **eval_kwargs_lists: Dict[str, Any],
    ) -> Dict[str, List[EvaluationResult]]:
        """Evaluate queries.

        Sync version of aevaluate_queries.

        """
        return asyncio.run(
            self.aevaluate_queries(
                query_engine=query_engine,
                queries=queries,
                **eval_kwargs_lists,
            )
        )