evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/aigc/t2i/base.py


from typing import List, Optional, Union

from evalscope.benchmarks import DataAdapter
from evalscope.metrics import mean, metric_registry
from evalscope.utils.logger import get_logger

logger = get_logger()


class T2IBaseAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        logger.info(f'Initializing metrics: {self.metric_list}')
        # Instantiate one metric object per name in self.metric_list
        self.metrics = {m: metric_registry.get(m).object() for m in self.metric_list}
    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # dummy prompt for general t2i
        return self.gen_prompt_data(prompt=input_d.get('prompt', ''), id=input_d.get('id', 0))

    def get_gold_answer(self, input_d: dict) -> str:
        # dummy gold answer for general t2i
        return input_d.get('prompt', '')
    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        # dummy parse pred result for general t2i: the prediction is the generated image path
        # fall back to the raw input's image_path, guarding against raw_input_d being None
        return result or (raw_input_d or {}).get('image_path', '')
    def match(self, gold: str, pred: str) -> dict:
        # dummy match for general t2i
        # pred is the generated image path, gold is the prompt
        res = {}
        for metric_name, metric_func in self.metrics.items():
            score = metric_func(images=[pred], texts=[gold])[0][0]
            if isinstance(score, dict):
                # some metrics return a dict of named sub-scores (tensors)
                for k, v in score.items():
                    res[f'{metric_name}_{k}'] = v.cpu().item()
            else:
                # single tensor score
                res[metric_name] = score.cpu().item()
        return res
    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
        """
        Compute the mean of each metric over all samples.

        Args:
            review_res_list: [score1, score2, ...]

        Returns:
            avg_res: List[dict]
        """
        items = super().compute_dict_metric(review_res_list, **kwargs)
        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
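

# ---------------------------------------------------------------------------
# Illustrative sketch only: `match` above assumes every registered metric is
# a callable that accepts `images` (paths) and `texts` (prompts) keyword
# arguments and returns a nested structure where result[0][0] is either a
# tensor-like score or a dict of tensor-like sub-scores. The `_FakeScore`
# class and the sample paths below are hypothetical, used purely to show that
# call shape; they are not part of the evalscope metric registry API.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import torch

    class _FakeScore:

        def __call__(self, images: list, texts: list):
            # one row per image, one entry per text; match() reads [0][0]
            return [[torch.tensor(0.42) for _ in texts] for _ in images]

    metric = _FakeScore()
    score = metric(images=['outputs/img_0.png'], texts=['a red bicycle'])[0][0]
    print(score.cpu().item())  # -> 0.42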