evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/general_qa/general_qa_adapter.py

# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path
from collections import defaultdict
from typing import List, Optional, Union

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import mean
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

logger = get_logger()


@Benchmark.register(
    name='general_qa',
    pretty_name='General-QA',
    description='General Question Answering dataset',
    tags=['QA', 'Custom'],
    dataset_id='general_qa',
    subset_list=['default'],
    metric_list=['AverageBLEU', 'AverageRouge'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='请回答问题\n{query}',  # i.e. 'Please answer the question\n{query}'
)
class GeneralQAAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
        """
        Load dataset from the given path or dataset name.

        Args:
            dataset_name_or_path (str): Path to a dataset directory or a single JSONL file.
            subset_list (list): List of subset names to load.

        Returns:
            dict: Loaded dataset organized by subset.
        """
        dataset_name_or_path = dataset_name_or_path or self.dataset_id
        subset_list = subset_list or self.subset_list

        data_file_dict = defaultdict(str)
        data_item_dict = defaultdict(list)

        # Resolve the data file path for each subset
        if os.path.isdir(dataset_name_or_path):
            for subset_name in subset_list:
                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
        elif os.path.isfile(dataset_name_or_path):
            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
            data_file_dict[cur_subset_name] = dataset_name_or_path
        else:
            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')

        # Load data from local disk
        try:
            for subset_name, file_path in data_file_dict.items():
                data_item_dict[subset_name] = jsonl_to_list(file_path)
        except Exception as e:
            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
        return data_dict
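
    # Illustrative JSONL record layout (an assumption inferred from the fields read in
    # gen_prompt() and get_gold_answer() below; the actual schema is whatever the custom
    # dataset provides):
    #   {"query": "What is the capital of France?", "response": "Paris"}
    #   {"question": "...", "answer": "...", "system": "You are a helpful assistant."}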

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate prompt for the model based on input data.

        Args:
            input_d (dict): Input data dictionary.
            subset_name (str): Name of the subset.
            few_shot_list (list): List of few-shot examples.

        Returns:
            dict: Dictionary containing the generated prompt.
        """
        messages = input_d.get('messages')
        query = input_d.get('question', '') or input_d.get('query', '')
        system_prompt = input_d.get('system')

        prompt = self.prompt_template.format(query=query)
        return self.gen_prompt_data(prompt, system_prompt=system_prompt, messages=messages)
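
    # Example (illustrative): with the default prompt_template, a record like
    # {"query": "1+1=?"} produces the prompt '请回答问题\n1+1=?', which gen_prompt_data()
    # packages together with any optional 'system' prompt or 'messages' list.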

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Extract the gold (reference) answer from the input data.

        Args:
            input_d (dict): Input data dictionary.

        Returns:
            str: Gold answer string.
        """
        return input_d.get('answer') or input_d.get('response')

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        """
        Parse the prediction result.

        Args:
            result (str): Model prediction result.
            raw_input_d (dict, optional): Original input data.
            eval_type (str): Evaluation type.

        Returns:
            str: Parsed prediction result.
        """
        return result

    def match(self, gold: str, pred: str) -> dict:
        """
        Compute metric scores between gold and predicted answers.

        Args:
            gold (str): Gold answer.
            pred (str): Predicted answer.

        Returns:
            dict: Dictionary of computed metric scores.
        """
        # Reference-free case: no gold answer to compare against, return a placeholder score
        if gold is None:
            return {'AverageAccuracy': -1}

        # Calculate ROUGE and BLEU scores
        res = dict()
        if 'AverageRouge' in self.metric_list:
            from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh

            rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
            res.update(rouge_dict)
        if 'AverageBLEU' in self.metric_list:
            from evalscope.metrics import bleu_ngram_one_sample

            bleu_dict = bleu_ngram_one_sample(pred, gold)
            res.update(bleu_dict)
        return res
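
    # Example (illustrative): the returned dict merges the per-sample scores from
    # compute_rouge_score_one_sample_zh() and bleu_ngram_one_sample(); the exact key
    # names (e.g. 'Rouge-L-F' or 'bleu-4') are determined by those helper functions.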

    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
        """
        Compute weighted mean of the metric scores for all samples.

        Args:
            review_res_list (list): List of metric score dictionaries.

        Returns:
            list: List of dictionaries with averaged metric results.
        """
        items = super().compute_dict_metric(review_res_list, **kwargs)
        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
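
    # Illustrative aggregation (assumed key names): if compute_dict_metric() groups the
    # per-sample scores as {'Rouge-L-F': [0.5, 0.3], 'bleu-4': [0.2, 0.4]}, this method
    # returns [{'metric_name': 'Rouge-L-F', 'score': 0.4, 'num': 2},
    #          {'metric_name': 'bleu-4', 'score': 0.3, 'num': 2}].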