evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py

from typing import Any

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match
from evalscope.metrics.completion_parsers import ResponseParser

SUBSET_LIST = ['default']

@Benchmark.register(
    name='maritime_bench',
    pretty_name='MaritimeBench',
    tags=['Maritime', 'MCQ', 'Knowledge'],
    description=
    'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from the given options.',  # noqa: E501
    dataset_id='HiDolphin/MaritimeBench',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=SUBSET_LIST,
    metric_list=['AverageAccuracy'],
    eval_split='test',
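    # English gloss of the Chinese prompt_template below (comment added for readability,
    # not part of the upstream file): "The question comes from {subset_name}. Please answer
    # the single-choice question. Output only the option letter, with no explanation; put
    # the option inside <> and output the answer directly. Example: ... Answer: <A>.
    # Current question:\n {query}"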
    prompt_template=
    '题目来自于{subset_name}请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目在船舶主推进动力装置中传动轴系在运转中承受以下复杂的应力和负荷但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:<A> 当前题目\n {query}',  # noqa: E501
)
class MaritimeBenchAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.choices = ['A', 'B', 'C', 'D']

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        prefix = ''
        query = prefix + input_d['question'] + '\n'
        available_choices = []
        for option in self.choices:
            if option in input_d and input_d[option]:
                query += option + ':' + input_d[option] + '\n'
                available_choices.append(option)
        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
        return self.gen_prompt_data(full_prompt, choices=available_choices)
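    # Illustrative note (record layout inferred from the accessors above, not taken from the
    # dataset card): each raw item is expected to carry a 'question' field plus one field per
    # option letter, e.g.
    #   {'question': '...', 'A': '...', 'B': '...', 'C': '...', 'D': '...', 'answer': 'A'}
    # gen_prompt() appends every option that is present as an "A:<text>" line and substitutes
    # the assembled text into prompt_template as {query}.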

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).

        Args:
            input_d: input raw data. Depending on the dataset.

        Returns:
            The parsed input. e.g. gold answer ... Depending on the dataset.
        """
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the raw model prediction (pred).

        Args:
            result: model prediction. Depending on the model.

        Returns:
            The parsed prediction. e.g. model answer... Depending on the model.
        """
        return ResponseParser.parse_bracketed_answer(result, options=self.choices)
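    # Illustrative note (assumption, not the upstream parser): since the prompt instructs the
    # model to wrap its choice in angle brackets (e.g. "<A>"), extraction along the lines of
    # re.search(r'<\s*([ABCD])\s*>', result) would recover the option letter. The actual logic
    # lives in ResponseParser.parse_bracketed_answer and may differ in detail.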

    def match(self, gold: Any, pred: Any) -> Any:
        """
        Match the gold answer with the predicted answer.

        Args:
            gold: The gold answer.
            pred: The predicted answer.

        Returns:
            The result of the match.
        """
        return exact_match(gold=gold, pred=pred)
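

# --- Illustrative usage sketch (not part of the upstream adapter file) ---
# Assumes evalscope's public TaskConfig / run_task API; the model id and the `limit`
# value below are placeholders for a quick smoke test, not recommendations.
if __name__ == '__main__':
    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
        datasets=['maritime_bench'],  # registered name from the decorator above
        limit=10,  # evaluate only a few samples
    )
    run_task(task_cfg=task_cfg)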