evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/frames/frames_adapter.py

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import LLMJudge, exact_match

TEMPLATE_0SHOT = """Please read the following text and answer the question below.
<text>
{context}
</text>
{question}
Format your response as follows: "Therefore, the answer is (insert answer here)"."""
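
# The closing "Therefore, the answer is ..." instruction is the cue that
# `FramesAdapter.parse_pred_result` keys on when extracting the final answer.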


@Benchmark.register(
    name='frames',
    pretty_name='FRAMES',
    tags=['Reasoning', 'Long Context'],
    description=
    'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
    dataset_id='iic/frames',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.GENERATION],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template=TEMPLATE_0SHOT,
)
class FramesAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def load(self, **kwargs):
        # FRAMES has a single 'default' subset backed by test.jsonl; load it from the dataset snapshot.
        kwargs['file_structure'] = {'default': ['test.jsonl']}
        data_dict = super().load_with_snapshot(**kwargs)
        return data_dict

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate the model prompt from input data.
        """
        # Join every retrieved wiki item into one long context, each as a "title\ntext" block.
        context = '\n'.join([f"{i['title']}\n{i['text']}" for i in input_d['wiki_items']])
        question = input_d['Prompt']
        prompt = self.prompt_template.format(context=context, question=question)
        return self.gen_prompt_data(prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        return input_d['Answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract the final answer.
        """
        # Drop markdown emphasis so bolded cues like '**the answer is**' still match.
        response = result.replace('*', '')
        if 'the answer is' in response:
            # Take the text after the last occurrence of the cue and strip trailing punctuation.
            ans = response.rsplit('the answer is', 1)[-1].strip().strip('.').strip()
        else:
            ans = ''
        return ans
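
    # Example: a completion ending in "... Therefore, the answer is **Paris**." is
    # parsed to "Paris"; if the cue phrase never appears, the prediction is treated
    # as empty and will not match a non-empty gold answer.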

    def match(self, gold: str, pred: str) -> float:
        """
        Match the gold answer and the predicted answer.
        """
        from .utils import normalize_answer
        gold = normalize_answer(gold)
        pred = normalize_answer(pred)
        return exact_match(gold=gold, pred=pred)
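
    # Example (assuming SQuAD-style normalization in `.utils.normalize_answer`,
    # i.e. lowercasing and stripping punctuation/articles): gold "The Eiffel Tower"
    # and pred "Eiffel Tower." would both normalize to "eiffel tower" and count as
    # an exact match.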

    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
        """
        Grade the predicted answer against the gold answer with an LLM judge.
        """
        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE

        raw_input = kwargs.get('raw_input', None)
        question = raw_input['Prompt']
        # Get the grading response from the judge model.
        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
        # The ORM prompt asks for a YES/NO verdict; score 1.0 on YES, otherwise 0.0.
        if 'YES' in orm_response:
            return 1.0
        else:
            return 0.0
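

# Usage sketch: assuming evalscope's standard `run_task`/`TaskConfig` entry point,
# the registered 'frames' benchmark can be run roughly as follows (the model id
# and `limit` below are placeholders):
#
#   from evalscope import TaskConfig, run_task
#
#   task_cfg = TaskConfig(
#       model='qwen2.5-7b-instruct',  # placeholder model id
#       datasets=['frames'],
#       limit=10,  # smoke-test on the first few samples
#   )
#   run_task(task_cfg=task_cfg)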