# evalscope/benchmarks/humaneval/humaneval_adapter.py
# Copyright (c) Alibaba, Inc. and its affiliates.
import re

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.utils.logger import get_logger

logger = get_logger()

# Example:
# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa


@Benchmark.register(
name='humaneval',
pretty_name='HumanEval',
tags=['Coding'],
description=
'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.', # noqa: E501
dataset_id='modelscope/humaneval',
subset_list=['openai_humaneval'],
metric_list=['Pass@1'],
few_shot_num=0,
train_split=None,
eval_split='test',
prompt_template='Complete the following python code:\n{query}',
extra_params={
'num_workers': 4,
'timeout': 4
},
)
class HumanevalAdapter(DataAdapter):
"""
A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
"""

    def __init__(self, **kwargs):
try:
from human_eval.data import stream_jsonl, write_jsonl
from human_eval.evaluation import check_correctness
except ImportError:
            raise ImportError('Please install human_eval: '
                              'https://github.com/openai/human-eval/tree/master#installation . '
                              'Note that you need to enable code execution in human_eval/execution.py first.')
super().__init__(**kwargs)
extra_params = kwargs.get('extra_params', {})
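        # k values for Pass@k; only Pass@1 is reported (see metric_list above)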
self.k = [1]
self.num_workers = extra_params.get('num_workers', 4)
self.timeout = extra_params.get('timeout', 4)
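        # Reuse human_eval's helpers: stream_jsonl to read problems, write_jsonl
        # to dump results, and check_correctness to run candidate code against the tests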
self.read_problems_func = stream_jsonl
self.write_jsonl_func = write_jsonl
self.eval_func = check_correctness

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
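        """
        Load HumanEval problems from a local jsonl file and nest them under
        each subset and the eval split.
        """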
data_dict = {}
for subset_name in subset_list:
data_dict[subset_name] = {}
# [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
return data_dict

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
"""
Generate prompt for the model.
Args:
input_d (dict): The raw input. A single data format of the Humaneval:
{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
"""
query = input_d['prompt']
full_prompt = self.prompt_template.format(query=query)
return self.gen_prompt_data(full_prompt)

    @classmethod
def _postprocess(cls, text: str) -> str:
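        """
        Normalize a raw completion into a bare function body: keep only the
        first fenced code block if present, cut everything up to and including
        the `def` line when the block starts with imports, truncate at the
        first blank line, drop a repeated `def` signature, and ensure the body
        is indented by four spaces so it can be appended to the prompt.

        For example, a fenced completion that restates the signature
        (`def f(x):` followed by `    return x`) is reduced to the body
        line `    return x`.
        """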
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if len(blocks) == 0:
text = text.split('```')[1] # fall back to default strategy
else:
text = blocks[0] # fetch the first code block
if not text.startswith('\n'): # in case starting with ```python
text = text[max(text.find('\n') + 1, 0):]
if text.strip().startswith('from') or text.strip().startswith('import'):
def_idx = text.find('def')
if def_idx != -1:
text = text[max(text.find('\n', def_idx) + 1, 0):]
text = text.split('\n\n')[0]
if text.strip().startswith('def'):
text = '\n'.join(text.split('\n')[1:])
        if not text.startswith('    '):
            if text.startswith(' '):
                text = '    ' + text.lstrip()
            else:
                text = '\n'.join(['    ' + line for line in text.split('\n')])
return text

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
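        """
        Post-process the raw model output into a code completion that can be
        concatenated with the original prompt for execution.
        """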
return self._postprocess(result)

    def get_gold_answer(self, input_d: dict) -> dict:
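        """
        Return the full problem record: check_correctness needs the prompt,
        entry_point and test fields rather than a single reference answer.
        """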
return input_d

    def match(self, gold: dict, pred: str) -> float:
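        """
        Execute the completion against the problem's unit tests via
        human_eval's check_correctness; returns 1.0 if all asserts pass,
        otherwise 0.0.
        """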
res = self.eval_func(gold, pred, self.timeout)
return float(res['passed'])