from collections import defaultdict
from typing import Any, Dict

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match
from evalscope.metrics.completion_parsers import ResponseParser
from evalscope.utils.logger import get_logger

logger = get_logger()

# The 57 MMLU subject subsets covered by MMLU-Redux.
SUBSET_LIST = [
    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
    'world_religions'
]

SUBJECT_MAPPING = {
    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
    'anatomy': ['Anatomy', 'health', 'Other'],
    'astronomy': ['Astronomy', 'physics', 'STEM'],
    'business_ethics': ['Business Ethics', 'business', 'Other'],
    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
    'college_biology': ['College Biology', 'biology', 'STEM'],
    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
    'college_medicine': ['College Medicine', 'health', 'Other'],
    'college_physics': ['College Physics', 'physics', 'STEM'],
    'computer_security': ['Computer Security', 'computer science', 'STEM'],
    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
    'econometrics': ['Econometrics', 'economics', 'Social Science'],
    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
    'global_facts': ['Global Facts', 'other', 'Other'],
    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
    'human_aging': ['Human Aging', 'health', 'Other'],
    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
    'international_law': ['International Law', 'law', 'Humanities'],
    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
    'management': ['Management', 'business', 'Other'],
    'marketing': ['Marketing', 'business', 'Other'],
    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
    'nutrition': ['Nutrition', 'health', 'Other'],
    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
    'prehistory': ['Prehistory', 'history', 'Humanities'],
    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
    'professional_law': ['Professional Law', 'law', 'Humanities'],
    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
    'public_relations': ['Public Relations', 'politics', 'Social Science'],
    'security_studies': ['Security Studies', 'politics', 'Social Science'],
    'sociology': ['Sociology', 'culture', 'Social Science'],
    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
    'virology': ['Virology', 'health', 'Other'],
    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
}
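
# Each SUBJECT_MAPPING value is [display name, sub-field, top-level category];
# the adapter only consumes the last element (the category) when it builds
# `category_map` in MMLUReduxAdapter.__init__ below.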

@Benchmark.register(
    name='mmlu_redux',
    pretty_name='MMLU-Redux',
    tags=['MCQ', 'Knowledge'],
    description=
    'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.',  # noqa: E501
    dataset_id='AI-ModelScope/mmlu-redux-2.0',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=SUBSET_LIST,
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template=
    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}',  # noqa: E501
)
class MMLUReduxAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        if self.few_shot_num > 0:
            self.few_shot_num = 0
            logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')

        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}

    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        # few_shot_num is forced to 0 in __init__, so this branch is effectively a no-op for MMLU-Redux.
        if self.few_shot_num > 0:
            prefix = self.format_fewshot_examples(few_shot_list)
        else:
            prefix = ''
        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
            self.__form_options(input_d['choices']) + '\n'

        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
        return self.gen_prompt_data(full_prompt)
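
    # Example of a fully rendered prompt (question and options are invented for illustration;
    # the wrapper text comes from prompt_template above, line-wrapped here for readability):
    #
    #   The following are multiple choice questions (with answers) about abstract_algebra.
    #   Think step by step and then finish your answer with "the answer is (X)" where X is
    #   the correct letter choice.
    #   Q: Which of the following is the identity element of (Z, +)?
    #   Options are:
    #   (A): 0
    #   (B): 1
    #   (C): -1
    #   (D): 2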

    def format_fewshot_examples(self, few_shot_list):
        # Load few-shot prompts for each category.
        prompts = ''
        for d in few_shot_list:
            prompts += 'Q: ' + d['question'] + '\n' + \
                self.__form_options(d['choices']) + '\n'
        return prompts

    def __form_options(self, options: list):
        option_str = 'Options are:\n'
        for opt, choice in zip(options, self.choices):
            option_str += f'({choice}): {opt}\n'
        return option_str
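
    # zip() pairs options with letters from self.choices, so at most 10 options (A-J) are
    # labelled. For an invented item with choices ['True', 'False'], the result is:
    #   Options are:
    #   (A): True
    #   (B): False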

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).

        Args:
            input_d: Input raw data. Depending on the dataset.

        Returns:
            The gold answer letter, e.g. 'A'.
        """
        answer_index = int(input_d['answer'])
        return self.choices[answer_index]
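
    # Example: an item whose raw 'answer' field is 2 maps to self.choices[2] == 'C'.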

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract the proper answer.

        Args:
            result: Predicted answer from the model. Usually a string for chat.
            raw_input_d: The raw input. Depending on the dataset.
            eval_type: 'checkpoint', 'service' or 'custom', default: 'checkpoint'.

        Returns:
            The parsed answer. Depending on the dataset. Usually a string for chat.
        """
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            # The multiple-choice adapter already yields a bare option letter.
            return result
        else:
            # Free-form generations are reduced to the first option letter found in the text.
            return ResponseParser.parse_first_option(result, options=self.choices)
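
    # For a generated response such as '... so the answer is (B)', ResponseParser.parse_first_option
    # is expected to return 'B' (assuming the parser recognizes this answer pattern).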

    def match(self, gold: str, pred: str) -> float:
        """
        Match the gold answer and the predicted answer.

        Args:
            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
                e.g. 'A', extracted from get_gold_answer method.
            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
                e.g. 'B', extracted from parse_pred_result method.

        Returns:
            The match result. Usually a score (float) for chat/multiple-choice-questions.
        """
        return exact_match(gold=gold, pred=pred)
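
# Rough evaluation flow for one sample, as wired together by the evalscope framework
# (sketch only; the framework, not this module, drives these calls):
#   1. adapter.gen_prompt(input_d, subset_name, few_shot_list=[])  -> prompt sent to the model
#   2. adapter.parse_pred_result(model_output)                     -> predicted letter, e.g. 'B'
#   3. adapter.get_gold_answer(input_d)                            -> gold letter, e.g. 'C'
#   4. adapter.match(gold, pred)                                   -> 1.0 or 0.0 via exact_match
#   AverageAccuracy then aggregates the per-sample scores within each subset.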