# isort: skip_file
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# flake8: noqa
import datasets
import os
import pandas as pd

"""The MMLU dataset on ModelScope hub. READ ONLY, DO NOT MODIFY."""

_CITATION = """\
@article{hendryckstest2021,
  title={Measuring Massive Multitask Language Understanding},
  author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
  journal={Proceedings of the International Conference on Learning Representations (ICLR)},
  year={2021}
}
"""

_DESCRIPTION = """\
Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart,
Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
"""

_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/mmlu/summary'

_LICENSE = 'MIT'

# _URL = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"
_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/mmlu/repo?Revision=master&FilePath=data.tar'

# The 57 MMLU subjects; each one is exposed as a separate builder config.
task_list = [
    'high_school_european_history',
    'business_ethics',
    'clinical_knowledge',
    'medical_genetics',
    'high_school_us_history',
    'high_school_physics',
    'high_school_world_history',
    'virology',
    'high_school_microeconomics',
    'econometrics',
    'college_computer_science',
    'high_school_biology',
    'abstract_algebra',
    'professional_accounting',
    'philosophy',
    'professional_medicine',
    'nutrition',
    'global_facts',
    'machine_learning',
    'security_studies',
    'public_relations',
    'professional_psychology',
    'prehistory',
    'anatomy',
    'human_sexuality',
    'college_medicine',
    'high_school_government_and_politics',
    'college_chemistry',
    'logical_fallacies',
    'high_school_geography',
    'elementary_mathematics',
    'human_aging',
    'college_mathematics',
    'high_school_psychology',
    'formal_logic',
    'high_school_statistics',
    'international_law',
    'high_school_mathematics',
    'high_school_computer_science',
    'conceptual_physics',
    'miscellaneous',
    'high_school_chemistry',
    'marketing',
    'professional_law',
    'management',
    'college_physics',
    'jurisprudence',
    'world_religions',
    'sociology',
    'us_foreign_policy',
    'high_school_macroeconomics',
    'computer_security',
    'moral_scenarios',
    'moral_disputes',
    'electrical_engineering',
    'astronomy',
    'college_biology',
]


class MMLUConfig(datasets.BuilderConfig):

    def __init__(self, **kwargs):
        super().__init__(version=datasets.Version('1.0.0'), **kwargs)


class MMLU(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIGS = [MMLUConfig(name=task_name) for task_name in task_list]

    def _info(self):
        features = datasets.Features({
            'input': datasets.Value('string'),
            'A': datasets.Value('string'),
            'B': datasets.Value('string'),
            'C': datasets.Value('string'),
            'D': datasets.Value('string'),
            'target': datasets.Value('string'),
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name
        # The archive holds one CSV per task and split; the few-shot 'dev'
        # files are exposed as the TRAIN split.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    'filepath':
                    os.path.join(data_dir, 'data', 'test',
                                 f'{task_name}_test.csv'),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    'filepath':
                    os.path.join(data_dir, 'data', 'val',
                                 f'{task_name}_val.csv'),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    'filepath':
                    os.path.join(data_dir, 'data', 'dev',
                                 f'{task_name}_dev.csv'),
                },
            ),
        ]

    def _generate_examples(self, filepath):
        # The raw CSVs ship without a header row (question, four options,
        # answer letter), so header=None keeps the first record from being
        # swallowed as column names.
        df = pd.read_csv(filepath, header=None)
        df.columns = ['input', 'A', 'B', 'C', 'D', 'target']
        for i, instance in enumerate(df.to_dict(orient='records')):
            yield i, instance
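

# ---------------------------------------------------------------------------
# Usage sketch, not part of the original loader: a minimal smoke test that
# loads one task through the standard `datasets` API. It assumes a `datasets`
# release that still executes local loading scripts (pre-3.0); 'anatomy' is
# an arbitrary pick from `task_list`.
if __name__ == '__main__':
    from datasets import load_dataset

    # Point load_dataset at this script; the second argument selects the
    # builder config, i.e. the MMLU subject.
    ds = load_dataset(__file__, 'anatomy')

    # Each record carries the question, the four options, and the answer key.
    print(ds['test'][0])
    # e.g. {'input': '...', 'A': '...', 'B': '...', 'C': '...', 'D': '...',
    #       'target': 'B'}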