# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# flake8: noqa
import os

import datasets
import pandas as pd

_CITATION = """\
@misc{li2023cmmlu,
      title={CMMLU: Measuring massive multitask language understanding in Chinese},
      author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin},
      year={2023},
      eprint={2306.09212},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
"""

_DESCRIPTION = """\
CMMLU is a comprehensive Chinese assessment suite specifically designed to evaluate the advanced knowledge
and reasoning abilities of LLMs within the Chinese language and cultural context.
"""

_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/cmmlu/summary'

# _URL = r"https://huggingface.co/datasets/haonan-li/cmmlu/resolve/main/cmmlu_v1_0_1.zip"
_URL = r'https://modelscope.cn/api/v1/datasets/modelscope/cmmlu/repo?Revision=master&FilePath=cmmlu_v1_0_1.zip'

# The 67 CMMLU sub-tasks; each one becomes a builder config below.
task_list = [
    'agronomy',
    'anatomy',
    'ancient_chinese',
    'arts',
    'astronomy',
    'business_ethics',
    'chinese_civil_service_exam',
    'chinese_driving_rule',
    'chinese_food_culture',
    'chinese_foreign_policy',
    'chinese_history',
    'chinese_literature',
    'chinese_teacher_qualification',
    'clinical_knowledge',
    'college_actuarial_science',
    'college_education',
    'college_engineering_hydrology',
    'college_law',
    'college_mathematics',
    'college_medical_statistics',
    'college_medicine',
    'computer_science',
    'computer_security',
    'conceptual_physics',
    'construction_project_management',
    'economics',
    'education',
    'electrical_engineering',
    'elementary_chinese',
    'elementary_commonsense',
    'elementary_information_and_technology',
    'elementary_mathematics',
    'ethnology',
    'food_science',
    'genetics',
    'global_facts',
    'high_school_biology',
    'high_school_chemistry',
    'high_school_geography',
    'high_school_mathematics',
    'high_school_physics',
    'high_school_politics',
    'human_sexuality',
    'international_law',
    'journalism',
    'jurisprudence',
    'legal_and_moral_basis',
    'logical',
    'machine_learning',
    'management',
    'marketing',
    'marxist_theory',
    'modern_chinese',
    'nutrition',
    'philosophy',
    'professional_accounting',
    'professional_law',
    'professional_medicine',
    'professional_psychology',
    'public_relations',
    'security_study',
    'sociology',
    'sports_science',
    'traditional_chinese_medicine',
    'virology',
    'world_history',
    'world_religions',
]


class CMMLUConfig(datasets.BuilderConfig):

    def __init__(self, **kwargs):
        super().__init__(version=datasets.Version('1.0.1'), **kwargs)
        # V1.0.1 Fix: one comma missing in world_religions.csv
        # V1.0.0 Init version


class CMMLU(datasets.GeneratorBasedBuilder):
    # One config per sub-task, so users select a subject via the config name.
    BUILDER_CONFIGS = [CMMLUConfig(name=task_name) for task_name in task_list]

    def _info(self):
        # Every row is a four-choice question with a single-letter answer.
        features = datasets.Features({
            'Question': datasets.Value('string'),
            'A': datasets.Value('string'),
            'B': datasets.Value('string'),
            'C': datasets.Value('string'),
            'D': datasets.Value('string'),
            'Answer': datasets.Value('string'),
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        # Download and unpack the release archive once; every split of every
        # sub-task is a CSV file inside it.
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    'filepath': os.path.join(data_dir, f'test/{task_name}.csv'),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split('dev'),
                gen_kwargs={
                    'filepath': os.path.join(data_dir, f'dev/{task_name}.csv'),
                },
            ),
        ]

    def _generate_examples(self, filepath):
        # The first CSV column is a row index: drop it via index_col=0 and
        # yield one example dict per question.
        df = pd.read_csv(filepath, header=0, index_col=0, encoding='utf-8')
        for i, instance in enumerate(df.to_dict(orient='records')):
            yield i, instance
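

# A minimal usage sketch, not part of the loading script itself. It assumes
# this file is saved locally as `cmmlu.py` and that the installed `datasets`
# version still supports script-based loading; the sub-task 'anatomy' is an
# arbitrary illustrative choice. The __main__ guard keeps the demo from
# running when the `datasets` library imports this module.
if __name__ == '__main__':
    from datasets import load_dataset

    # Select a sub-task by passing its name as the config name.
    cmmlu_anatomy = load_dataset('cmmlu.py', name='anatomy')
    # Each example is a dict with Question, A, B, C, D and Answer fields.
    print(cmmlu_anatomy['test'][0])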