# evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/race/race.py

# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import datasets
import pandas as pd

_CITATION = """\
@inproceedings{lai-etal-2017-race,
title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
author = "Lai, Guokun and
Xie, Qizhe and
Liu, Hanxiao and
Yang, Yiming and
Hovy, Eduard",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D17-1082",
doi = "10.18653/v1/D17-1082",
pages = "785--794",
}
"""
_DESCRIPTION = """\
RACE is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions.
"""

_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/race/summary'

_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/race/repo?Revision=master&FilePath=race.zip'

# Subsets by exam level: 'high' (high-school exams) and 'middle' (middle-school exams).
task_list = [
    'high',
    'middle',
]


class RACEConfig(datasets.BuilderConfig):

    def __init__(self, **kwargs):
        # Every subset shares the same dataset version.
        super().__init__(version=datasets.Version('1.0.0'), **kwargs)


class RACE(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIGS = [RACEConfig(name=task_name) for task_name in task_list]

    def _info(self):
        # Each example is one question about a passage, with four answer
        # options and the gold answer given as a letter ('A'-'D').
        features = datasets.Features({
            'example_id': datasets.Value('string'),
            'article': datasets.Value('string'),
            'answer': datasets.Value('string'),
            'question': datasets.Value('string'),
            'options': [datasets.Value('string')],
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )
    def _split_generators(self, dl_manager):
        # Download race.zip from ModelScope and extract it. The archive holds
        # one parquet shard per split and subset, e.g.
        # race/test/high-00000-of-00001.parquet.
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    'filepath': os.path.join(data_dir, f'race/test/{task_name}-00000-of-00001.parquet'),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    'filepath': os.path.join(data_dir, f'race/val/{task_name}-00000-of-00001.parquet'),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    'filepath': os.path.join(data_dir, f'race/train/{task_name}-00000-of-00001.parquet'),
                },
            ),
        ]
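
    # Illustrative shape of one record yielded below (values invented for
    # clarity, not taken from the real data):
    #   {
    #       'example_id': 'high1.txt',
    #       'article': 'Last week, ...',
    #       'answer': 'B',
    #       'question': 'What is the main idea of the passage?',
    #       'options': ['...', '...', '...', '...'],
    #   }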
    def _generate_examples(self, filepath):
        df = pd.read_parquet(filepath)
        # Rename columns positionally; this assumes the parquet files store
        # the fields in exactly this order.
        df.columns = ['example_id', 'article', 'answer', 'question', 'options']
        for i, instance in enumerate(df.to_dict(orient='records')):
            yield i, instance
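

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original script). Assumes this file is
# saved locally as race.py, that modelscope.cn is reachable, and a `datasets`
# version recent enough to accept `trust_remote_code` (older releases load
# local scripts without the flag).
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from datasets import load_dataset

    # Build the 'high' subset and inspect one test example.
    race_high = load_dataset('race.py', name='high', split='test', trust_remote_code=True)
    sample = race_high[0]
    print(sample['question'])
    for letter, option in zip('ABCD', sample['options']):
        print(f'  {letter}. {option}')
    print('answer:', sample['answer'])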