evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/race/race.py

# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datasets
import os
import pandas as pd

_CITATION = """\
@inproceedings{lai-etal-2017-race,
    title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
    author = "Lai, Guokun  and
      Xie, Qizhe  and
      Liu, Hanxiao  and
      Yang, Yiming  and
      Hovy, Eduard",
    booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
    month = sep,
    year = "2017",
    address = "Copenhagen, Denmark",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D17-1082",
    doi = "10.18653/v1/D17-1082",
    pages = "785--794",
}
"""

_DESCRIPTION = """\
RACE is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions.
"""

_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/race/summary'

_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/race/repo?Revision=master&FilePath=race.zip'

task_list = [
    'high',
    'middle',
]


class RACEConfig(datasets.BuilderConfig):

    def __init__(self, **kwargs):
        super().__init__(version=datasets.Version('1.0.0'), **kwargs)


class RACE(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIGS = [RACEConfig(name=task_name, ) for task_name in task_list]

    def _info(self):
        features = datasets.Features({
            'example_id': datasets.Value('string'),
            'article': datasets.Value('string'),
            'answer': datasets.Value('string'),
            'question': datasets.Value('string'),
            'options': [datasets.Value('string')],
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    'filepath': os.path.join(data_dir, f'race/test/{task_name}-00000-of-00001.parquet'),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    'filepath': os.path.join(data_dir, f'race/val/{task_name}-00000-of-00001.parquet'),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    'filepath': os.path.join(data_dir, f'race/train/{task_name}-00000-of-00001.parquet'),
                },
            ),
        ]

    def _generate_examples(self, filepath):
        df = pd.read_parquet(filepath)
        df.columns = ['example_id', 'article', 'answer', 'question', 'options']

        for i, instance in enumerate(df.to_dict(orient='records')):
            yield i, instance