# evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/trivia_qa/trivia_qa.py

# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datasets
import json
import os
import pandas as pd

# BibTeX entry for the TriviaQA paper; handed to datasets.DatasetInfo as the
# dataset citation.
_CITATION = """\
@article{2017arXivtriviaqa,
       author = {{Joshi}, Mandar and {Choi}, Eunsol and {Weld},
                 Daniel and {Zettlemoyer}, Luke},
        title = "{triviaqa: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}",
      journal = {arXiv e-prints},
         year = 2017,
          eid = {arXiv:1705.03551},
        pages = {arXiv:1705.03551},
archivePrefix = {arXiv},
       eprint = {1705.03551},
}
"""

_DESCRIPTION = """\
TriviaqQA is a reading comprehension dataset containing over 650K question-answer-evidence triples.
"""

# Dataset landing page on modelscope.cn; reported as the homepage in DatasetInfo.
_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/trivia_qa/summary'

# Zip archive that the builder's _split_generators downloads and extracts.
_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip'

# Builder configuration names; one TriviaQAConfig is created for each entry.
task_list = ['default']


class TriviaQAConfig(datasets.BuilderConfig):
    """Builder configuration for TriviaQA, pinned to version 1.0.0."""

    def __init__(self, **kwargs):
        """Initialise the config with a fixed version.

        Args:
            **kwargs: Keyword arguments forwarded unchanged to
                ``datasets.BuilderConfig`` (for example ``name``). Because
                ``version`` is always supplied here, passing it again raises
                ``TypeError``.
        """
        pinned_version = datasets.Version('1.0.0')
        super().__init__(version=pinned_version, **kwargs)


class TriviaQA(datasets.GeneratorBasedBuilder):
    """Dataset builder that reads TriviaQA examples from JSON Lines files.

    The archive at ``_URL`` is downloaded and extracted, and two splits are
    exposed: ``test`` (``trivia_qa/test.jsonl``) and ``dev``
    (``trivia_qa/dev.jsonl``).
    """

    # One configuration per name in ``task_list``.
    BUILDER_CONFIGS = [TriviaQAConfig(name=task_name) for task_name in task_list]

    def _info(self):
        """Describe the dataset schema and metadata.

        Returns:
            datasets.DatasetInfo: Metadata whose features declare ``input`` as
            a list of ``{'role': str, 'content': str}`` records and ``ideal``
            as a list of strings.
        """
        features = datasets.Features({
            'input': [{
                'role': datasets.Value('string'),
                'content': datasets.Value('string'),
            }],
            'ideal': [datasets.Value('string')],
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and declare the available splits.

        Args:
            dl_manager: Download manager supplied by ``datasets``; its
                ``download_and_extract`` method is used to fetch ``_URL``.

        Returns:
            list: Two ``datasets.SplitGenerator`` objects, ``test`` then
            ``dev``, each carrying the ``filepath`` of its JSON Lines file.
        """
        data_dir = dl_manager.download_and_extract(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    'filepath': os.path.join(data_dir, 'trivia_qa/test.jsonl'),
                },
            ),
            datasets.SplitGenerator(
                # The split is deliberately named 'dev'; callers select it by
                # that name, so it must not be changed to Split.VALIDATION.
                name=datasets.Split('dev'),
                gen_kwargs={
                    'filepath': os.path.join(data_dir, 'trivia_qa/dev.jsonl'),
                },
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from a JSON Lines file.

        The file is streamed one line at a time rather than loaded whole, and
        blank lines (such as a trailing newline at end of file) are skipped
        instead of being handed to the JSON parser.

        Args:
            filepath: Path of the UTF-8 encoded ``.jsonl`` file to read.

        Yields:
            tuple: The zero-based line index and the object decoded from that
            line.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        with open(filepath, encoding='utf-8') as f:
            for line_index, line in enumerate(f):
                if not line.strip():
                    # Keys stay tied to the line index, so they remain unique
                    # even when blank lines are skipped.
                    continue
                yield line_index, json.loads(line)