evalscope_v0.17.0/evalscope.0.17.0/evalscope/benchmarks/trivia_qa/trivia_qa.py

90 lines
3.0 KiB
Python

# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datasets
import json
import os
import pandas as pd
# BibTeX entry for the TriviaQA paper (arXiv:1705.03551); surfaced through
# ``DatasetInfo.citation``.
_CITATION: str = """\
@article{2017arXivtriviaqa,
author = {{Joshi}, Mandar and {Choi}, Eunsol and {Weld},
Daniel and {Zettlemoyer}, Luke},
title = "{triviaqa: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}",
journal = {arXiv e-prints},
year = 2017,
eid = {arXiv:1705.03551},
pages = {arXiv:1705.03551},
archivePrefix = {arXiv},
eprint = {1705.03551},
}
"""

# Short human-readable summary; surfaced through ``DatasetInfo.description``.
_DESCRIPTION: str = """\
TriviaqQA is a reading comprehension dataset containing over 650K question-answer-evidence triples.
"""

# Landing page of the dataset on ModelScope.
_HOMEPAGE: str = 'https://modelscope.cn/datasets/modelscope/trivia_qa/summary'

# Zip archive handed to the download manager when the splits are generated.
_URL: str = 'https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip'

# Names of the builder configurations exposed by this script (one config each).
task_list: list = ['default']
class TriviaQAConfig(datasets.BuilderConfig):
    """Builder configuration for TriviaQA with the version pinned to 1.0.0."""

    def __init__(self, **kwargs):
        """Initialise the config, forwarding ``kwargs`` to the base class.

        Args:
            **kwargs: Keyword arguments passed through unchanged to
                ``datasets.BuilderConfig`` (for example ``name``). ``version``
                is supplied here, so callers must not pass it as well.
        """
        pinned_version = datasets.Version('1.0.0')
        super().__init__(version=pinned_version, **kwargs)
class TriviaQA(datasets.GeneratorBasedBuilder):
    """Dataset builder that loads TriviaQA from the zip archive at ``_URL``.

    Two splits are produced, ``test`` and ``dev``, each read from a JSON Lines
    file inside the extracted archive. Every example carries the two fields
    declared in ``_info``: ``input`` (a list of ``role``/``content`` string
    pairs) and ``ideal`` (a list of strings).
    """

    # One builder configuration per entry in the module-level ``task_list``.
    BUILDER_CONFIGS = [TriviaQAConfig(name=task_name) for task_name in task_list]

    def _info(self):
        """Return the ``DatasetInfo`` holding the metadata and feature schema."""
        features = datasets.Features({
            'input': [{
                'role': datasets.Value('string'),
                'content': datasets.Value('string'),
            }],
            'ideal': [datasets.Value('string')],
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then describe the two splits.

        Args:
            dl_manager: Download manager supplied by ``datasets``; only its
                ``download_and_extract`` method is used.

        Returns:
            A list with the ``test`` and ``dev`` split generators, each
            carrying the ``filepath`` later passed to ``_generate_examples``.
        """
        data_dir = dl_manager.download_and_extract(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    'filepath': os.path.join(data_dir, 'trivia_qa/test.jsonl'),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split('dev'),
                gen_kwargs={
                    'filepath': os.path.join(data_dir, 'trivia_qa/dev.jsonl'),
                },
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(index, example)`` pairs from a JSON Lines file.

        The file is streamed line by line rather than loaded into memory in
        full, and blank lines (such as a trailing empty line) are skipped
        instead of failing JSON decoding.

        Args:
            filepath: Path of the UTF-8 encoded ``.jsonl`` file to read.

        Yields:
            Tuples of a zero-based running index over the decoded records and
            the object decoded from the corresponding line.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        with open(filepath, encoding='utf-8') as f:
            # Lazy pipeline: a line is decoded only when its example is
            # requested, so the file stays open for the generator's lifetime.
            records = (json.loads(line) for line in f if line.strip())
            yield from enumerate(records)