80 lines
2.6 KiB
Python
80 lines
2.6 KiB
Python
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
"""Mathematics Aptitude Test of Heuristics (MATH) dataset."""
|
|
|
|
import datasets
|
|
import json
|
|
import os
|
|
|
|
_CITATION = """\
|
|
@article{hendrycksmath2021,
|
|
title={Measuring Mathematical Problem Solving With the MATH Dataset},
|
|
author={Dan Hendrycks
|
|
and Collin Burns
|
|
and Saurav Kadavath
|
|
and Akul Arora
|
|
and Steven Basart
|
|
and Eric Tang
|
|
and Dawn Song
|
|
and Jacob Steinhardt},
|
|
journal={arXiv preprint arXiv:2103.03874},
|
|
year={2021}
|
|
}
|
|
"""
|
|
|
|
_DESCRIPTION = """\
|
|
The Mathematics Aptitude Test of Heuristics (MATH) dataset consists of problems
|
|
from mathematics competitions, including the AMC 10, AMC 12, AIME, and more.
|
|
Each problem in MATH has a full step-by-step solution, which can be used to teach
|
|
models to generate answer derivations and explanations.
|
|
"""
|
|
|
|
_HOMEPAGE = 'https://github.com/hendrycks/math'
|
|
|
|
_LICENSE = 'https://github.com/hendrycks/math/blob/main/LICENSE'
|
|
|
|
# Original data URL: "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
|
|
_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/math/MATH.zip'
|
|
|
|
|
|
class CompetitionMathDataset(datasets.GeneratorBasedBuilder):
|
|
"""Mathematics Aptitude Test of Heuristics (MATH) dataset."""
|
|
|
|
VERSION = datasets.Version('1.0.0')
|
|
|
|
def _info(self):
|
|
features = datasets.Features({
|
|
'problem': datasets.Value('string'),
|
|
'level': datasets.Value('string'),
|
|
'type': datasets.Value('string'),
|
|
'solution': datasets.Value('string'),
|
|
})
|
|
return datasets.DatasetInfo(
|
|
description=_DESCRIPTION,
|
|
features=features,
|
|
supervised_keys=None,
|
|
homepage=_HOMEPAGE,
|
|
license=_LICENSE,
|
|
citation=_CITATION,
|
|
)
|
|
|
|
def _split_generators(self, dl_manager):
|
|
"""Returns SplitGenerators."""
|
|
download_dir = dl_manager.download_and_extract(_URL)
|
|
return [
|
|
datasets.SplitGenerator(
|
|
name=datasets.Split.TRAIN,
|
|
gen_kwargs={'data_dir': dl_manager.iter_files(os.path.join(download_dir, 'MATH', 'train'))},
|
|
),
|
|
datasets.SplitGenerator(
|
|
name=datasets.Split.TEST,
|
|
gen_kwargs={'data_dir': dl_manager.iter_files(os.path.join(download_dir, 'MATH', 'test'))},
|
|
),
|
|
]
|
|
|
|
def _generate_examples(self, data_dir):
|
|
"""Yields examples as (key, example) tuples."""
|
|
for id_, filepath in enumerate(data_dir):
|
|
with open(filepath, 'rb') as fin:
|
|
example = json.load(fin)
|
|
yield id_, example
|