# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. """HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019. """ """DO NOT EDIT.""" import datasets import json # flake8: noqa # HomePage: https://rowanzellers.com/hellaswag/ # GitHub: https://github.com/rowanz/hellaswag _CITATION = """\ @inproceedings{zellers2019hellaswag, title={HellaSwag: Can a Machine Really Finish Your Sentence?}, author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, year={2019} } """ _DESCRIPTION = """ HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019. """ _URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/hellaswag/' _URLS = { 'train': _URL + 'hellaswag_train.jsonl', 'test': _URL + 'hellaswag_test.jsonl', 'dev': _URL + 'hellaswag_val.jsonl', } class Hellaswag(datasets.GeneratorBasedBuilder): """TODO(hellaswag): Short description of my dataset.""" # TODO(hellaswag): Set up version. VERSION = datasets.Version('0.1.0') def _info(self): # TODO(hellaswag): Specifies the datasets.DatasetInfo object return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description=_DESCRIPTION, # datasets.features.FeatureConnectors features=datasets.Features({ # These are the features of your dataset like images, labels ... 'ind': datasets.Value('int32'), 'activity_label': datasets.Value('string'), 'ctx_a': datasets.Value('string'), 'ctx_b': datasets.Value('string'), 'ctx': datasets.Value('string'), 'endings': datasets.features.Sequence(datasets.Value('string')), 'source_id': datasets.Value('string'), 'split': datasets.Value('string'), 'split_type': datasets.Value('string'), 'label': datasets.Value('string'), }), # If there's a common (input, target) tuple from the features, # specify them here. They'll be used if as_supervised=True in # builder.as_dataset. supervised_keys=None, # Homepage of the dataset for documentation homepage='https://rowanzellers.com/hellaswag/', citation=_CITATION, ) def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # TODO(hellaswag): Downloads the data and defines the splits # dl_manager is a datasets.download.DownloadManager that can be used to # download and extract URLs urls_to_download = _URLS dl_dir = dl_manager.download_and_extract(urls_to_download) return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={'filepath': dl_dir['train']}, ), datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={'filepath': dl_dir['test']}, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={'filepath': dl_dir['dev']}, ), ] def _generate_examples(self, filepath): """Yields examples.""" # TODO(hellaswag): Yields (key, example) tuples from the dataset with open(filepath, encoding='utf-8') as f: for id_, row in enumerate(f): data = json.loads(row) yield id_, { 'ind': int(data['ind']), 'activity_label': data['activity_label'], 'ctx_a': data.get('ctx_a', ''), 'ctx_b': data.get('ctx_b', ''), 'ctx': data['ctx'], 'endings': data.get('endings', []), 'source_id': data['source_id'], 'split': data['split'], 'split_type': data['split_type'], 'label': str(data.get('label', '')), }