embed-bge-m3/FlagEmbedding/research/C_MTEB/C_MTEB/tasks/PairClassification.py

116 lines
3.9 KiB
Python

from __future__ import annotations
from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification
from mteb.abstasks.TaskMetadata import TaskMetadata
class Ocnli(AbsTaskPairClassification):
    """Pair-classification task over the ``C-MTEB/OCNLI`` dataset.

    The metadata below declares the ``validation`` split as the evaluation
    split and ``max_accuracy`` as the main score.
    """

    metadata = TaskMetadata(
        name="Ocnli",
        description="Original Chinese Natural Language Inference dataset",
        reference="https://arxiv.org/abs/2010.05444",
        dataset={
            "path": "C-MTEB/OCNLI",
            "revision": "66e76a618a34d6d565d5538088562851e6daa7ec",
        },
        type="PairClassification",
        category="s2s",
        modalities=["text"],
        eval_splits=["validation"],
        eval_langs=["cmn-Hans"],
        main_score="max_accuracy",
        # The remaining descriptive fields are intentionally left unset.
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
        bibtex_citation="""@misc{hu2020ocnli,
title={OCNLI: Original Chinese Natural Language Inference},
author={Hai Hu and Kyle Richardson and Liang Xu and Lu Li and Sandra Kuebler and Lawrence S. Moss},
year={2020},
eprint={2010.05444},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",
        descriptive_stats={"n_samples": None, "avg_character_length": None},
    )

    def dataset_transform(self):
        """Rename the ``sent1``/``sent2`` columns to ``sentence1``/``sentence2``.

        ``self.dataset`` is rebound after each rename, one column at a time.
        """
        # NOTE(review): presumably the base task reads ``sentence1`` /
        # ``sentence2``; confirm against AbsTaskPairClassification.
        column_renames = (
            ("sent1", "sentence1"),
            ("sent2", "sentence2"),
        )
        for old_name, new_name in column_renames:
            self.dataset = self.dataset.rename_column(old_name, new_name)
class Cmnli(AbsTaskPairClassification):
    """Pair-classification task over the ``C-MTEB/CMNLI`` dataset.

    The metadata below declares the ``validation`` split as the evaluation
    split and ``max_accuracy`` as the main score.
    """

    metadata = TaskMetadata(
        name="Cmnli",
        description="Chinese Multi-Genre NLI",
        reference="https://huggingface.co/datasets/clue/viewer/cmnli",
        dataset={
            "path": "C-MTEB/CMNLI",
            "revision": "41bc36f332156f7adc9e38f53777c959b2ae9766",
        },
        type="PairClassification",
        category="s2s",
        modalities=["text"],
        eval_splits=["validation"],
        eval_langs=["cmn-Hans"],
        main_score="max_accuracy",
        # The remaining descriptive fields are intentionally left unset.
        date=None,
        domains=None,
        task_subtypes=None,
        license=None,
        annotations_creators=None,
        dialect=None,
        sample_creation=None,
        bibtex_citation="""@inproceedings{xu-etal-2020-clue,
title = "{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark",
author = "Xu, Liang and
Hu, Hai and
Zhang, Xuanwei and
Li, Lu and
Cao, Chenjie and
Li, Yudong and
Xu, Yechen and
Sun, Kai and
Yu, Dian and
Yu, Cong and
Tian, Yin and
Dong, Qianqian and
Liu, Weitang and
Shi, Bo and
Cui, Yiming and
Li, Junyi and
Zeng, Jun and
Wang, Rongzhao and
Xie, Weijian and
Li, Yanting and
Patterson, Yina and
Tian, Zuoyu and
Zhang, Yiwen and
Zhou, He and
Liu, Shaoweihua and
Zhao, Zhe and
Zhao, Qipeng and
Yue, Cong and
Zhang, Xinrui and
Yang, Zhengliang and
Richardson, Kyle and
Lan, Zhenzhong",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.419",
doi = "10.18653/v1/2020.coling-main.419",
pages = "4762--4772",
}""",
        descriptive_stats={"n_samples": None, "avg_character_length": None},
    )

    def dataset_transform(self):
        """Rename the ``sent1``/``sent2`` columns to ``sentence1``/``sentence2``.

        ``self.dataset`` is rebound after each rename, one column at a time.
        """
        # NOTE(review): presumably the base task reads ``sentence1`` /
        # ``sentence2``; confirm against AbsTaskPairClassification.
        column_renames = (
            ("sent1", "sentence1"),
            ("sent2", "sentence2"),
        )
        for old_name, new_name in column_renames:
            self.dataset = self.dataset.rename_column(old_name, new_name)