# evalscope/backend/rag_eval/cmteb/tasks/Clustering.py
import itertools
from datasets import Dataset, DatasetDict
from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast, check_label_distribution
from mteb.abstasks.TaskMetadata import TaskMetadata
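
# Each task embeds at most NUM_SAMPLES documents per split; dataset_transform
# stratified-subsamples every evaluation split down to this size before scoring.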
NUM_SAMPLES = 2048


class CLSClusteringFastS2S(AbsTaskClusteringFast):
    max_document_to_embed = NUM_SAMPLES
    max_fraction_of_documents_to_embed = None
    metadata = TaskMetadata(
        name='CLSClusteringS2S',
        description='Clustering of titles from the CSL dataset (13 sets clustered by main category).',
        reference='https://arxiv.org/abs/2209.05034',
        dataset={
            'path': 'C-MTEB/CLSClusteringS2S',
            'revision': 'e458b3f5414b62b7f9f83499ac1f5497ae2e869f',
        },
        type='Clustering',
        category='s2s',
        modalities=['text'],
        eval_splits=['test'],
        eval_langs=['cmn-Hans'],
        main_score='v_measure',
        date=('2022-01-01', '2022-09-12'),
        domains=['Academic', 'Written'],
        task_subtypes=['Thematic clustering', 'Topic classification'],
        license='apache-2.0',
        annotations_creators='derived',
        dialect=[],
        sample_creation='found',
        bibtex_citation="""@misc{li2022csl,
  title={CSL: A Large-scale Chinese Scientific Literature Dataset},
  author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
  year={2022},
  eprint={2209.05034},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}""",  # noqa
        descriptive_stats={
            'n_samples': {'test': NUM_SAMPLES},
            'avg_character_length': {},
        },
    )

    def dataset_transform(self):
        # Raw C-MTEB clustering splits store one list of sentences and one list
        # of labels per row; flatten both into a single flat list per split.
        # E.g. {'sentences': [['t1', 't2'], ['t3']], 'labels': [[0, 0], [1]]}
        # becomes sentences ['t1', 't2', 't3'] with labels [0, 0, 1].
        ds = {}
        for split in self.metadata.eval_splits:
            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
            sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
            check_label_distribution(self.dataset[split])
            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
        self.dataset = DatasetDict(ds)

        # Subsample each split to NUM_SAMPLES documents, stratified by label, so
        # embedding cost stays bounded while label proportions are preserved.
        self.dataset = self.stratified_subsampling(
            self.dataset,
            self.seed,
            self.metadata.eval_splits,
            label='labels',
            n_samples=NUM_SAMPLES,
        )


class CLSClusteringFastP2P(AbsTaskClusteringFast):
    max_document_to_embed = NUM_SAMPLES
    max_fraction_of_documents_to_embed = None
    metadata = TaskMetadata(
        name='CLSClusteringP2P',
        description='Clustering of titles + abstracts from the CSL dataset (13 sets clustered by main category).',
        reference='https://arxiv.org/abs/2209.05034',
        dataset={
            'path': 'C-MTEB/CLSClusteringP2P',
            'revision': '4b6227591c6c1a73bc76b1055f3b7f3588e72476',
        },
        type='Clustering',
        category='p2p',
        modalities=['text'],
        eval_splits=['test'],
        eval_langs=['cmn-Hans'],
        main_score='v_measure',
        date=('2022-01-01', '2022-09-12'),
        domains=['Academic', 'Written'],
        task_subtypes=['Thematic clustering', 'Topic classification'],
        license='apache-2.0',
        annotations_creators='derived',
        dialect=[],
        sample_creation='found',
        bibtex_citation="""@misc{li2022csl,
  title={CSL: A Large-scale Chinese Scientific Literature Dataset},
  author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
  year={2022},
  eprint={2209.05034},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}""",  # noqa
        descriptive_stats={
            'n_samples': {'test': NUM_SAMPLES},
            'avg_character_length': {},
        },
    )

    def dataset_transform(self):
        # Same flatten + stratified-subsample transform as CLSClusteringFastS2S.
        ds = {}
        for split in self.metadata.eval_splits:
            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
            sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
            check_label_distribution(self.dataset[split])
            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
        self.dataset = DatasetDict(ds)
        self.dataset = self.stratified_subsampling(
            self.dataset,
            self.seed,
            self.metadata.eval_splits,
            label='labels',
            n_samples=NUM_SAMPLES,
        )


class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
    max_document_to_embed = NUM_SAMPLES
    max_fraction_of_documents_to_embed = None
    metadata = TaskMetadata(
        name='ThuNewsClusteringS2S',
        dataset={
            'path': 'C-MTEB/ThuNewsClusteringS2S',
            'revision': '8a8b2caeda43f39e13c4bc5bea0f8a667896e10d',
        },
        description='Clustering of titles from the THUCNews dataset.',
        reference='http://thuctc.thunlp.org/',
        type='Clustering',
        category='s2s',
        modalities=['text'],
        eval_splits=['test'],
        eval_langs=['cmn-Hans'],
        main_score='v_measure',
        date=('2006-01-01', '2007-01-01'),
        domains=['News', 'Written'],
        task_subtypes=['Thematic clustering', 'Topic classification'],
        license='apache-2.0',
        annotations_creators='derived',
        dialect=[],
        sample_creation='found',
        bibtex_citation="""@software{THUCTC,
  author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
  title = {THUCTC: An Efficient Chinese Text Classifier},
  year = {2016},
  note = {THU Chinese Text Classification Toolkit},
  publisher = {THU Natural Language Processing Lab},
  url = {https://github.com/thunlp/THUCTC}
}""",
        descriptive_stats={
            'n_samples': {'test': NUM_SAMPLES},
            'avg_character_length': {},
        },
    )

    def dataset_transform(self):
        # Same flatten + stratified-subsample transform as CLSClusteringFastS2S.
        ds = {}
        for split in self.metadata.eval_splits:
            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
            sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
            check_label_distribution(self.dataset[split])
            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
        self.dataset = DatasetDict(ds)
        self.dataset = self.stratified_subsampling(
            self.dataset,
            self.seed,
            self.metadata.eval_splits,
            label='labels',
            n_samples=NUM_SAMPLES,
        )


class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
    max_document_to_embed = NUM_SAMPLES
    max_fraction_of_documents_to_embed = None
    metadata = TaskMetadata(
        name='ThuNewsClusteringP2P',
        dataset={
            'path': 'C-MTEB/ThuNewsClusteringP2P',
            'revision': '5798586b105c0434e4f0fe5e767abe619442cf93',
        },
        description='Clustering of titles + abstracts from the THUCNews dataset.',
        reference='http://thuctc.thunlp.org/',
        type='Clustering',
        category='p2p',
        modalities=['text'],
        eval_splits=['test'],
        eval_langs=['cmn-Hans'],
        main_score='v_measure',
        date=('2006-01-01', '2007-01-01'),
        domains=['News', 'Written'],
        task_subtypes=['Thematic clustering', 'Topic classification'],
        license='apache-2.0',
        annotations_creators='derived',
        dialect=[],
        sample_creation='found',
        bibtex_citation="""@software{THUCTC,
  author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
  title = {THUCTC: An Efficient Chinese Text Classifier},
  year = {2016},
  note = {THU Chinese Text Classification Toolkit},
  publisher = {THU Natural Language Processing Lab},
  url = {https://github.com/thunlp/THUCTC}
}""",
        descriptive_stats={
            'n_samples': {'test': NUM_SAMPLES},
            'avg_character_length': {},
        },
    )

    def dataset_transform(self):
        # Same flatten + stratified-subsample transform as CLSClusteringFastS2S.
        ds = {}
        for split in self.metadata.eval_splits:
            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
            sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
            check_label_distribution(self.dataset[split])
            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
        self.dataset = DatasetDict(ds)
        self.dataset = self.stratified_subsampling(
            self.dataset,
            self.seed,
            self.metadata.eval_splits,
            label='labels',
            n_samples=NUM_SAMPLES,
        )
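

if __name__ == '__main__':
    # Minimal standalone usage sketch (an assumption, not part of this module:
    # these tasks are normally dispatched by the evalscope RAG-eval backend).
    # It runs one task through mteb's evaluation loop with a SentenceTransformer
    # model; the model name below is a hypothetical choice for Chinese text.
    from mteb import MTEB
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('moka-ai/m3e-base')  # hypothetical embedding model
    evaluation = MTEB(tasks=[CLSClusteringFastS2S()])
    evaluation.run(model, output_folder='results')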