248 lines
8.8 KiB
Python
248 lines
8.8 KiB
Python
import itertools
|
|
from datasets import Dataset, DatasetDict
|
|
from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
|
|
from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast, check_label_distribution
|
|
from mteb.abstasks.TaskMetadata import TaskMetadata
|
|
|
|
# Shared sample budget for every task in this module: used as each task's
# `max_document_to_embed`, as the `n_samples` argument to
# `stratified_subsampling`, and as the reported test-split size in
# `descriptive_stats`.
NUM_SAMPLES = 2048
|
|
|
|
|
|
class CLSClusteringFastS2S(AbsTaskClusteringFast):
    """Fast clustering task over the titles in ``C-MTEB/CLSClusteringS2S``.

    ``dataset_transform`` flattens the nested label/sentence columns of each
    evaluation split and hands the result to ``stratified_subsampling`` with
    ``n_samples=NUM_SAMPLES``.
    """

    # NOTE(review): presumably consumed by AbsTaskClusteringFast to bound how
    # many documents are embedded -- confirm against the base class.
    max_document_to_embed = NUM_SAMPLES
    max_fraction_of_documents_to_embed = None

    metadata = TaskMetadata(
        name="CLSClusteringS2S",
        description="Clustering of titles from CLS dataset. Clustering of 13 sets on the main category.",
        reference="https://arxiv.org/abs/2209.05034",
        dataset={
            "path": "C-MTEB/CLSClusteringS2S",
            "revision": "e458b3f5414b62b7f9f83499ac1f5497ae2e869f",
        },
        type="Clustering",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["cmn-Hans"],
        main_score="v_measure",
        date=("2022-01-01", "2022-09-12"),
        domains=["Academic", "Written"],
        task_subtypes=["Thematic clustering", "Topic classification"],
        license="apache-2.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@misc{li2022csl,
title={CSL: A Large-scale Chinese Scientific Literature Dataset},
author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
year={2022},
eprint={2209.05034},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",  # noqa
        descriptive_stats={
            "n_samples": {
                "test": NUM_SAMPLES,
            },
            "avg_character_length": {},
        },
    )

    def dataset_transform(self):
        """Flatten each evaluation split and subsample it.

        For every split in ``metadata.eval_splits`` the ``labels`` and
        ``sentences`` columns (iterables of iterables) are chained into flat
        lists. ``check_label_distribution`` is called on the original,
        un-flattened split. ``self.dataset`` is then replaced by the
        flattened ``DatasetDict`` and finally by the output of
        ``stratified_subsampling``.
        """
        eval_splits = self.metadata.eval_splits
        flattened = {}
        for split_name in eval_splits:
            raw_split = self.dataset[split_name]
            # Same column order as the resulting Dataset: labels, sentences.
            columns = {
                column: list(itertools.chain.from_iterable(raw_split[column]))
                for column in ("labels", "sentences")
            }

            check_label_distribution(raw_split)

            flattened[split_name] = Dataset.from_dict(columns)

        self.dataset = DatasetDict(flattened)
        self.dataset = self.stratified_subsampling(
            self.dataset,
            self.seed,
            eval_splits,
            label="labels",
            n_samples=NUM_SAMPLES,
        )
|
|
|
|
|
|
class CLSClusteringFastP2P(AbsTaskClusteringFast):
    """Fast clustering task over titles + abstracts in ``C-MTEB/CLSClusteringP2P``.

    ``dataset_transform`` flattens the nested label/sentence columns of each
    evaluation split and hands the result to ``stratified_subsampling`` with
    ``n_samples=NUM_SAMPLES``.
    """

    # NOTE(review): presumably consumed by AbsTaskClusteringFast to bound how
    # many documents are embedded -- confirm against the base class.
    max_document_to_embed = NUM_SAMPLES
    max_fraction_of_documents_to_embed = None

    metadata = TaskMetadata(
        name="CLSClusteringP2P",
        description="Clustering of titles + abstract from CLS dataset. Clustering of 13 sets on the main category.",
        reference="https://arxiv.org/abs/2209.05034",
        dataset={
            "path": "C-MTEB/CLSClusteringP2P",
            "revision": "4b6227591c6c1a73bc76b1055f3b7f3588e72476",
        },
        type="Clustering",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["cmn-Hans"],
        main_score="v_measure",
        date=("2022-01-01", "2022-09-12"),
        domains=["Academic", "Written"],
        task_subtypes=["Thematic clustering", "Topic classification"],
        license="apache-2.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@misc{li2022csl,
title={CSL: A Large-scale Chinese Scientific Literature Dataset},
author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
year={2022},
eprint={2209.05034},
archivePrefix={arXiv},
primaryClass={cs.CL}
}""",  # noqa
        descriptive_stats={
            "n_samples": {
                "test": NUM_SAMPLES,
            },
            "avg_character_length": {},
        },
    )

    def dataset_transform(self):
        """Flatten each evaluation split and subsample it.

        For every split in ``metadata.eval_splits`` the ``labels`` and
        ``sentences`` columns (iterables of iterables) are chained into flat
        lists. ``check_label_distribution`` is called on the original,
        un-flattened split. ``self.dataset`` is then replaced by the
        flattened ``DatasetDict`` and finally by the output of
        ``stratified_subsampling``.
        """
        eval_splits = self.metadata.eval_splits
        flattened = {}
        for split_name in eval_splits:
            raw_split = self.dataset[split_name]
            # Same column order as the resulting Dataset: labels, sentences.
            columns = {
                column: list(itertools.chain.from_iterable(raw_split[column]))
                for column in ("labels", "sentences")
            }

            check_label_distribution(raw_split)

            flattened[split_name] = Dataset.from_dict(columns)

        self.dataset = DatasetDict(flattened)
        self.dataset = self.stratified_subsampling(
            self.dataset,
            self.seed,
            eval_splits,
            label="labels",
            n_samples=NUM_SAMPLES,
        )
|
|
|
|
|
|
class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
    """Fast clustering task over the titles in ``C-MTEB/ThuNewsClusteringS2S``.

    ``dataset_transform`` flattens the nested label/sentence columns of each
    evaluation split and hands the result to ``stratified_subsampling`` with
    ``n_samples=NUM_SAMPLES``.
    """

    # NOTE(review): presumably consumed by AbsTaskClusteringFast to bound how
    # many documents are embedded -- confirm against the base class.
    max_document_to_embed = NUM_SAMPLES
    max_fraction_of_documents_to_embed = None

    metadata = TaskMetadata(
        name="ThuNewsClusteringS2S",
        dataset={
            "path": "C-MTEB/ThuNewsClusteringS2S",
            "revision": "8a8b2caeda43f39e13c4bc5bea0f8a667896e10d",
        },
        description="Clustering of titles from the THUCNews dataset",
        reference="http://thuctc.thunlp.org/",
        type="Clustering",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["cmn-Hans"],
        main_score="v_measure",
        date=("2006-01-01", "2007-01-01"),
        domains=["News", "Written"],
        task_subtypes=["Thematic clustering", "Topic classification"],
        license="apache-2.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@software{THUCTC,
author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
title = {THUCTC: An Efficient Chinese Text Classifier},
year = {2016},
note = {THU Chinese Text Classification Toolkit},
publisher = {THU Natural Language Processing Lab},
url = {https://github.com/thunlp/THUCTC}
}""",
        descriptive_stats={
            "n_samples": {
                "test": NUM_SAMPLES,
            },
            "avg_character_length": {},
        },
    )

    def dataset_transform(self):
        """Flatten each evaluation split and subsample it.

        For every split in ``metadata.eval_splits`` the ``labels`` and
        ``sentences`` columns (iterables of iterables) are chained into flat
        lists. ``check_label_distribution`` is called on the original,
        un-flattened split. ``self.dataset`` is then replaced by the
        flattened ``DatasetDict`` and finally by the output of
        ``stratified_subsampling``.
        """
        eval_splits = self.metadata.eval_splits
        flattened = {}
        for split_name in eval_splits:
            raw_split = self.dataset[split_name]
            # Same column order as the resulting Dataset: labels, sentences.
            columns = {
                column: list(itertools.chain.from_iterable(raw_split[column]))
                for column in ("labels", "sentences")
            }

            check_label_distribution(raw_split)

            flattened[split_name] = Dataset.from_dict(columns)

        self.dataset = DatasetDict(flattened)
        self.dataset = self.stratified_subsampling(
            self.dataset,
            self.seed,
            eval_splits,
            label="labels",
            n_samples=NUM_SAMPLES,
        )
|
|
|
|
|
|
class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
    """Fast clustering task over titles + abstracts in ``C-MTEB/ThuNewsClusteringP2P``.

    ``dataset_transform`` flattens the nested label/sentence columns of each
    evaluation split and hands the result to ``stratified_subsampling`` with
    ``n_samples=NUM_SAMPLES``.
    """

    # NOTE(review): presumably consumed by AbsTaskClusteringFast to bound how
    # many documents are embedded -- confirm against the base class.
    max_document_to_embed = NUM_SAMPLES
    max_fraction_of_documents_to_embed = None

    metadata = TaskMetadata(
        name="ThuNewsClusteringP2P",
        dataset={
            "path": "C-MTEB/ThuNewsClusteringP2P",
            "revision": "5798586b105c0434e4f0fe5e767abe619442cf93",
        },
        description="Clustering of titles + abstracts from the THUCNews dataset",
        reference="http://thuctc.thunlp.org/",
        type="Clustering",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["cmn-Hans"],
        main_score="v_measure",
        date=("2006-01-01", "2007-01-01"),
        domains=["News", "Written"],
        task_subtypes=["Thematic clustering", "Topic classification"],
        license="apache-2.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation="""@software{THUCTC,
author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
title = {THUCTC: An Efficient Chinese Text Classifier},
year = {2016},
note = {THU Chinese Text Classification Toolkit},
publisher = {THU Natural Language Processing Lab},
url = {https://github.com/thunlp/THUCTC}
}""",
        descriptive_stats={
            "n_samples": {
                "test": NUM_SAMPLES,
            },
            "avg_character_length": {},
        },
    )

    def dataset_transform(self):
        """Flatten each evaluation split and subsample it.

        For every split in ``metadata.eval_splits`` the ``labels`` and
        ``sentences`` columns (iterables of iterables) are chained into flat
        lists. ``check_label_distribution`` is called on the original,
        un-flattened split. ``self.dataset`` is then replaced by the
        flattened ``DatasetDict`` and finally by the output of
        ``stratified_subsampling``.
        """
        eval_splits = self.metadata.eval_splits
        flattened = {}
        for split_name in eval_splits:
            raw_split = self.dataset[split_name]
            # Same column order as the resulting Dataset: labels, sentences.
            columns = {
                column: list(itertools.chain.from_iterable(raw_split[column]))
                for column in ("labels", "sentences")
            }

            check_label_distribution(raw_split)

            flattened[split_name] = Dataset.from_dict(columns)

        self.dataset = DatasetDict(flattened)
        self.dataset = self.stratified_subsampling(
            self.dataset,
            self.seed,
            eval_splits,
            label="labels",
            n_samples=NUM_SAMPLES,
        )
|