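"""Evaluation entry points for CMTEB/MTEB embedding benchmarks in EvalScope.

Provides a single-stage run with one embedding model, and a two-stage run in
which a cross encoder reranks the predictions saved by a dual encoder.
"""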
import os

import mteb
from tabulate import tabulate

from evalscope.backend.rag_eval import EmbeddingModel, cmteb
from evalscope.utils.logger import get_logger

logger = get_logger()


def show_results(output_folder, model, results):
    """Tabulate the main scores from an MTEB run and log where they were saved."""
    model_name = model.mteb_model_meta.model_name_as_path()
    revision = model.mteb_model_meta.revision

    # flatten the results into one row per (task, split, subset)
    data = []
    for model_res in results:
        main_res = model_res.only_main_score()
        for split, score in main_res.scores.items():
            for sub_score in score:
                data.append({
                    'Model': model_name.replace('eval__', ''),
                    'Revision': revision,
                    'Task Type': main_res.task_type,
                    'Task': main_res.task_name,
                    'Split': split,
                    'Subset': sub_score['hf_subset'],
                    'Main Score': sub_score['main_score'],
                })

    # mteb writes result files under <output_folder>/<model_name>/<revision>
    save_path = os.path.join(
        output_folder,
        model_name,
        revision,
    )
    logger.info(f'Evaluation results:\n{tabulate(data, headers="keys", tablefmt="grid")}')
    logger.info(f'Evaluation results saved in {os.path.abspath(save_path)}')


def one_stage_eval(
    model_args,
    eval_args,
) -> None:
    """Run a single-stage embedding evaluation and report the results."""
    # load model
    model = EmbeddingModel.load(**model_args)

    custom_dataset_path = eval_args.pop('dataset_path', None)
    # load tasks first so their instructions can be updated
    tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'], dataset_path=custom_dataset_path)
    evaluation = mteb.MTEB(tasks=tasks)

    eval_args['encode_kwargs'] = model_args.get('encode_kwargs', {})
    # run evaluation
    results = evaluation.run(model, **eval_args)

    # save and log results
    show_results(eval_args['output_folder'], model, results)
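
# A minimal usage sketch (illustrative only: the model name, task name, and
# folder below are assumptions, not values defined in this file; the remaining
# eval_args keys are forwarded to mteb.MTEB.run):
#
# one_stage_eval(
#     model_args={'model_name_or_path': 'AI-ModelScope/bge-large-zh-v1.5'},
#     eval_args={'tasks': ['T2Retrieval'], 'output_folder': 'outputs'},
# )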


def two_stage_eval(
    model1_args,
    model2_args,
    eval_args,
) -> None:
    """Run a two-stage evaluation: the second stage reranks using the
    predictions saved by the first stage."""
    # load models
    dual_encoder = EmbeddingModel.load(**model1_args)
    cross_encoder = EmbeddingModel.load(**model2_args)

    first_stage_path = f"{eval_args['output_folder']}/stage1"
    second_stage_path = f"{eval_args['output_folder']}/stage2"

    tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'])
    for task in tasks:
        evaluation = mteb.MTEB(tasks=[task])

        # stage 1: run the dual encoder and save its predictions
        evaluation.run(
            dual_encoder,
            save_predictions=True,
            output_folder=first_stage_path,
            overwrite_results=True,
            hub=eval_args['hub'],
            limits=eval_args['limits'],
            encode_kwargs=model1_args.get('encode_kwargs', {}),
        )
        # stage 2: rerank the saved stage-1 predictions with the cross encoder
        results = evaluation.run(
            cross_encoder,
            top_k=eval_args['top_k'],
            save_predictions=True,
            output_folder=second_stage_path,
            previous_results=f'{first_stage_path}/{task.metadata.name}_default_predictions.json',
            overwrite_results=True,
            hub=eval_args['hub'],
            limits=eval_args['limits'],
            encode_kwargs=model2_args.get('encode_kwargs', {}),
        )

    # save and log results
    show_results(second_stage_path, cross_encoder, results)
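
# A minimal usage sketch (illustrative only: the model names and values are
# assumptions, not defined in this file; eval_args must supply the keys read
# above: 'tasks', 'output_folder', 'hub', 'limits', and 'top_k'; how a cross
# encoder is selected depends on EmbeddingModel.load):
#
# two_stage_eval(
#     model1_args={'model_name_or_path': 'AI-ModelScope/bge-large-zh-v1.5'},
#     model2_args={'model_name_or_path': 'BAAI/bge-reranker-large'},
#     eval_args={
#         'tasks': ['T2Retrieval'],
#         'output_folder': 'outputs',
#         'hub': 'modelscope',
#         'limits': 100,
#         'top_k': 10,
#     },
# )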