105 lines
2.8 KiB
Python
105 lines
2.8 KiB
Python
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
|
|
from evalscope.run import run_task
|
|
from evalscope.utils.logger import get_logger
|
|
|
|
logger = get_logger()
|
|
|
|
gpt_4_o = {
|
|
'model_name': 'gpt-4o',
|
|
'api_base': 'http://localhost:8088/v1',
|
|
'api_key': 'EMPTY',
|
|
}
|
|
|
|
qwen2 = {
|
|
'model_name': 'qwen2.5',
|
|
'api_base': 'http://localhost:8000/v1',
|
|
'api_key': 'EMPTY',
|
|
}
|
|
|
|
|
|
def run_eval():
|
|
generate_testset_task_cfg = {
|
|
'eval_backend': 'RAGEval',
|
|
'eval_config': {
|
|
'tool': 'RAGAS',
|
|
'testset_generation': {
|
|
'docs': ['README.md'],
|
|
'test_size': 10,
|
|
'output_file': 'outputs/testset_chinese.json', # json file
|
|
'generator_llm': {
|
|
'model_name_or_path': 'Qwen/Qwen2.5-72B-Instruct',
|
|
},
|
|
'embeddings': {
|
|
'model_name_or_path': 'AI-ModelScope/m3e-base',
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
generate_zh_cfg = {
|
|
'eval_backend': 'RAGEval',
|
|
'eval_config': {
|
|
'tool': 'RAGAS',
|
|
'testset_generation': {
|
|
'docs': ['test_zh.md'],
|
|
'test_size': 10,
|
|
'output_file': 'outputs/testset_chinese.json', # json file
|
|
'generator_llm': gpt_4_o,
|
|
'embeddings': {
|
|
'model_name_or_path': 'AI-ModelScope/bge-large-zh',
|
|
},
|
|
'language': 'chinese',
|
|
},
|
|
},
|
|
}
|
|
|
|
eval_task_cfg = {
|
|
'eval_backend': 'RAGEval',
|
|
'eval_config': {
|
|
'tool': 'RAGAS',
|
|
'eval': {
|
|
'testset_file': 'outputs/testset_chinese_with_answer.json',
|
|
'critic_llm': qwen2,
|
|
'embeddings': {
|
|
'model_name_or_path': 'AI-ModelScope/bge-large-zh',
|
|
},
|
|
'metrics': [
|
|
'Faithfulness',
|
|
'AnswerRelevancy',
|
|
'ContextPrecision',
|
|
'AnswerCorrectness',
|
|
],
|
|
'language': 'chinese',
|
|
},
|
|
},
|
|
}
|
|
|
|
multi_modal_eval_task_cfg = {
|
|
'eval_backend': 'RAGEval',
|
|
'eval_config': {
|
|
'tool': 'RAGAS',
|
|
'eval': {
|
|
'testset_file': 'outputs/testset_multi_modal.json',
|
|
'critic_llm': gpt_4_o,
|
|
'embeddings': {
|
|
'model_name_or_path': 'AI-ModelScope/bge-large-zh',
|
|
},
|
|
'metrics': [
|
|
'MultiModalFaithfulness',
|
|
'MultiModalRelevance',
|
|
],
|
|
'language': 'chinese',
|
|
},
|
|
},
|
|
}
|
|
|
|
# Run task
|
|
# run_task(task_cfg=eval_task_cfg)
|
|
# or
|
|
run_task(task_cfg=generate_testset_task_cfg)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
run_eval()
|