import asyncio
import os

from datasets import Dataset

from evalscope.backend.rag_eval import LLM, EmbeddingModel
from evalscope.backend.rag_eval.ragas.tasks.translate_prompt import translate_prompts
from evalscope.utils.logger import get_logger

from .arguments import EvaluationArguments

logger = get_logger()


def rag_eval(args: EvaluationArguments) -> None:
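    """Evaluate a RAG test set with ragas metrics.

    The critic LLM and embedding model are loaded from ``args``, the requested
    metrics are imported from ``ragas.metrics`` by class name, their prompts are
    translated into ``args.language``, and the resulting scores are written to a
    ``*_score.json`` file next to ``args.testset_file``.
    """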
    import importlib

    from ragas import RunConfig, evaluate
    from ragas.llms import LangchainLLMWrapper

    def dynamic_import(*function_names):
        """Instantiate the named metric classes from ``ragas.metrics``."""
        functions = []
        for name in function_names:
            module = importlib.import_module('ragas.metrics')
            functions.append(getattr(module, name)())
        return functions

    # load the critic LLM and embedding model used for scoring
    llm = LLM.load(**args.critic_llm)
    embedding = EmbeddingModel.load(**args.embeddings)

    # load metrics by class name
    metrics = dynamic_import(*args.metrics)
    # translate the metric prompts into the target language
    asyncio.run(
        translate_prompts(
            prompts=metrics,
            target_lang=args.language,
            llm=LangchainLLMWrapper(llm),
            adapt_instruction=True,
        ))

    # load dataset
    dataset = Dataset.from_json(args.testset_file)

    # evaluate with a single worker and generous timeout/retry settings
    runconfig = RunConfig(timeout=600, max_retries=2, max_wait=60, max_workers=1)
    score = evaluate(
        dataset,
        metrics=metrics,
        llm=llm,
        embeddings=embedding,
        run_config=runconfig,
    )
    score_df = score.to_pandas()
    logger.info(score_df)

    # save scores next to the test set file
    output_path = args.testset_file.replace('.json', '_score.json')
    score_df.to_json(output_path, indent=4, index=False, orient='records', force_ascii=False)

    logger.info(f'Eval score saved to {output_path}')
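

# Example usage (illustrative sketch): the attribute names below mirror the ones
# accessed in rag_eval, but the EvaluationArguments constructor signature and the
# concrete model/metric/language values are assumptions.
#
#   args = EvaluationArguments(
#       testset_file='outputs/testset.json',
#       critic_llm={'model_name': 'qwen-plus'},
#       embeddings={'model_name': 'text-embedding-v1'},
#       metrics=['Faithfulness', 'AnswerRelevancy'],
#       language='english',
#   )
#   rag_eval(args)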