import os
import re
from typing import Any, Dict, List, Optional

from evalscope.utils.logger import get_logger

logger = get_logger()

DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.

[Question]
{question}

[Reference Answer]
{gold}

[Predicted Answer]
{pred}

Evaluate the model's answer based on correctness compared to the reference answer.
Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT

Just return the letters "A" or "B", with no text around it.
"""  # noqa: E501

DEFAULT_NUMERIC_SCORE_TEMPLATE = """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
Begin your evaluation by providing a short explanation. Be as objective as possible.
After providing your explanation, you must rate the response on a scale of 0 (worst) to 1 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[0.5]]\"

[Question]
{question}

[Response]
{pred}
"""  # noqa: E501

DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'

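# The judge can also be configured through environment variables:
# MODELSCOPE_SDK_TOKEN (API key), MODELSCOPE_API_BASE (API URL),
# MODELSCOPE_JUDGE_LLM (judge model id), JUDGE_SYSTEM_PROMPT and JUDGE_PROMPT_TEMPLATE.
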
class LLMJudge:
    """
    A metric that uses an LLM to judge the quality of model predictions by comparing them with reference answers.
    """

    def __init__(
            self,
            api_key: Optional[str] = None,
            api_url: Optional[str] = None,
            model_id: Optional[str] = None,
            system_prompt: Optional[str] = None,
            prompt_template: Optional[str] = None,
            generation_config: Optional[Dict[str, Any]] = None,
            score_pattern: Optional[str] = None,
            score_mapping: Optional[Dict[str, float]] = None,
            score_type: str = 'pattern',  # 'pattern', 'numeric'
            **kwargs):
        """
        Initialize the LLMJudge metric.

        Args:
            api_key (str, optional): API key for OpenAI or a compatible service
            api_url (str, optional): API base URL
            model_id (str, optional): Model ID of the judge LLM
            system_prompt (str, optional): System prompt for the judge
            prompt_template (str, optional): Prompt template for the judge
            generation_config (dict, optional): Generation configuration for the judge
            score_pattern (str, optional): Regex pattern to extract the score from the LLM response
            score_mapping (dict, optional): Mapping from the extracted score to a float value
            score_type (str, optional): Score extraction strategy ('pattern' or 'numeric'), defaults to 'pattern'.
                - 'pattern': Use score_pattern and score_mapping to extract categorical scores
                - 'numeric': Treat the extracted value as a direct numerical score
        """
        self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
        self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
        self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
        self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
        self.generation_config = generation_config or {}

        # Configure the default score pattern and prompt template for the chosen score_type
        self.score_type = score_type
        if self.score_type == 'numeric':
            self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
                                                                     DEFAULT_NUMERIC_SCORE_TEMPLATE)
        elif self.score_type == 'pattern':
            self.score_pattern = score_pattern or r'(A|B)'
            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
        else:
            raise ValueError(f"Invalid score_type: {self.score_type}. Must be 'pattern' or 'numeric'.")
        self.score_mapping = score_mapping or {'A': 1.0, 'B': 0.0}
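        # With these defaults, a 'numeric' judge reply such as "Rating: [[0.75]]" is
        # matched by r'\[\[(\d+(?:\.\d+)?)\]\]' and scores 0.75, while a 'pattern'
        # reply containing "A" maps to 1.0 via score_mapping (illustrative replies).
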
        self._init_server_adapter()

    def _init_server_adapter(self):
        from evalscope.models import ServerModelAdapter

        # Initialize ServerModelAdapter
        self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)

    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
        """
        Args:
            prompt (str): The prompt to evaluate
            system_prompt (str, optional): The system prompt to use for the evaluation

        Returns:
            str: The response from the LLM
        """
        input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}

        # Inference configuration
        infer_cfg = {'temperature': 0.0, 'max_tokens': 1024}
        if self.generation_config:
            infer_cfg.update(self.generation_config)
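        # Values from generation_config override the defaults above, e.g. passing
        # generation_config={'temperature': 0.3} at construction raises the sampling
        # temperature (illustrative value).
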
        if self.model_id == DEFAULT_JUDGE_MODEL:
            # Disable thinking for the default judge model
            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)

        try:
            # Send request using ServerModelAdapter
            response = self.server_adapter.process_single_input(input_data, infer_cfg)

            # Extract content from response
            llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
            return llm_response
        except Exception as e:
            logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
            return ''

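    # Example (illustrative values): build_prompt(pred='Paris', gold='Paris',
    # question='What is the capital of France?') returns the default grading
    # prompt with {question}, {gold} and {pred} substituted.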
    def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
        if question is None:
            question = 'Not provided'

        # Substitute only the placeholders that are present in prompt_template
        prompt = self.prompt_template
        if '{question}' in self.prompt_template:
            prompt = prompt.replace('{question}', question)
        if '{pred}' in self.prompt_template:
            prompt = prompt.replace('{pred}', pred)
        if '{gold}' in self.prompt_template:
            prompt = prompt.replace('{gold}', gold)
        return prompt

    def get_score(self, response: str) -> float:
        """
        Extract score from LLM response using the configured pattern and mapping.

        Args:
            response (str): The response from the LLM

        Returns:
            float: The numeric score extracted from the response
        """
        if response is None:
            return 0.0

        # choose extraction method based on score_type
        if self.score_type == 'numeric':
            return self._extract_numeric_score(response)
        elif self.score_type == 'pattern':
            return self._extract_pattern_score(response)

    def _extract_numeric_score(self, response: str) -> Optional[float]:
        """Extract a numeric score from the response using the score_pattern."""
        match = re.search(self.score_pattern, response)

        if match:
            # Try to convert each captured group to float
            for group in match.groups():
                if group is not None:
                    try:
                        return float(group)
                    except (ValueError, TypeError):
                        continue

            # If no group converts, try the whole match
            try:
                return float(match.group(0))
            except (ValueError, TypeError):
                logger.warning(f'Failed to convert any extracted value to float from: {match.group(0)}')

        return None

    def _extract_pattern_score(self, response: str) -> float:
        """Extract a categorical score using the score_pattern and map it to a float."""
        match = re.search(self.score_pattern, response)
        if match:
            answer = match.group(0)
            return self.score_mapping.get(answer, 0.0)
        else:
            logger.warning(f"No match found for pattern '{self.score_pattern}' in response: {response}")
            return 0.0
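

# Minimal usage sketch (illustrative, not part of the library's test suite): it
# assumes a reachable judge endpoint and the MODELSCOPE_SDK_TOKEN environment
# variable (or an explicit api_key); the question and answers are made-up values.
if __name__ == '__main__':
    judge = LLMJudge(score_type='pattern')

    # Build the grading prompt from a hypothetical prediction and reference answer.
    grading_prompt = judge.build_prompt(
        pred='The capital of France is Paris.',
        gold='Paris',
        question='What is the capital of France?',
    )

    # Query the judge model and map its "A"/"B" verdict to 1.0/0.0.
    judge_response = judge(grading_prompt)
    print('Judge response:', judge_response)
    print('Score:', judge.get_score(judge_response))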