import os
import re
from typing import Any, Dict, List, Optional

from evalscope.utils.logger import get_logger

logger = get_logger()

DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.

[Question]
{question}

[Reference Answer]
{gold}

[Predicted Answer]
{pred}

Evaluate the model's answer based on correctness compared to the reference answer.

Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT

Just return the letters "A" or "B", with no text around it.
"""  # noqa: E501

DEFAULT_NUMERIC_SCORE_TEMPLATE = """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 0 (worst) to 1 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[0.5]]\"

[Question]
{question}

[Response]
{pred}
"""  # noqa: E501

DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
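
# Configuration can also be supplied via environment variables (as read in
# LLMJudge.__init__ below): MODELSCOPE_SDK_TOKEN (API key), MODELSCOPE_API_BASE
# (API URL), MODELSCOPE_JUDGE_LLM (judge model ID), JUDGE_SYSTEM_PROMPT and
# JUDGE_PROMPT_TEMPLATE (prompts). Explicit constructor arguments take
# precedence over these variables.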


class LLMJudge:
    """
    A metric that uses an LLM to judge the quality of model predictions by
    comparing them with reference answers.
    """

    def __init__(
            self,
            api_key: Optional[str] = None,
            api_url: Optional[str] = None,
            model_id: Optional[str] = None,
            system_prompt: Optional[str] = None,
            prompt_template: Optional[str] = None,
            generation_config: Optional[Dict[str, Any]] = None,
            score_pattern: Optional[str] = None,
            score_mapping: Optional[Dict[str, float]] = None,
            score_type: str = 'pattern',  # 'pattern', 'numeric'
            **kwargs):
        """
        Initialize the LLMJudge metric.

        Args:
            api_key (str, optional): API key for OpenAI or a compatible service.
            api_url (str, optional): API base URL.
            model_id (str, optional): Model ID of the judge LLM.
            system_prompt (str, optional): System prompt for the judge.
            prompt_template (str, optional): Prompt template for the judge.
            generation_config (dict, optional): Generation configuration for the judge.
            score_pattern (str, optional): Regex pattern used to extract the score from the LLM response.
            score_mapping (dict, optional): Mapping from the extracted score to a float value.
            score_type (str, optional): Score extraction strategy ('pattern' or 'numeric'); defaults to 'pattern'.
                - 'pattern': use score_pattern and score_mapping to extract categorical scores.
                - 'numeric': treat the extracted value as a direct numerical score.
        """
        self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
        self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
        self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
        self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
        self.generation_config = generation_config or {}

        self.score_type = score_type
        if self.score_type == 'numeric':
            self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
                                                                     DEFAULT_NUMERIC_SCORE_TEMPLATE)
        elif self.score_type == 'pattern':
            self.score_pattern = score_pattern or r'(A|B)'
            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
        else:
            raise ValueError(f"Invalid score_type: {self.score_type}. Must be 'pattern' or 'numeric'.")
        # Default score mapping for the A/B pattern
        self.score_mapping = score_mapping or {'A': 1.0, 'B': 0.0}

        self._init_server_adapter()

    def _init_server_adapter(self):
        from evalscope.models import ServerModelAdapter

        # Initialize the ServerModelAdapter used to send requests to the judge model
        self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)

    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
        """
        Query the judge model with the given prompt and return its raw text response.

        Args:
            prompt (str): The prompt to evaluate.
            system_prompt (str, optional): The system prompt to use for the evaluation.

        Returns:
            str: The response from the LLM, or an empty string if the request failed.
        """
        input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}

        # Inference configuration
        infer_cfg = {'temperature': 0.0, 'max_tokens': 1024}
        if self.generation_config:
            infer_cfg.update(self.generation_config)
        if self.model_id == DEFAULT_JUDGE_MODEL:
            # Disable thinking for the default judge model
            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)

        try:
            # Send the request using ServerModelAdapter
            response = self.server_adapter.process_single_input(input_data, infer_cfg)

            # Extract the content from the response
            llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
            return llm_response
        except Exception as e:
            logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
            return ''

    def build_prompt(self, pred: str, gold: str, question: Optional[str] = None) -> str:
        """Fill the prompt template with the question, gold answer, and prediction."""
        if question is None:
            question = 'Not provided'
        # Only substitute the placeholders that are present in the template
        prompt = self.prompt_template
        if '{question}' in self.prompt_template:
            prompt = prompt.replace('{question}', question)
        if '{pred}' in self.prompt_template:
            prompt = prompt.replace('{pred}', pred)
        if '{gold}' in self.prompt_template:
            prompt = prompt.replace('{gold}', gold)
        return prompt
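
    # Score extraction with the default patterns maps judge output roughly as follows
    # (illustrative shapes derived from the templates above, not live model output):
    #   score_type='numeric': 'Rating: [[0.5]]' -> 0.5
    #   score_type='pattern': 'A' -> 1.0 and 'B' -> 0.0 via score_mapping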

    def get_score(self, response: str) -> Optional[float]:
        """
        Extract the score from the LLM response using the configured pattern and mapping.

        Args:
            response (str): The response from the LLM.

        Returns:
            Optional[float]: The numeric score extracted from the response. In 'numeric'
                mode, returns None when no score can be extracted.
        """
        if response is None:
            return 0.0

        # Choose the extraction method based on score_type
        if self.score_type == 'numeric':
            return self._extract_numeric_score(response)
        elif self.score_type == 'pattern':
            return self._extract_pattern_score(response)

    def _extract_numeric_score(self, response: str) -> Optional[float]:
        """Extract a numeric score from the response using score_pattern."""
        match = re.search(self.score_pattern, response)
        if match:
            # Try to convert each captured group to float
            for group in match.groups():
                if group is not None:
                    try:
                        return float(group)
                    except (ValueError, TypeError):
                        continue
            # If no group converts, try the whole match
            try:
                return float(match.group(0))
            except (ValueError, TypeError):
                logger.warning(f'Failed to convert any extracted value to float from: {match.group(0)}')
        return None

    def _extract_pattern_score(self, response: str) -> float:
        """Use score_pattern to extract a categorical score and map it to a float."""
        match = re.search(self.score_pattern, response)
        if match:
            answer = match.group(0)
            return self.score_mapping.get(answer, 0.0)
        else:
            logger.warning(f"No match found for pattern '{self.score_pattern}' in response: {response}")
            return 0.0
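

# --- Minimal usage sketch (illustrative only) ---
# Assumes a reachable OpenAI-compatible endpoint: by default the ModelScope
# inference API with a valid MODELSCOPE_SDK_TOKEN set in the environment. The
# question, gold answer, and prediction below are made-up examples.
if __name__ == '__main__':
    judge = LLMJudge(score_type='pattern')  # A/B correctness grading
    judge_prompt = judge.build_prompt(
        pred='The capital of France is Paris.',  # hypothetical model prediction
        gold='Paris',                            # hypothetical reference answer
        question='What is the capital of France?',
    )
    raw_response = judge(judge_prompt)       # e.g. 'A' (depends on the judge model)
    score = judge.get_score(raw_response)    # 'A' -> 1.0, 'B' -> 0.0
    logger.info(f'Judge response: {raw_response!r}, score: {score}')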