evalscope/metrics/llm_judge.py

import os
import re
from typing import Any, Dict, List, Optional

from evalscope.utils.logger import get_logger

logger = get_logger()

DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.
[Question]
{question}
[Reference Answer]
{gold}
[Predicted Answer]
{pred}
Evaluate the model's answer based on correctness compared to the reference answer.
Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
""" # noqa: E501
DEFAULT_NUMERIC_SCORE_TEMPLATE = """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
Begin your evaluation by providing a short explanation. Be as objective as possible.
After providing your explanation, you must rate the response on a scale of 0 (worst) to 1 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[0.5]]\"
[Question]
{question}
[Response]
{pred}
""" # noqa: E501
DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'


class LLMJudge:
    """
    A metric that uses an LLM to judge the quality of model predictions by comparing them with reference answers.
    """

    def __init__(
            self,
            api_key: Optional[str] = None,
            api_url: Optional[str] = None,
            model_id: Optional[str] = None,
            system_prompt: Optional[str] = None,
            prompt_template: Optional[str] = None,
            generation_config: Optional[Dict[str, Any]] = None,
            score_pattern: Optional[str] = None,
            score_mapping: Optional[Dict[str, float]] = None,
            score_type: str = 'pattern',  # 'pattern' or 'numeric'
            **kwargs):
        """
        Initialize the LLMJudge metric.

        Args:
            api_key (str, optional): API key for OpenAI or a compatible service.
            api_url (str, optional): API base URL.
            model_id (str, optional): Model ID of the judge LLM.
            system_prompt (str, optional): System prompt for the judge.
            prompt_template (str, optional): Prompt template for the judge.
            generation_config (dict, optional): Generation configuration for the judge.
            score_pattern (str, optional): Regex pattern used to extract the score from the LLM response.
            score_mapping (dict, optional): Mapping from the extracted score to a float value.
            score_type (str, optional): Score extraction strategy ('pattern' or 'numeric'); defaults to 'pattern'.
                - 'pattern': use score_pattern and score_mapping to extract categorical scores.
                - 'numeric': treat the extracted value as a direct numerical score.
        """
        self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
        self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
        self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
        self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
        self.generation_config = generation_config or {}

        self.score_type = score_type
        if self.score_type == 'numeric':
            self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
            self.prompt_template = prompt_template or os.environ.get(
                'JUDGE_PROMPT_TEMPLATE', DEFAULT_NUMERIC_SCORE_TEMPLATE)
        elif self.score_type == 'pattern':
            self.score_pattern = score_pattern or r'(A|B)'
            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
        else:
            raise ValueError(f"Invalid score_type: {self.score_type}. Must be 'pattern' or 'numeric'.")
        # Default score mapping for the A/B pattern
        self.score_mapping = score_mapping or {'A': 1.0, 'B': 0.0}

        self._init_server_adapter()
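
    # A construction sketch (values hypothetical): every default can also come from the
    # environment (MODELSCOPE_SDK_TOKEN, MODELSCOPE_API_BASE, MODELSCOPE_JUDGE_LLM,
    # JUDGE_SYSTEM_PROMPT, JUDGE_PROMPT_TEMPLATE), so an explicit call such as
    #     LLMJudge(api_url='https://my-endpoint/v1/', model_id='my-judge-model', score_type='numeric')
    # is only needed to override them.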

    def _init_server_adapter(self):
        from evalscope.models import ServerModelAdapter

        # Initialize ServerModelAdapter
        self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)

    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
        """
        Send the prompt to the judge model and return its raw text response.

        Args:
            prompt (str): The prompt to evaluate.
            system_prompt (str, optional): The system prompt to use for the evaluation.

        Returns:
            str: The response from the LLM.
        """
        input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}

        # Inference configuration
        infer_cfg = {'temperature': 0.0, 'max_tokens': 1024}
        if self.generation_config:
            infer_cfg.update(self.generation_config)
        if self.model_id == DEFAULT_JUDGE_MODEL:
            # Disable thinking for the default judge model
            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)

        try:
            # Send request using ServerModelAdapter
            response = self.server_adapter.process_single_input(input_data, infer_cfg)
            # Extract content from response
            llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
            return llm_response
        except Exception as e:
            logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
            return ''
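
    # A call sketch (values hypothetical): entries in generation_config override the
    # defaults above, e.g.
    #     LLMJudge(generation_config={'max_tokens': 256, 'enable_thinking': False})(prompt)
    # sends those settings with the request and returns the judge's raw text reply
    # (or '' if the request fails).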

    def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
        """Fill the prompt template with the prediction, reference answer and question."""
        if question is None:
            question = 'Not provided'
        # Check which placeholders are present in the template and substitute them
        prompt = self.prompt_template
        if '{question}' in self.prompt_template:
            prompt = prompt.replace('{question}', question)
        if '{pred}' in self.prompt_template:
            prompt = prompt.replace('{pred}', pred)
        if '{gold}' in self.prompt_template:
            prompt = prompt.replace('{gold}', gold)
        return prompt
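
    # A build_prompt sketch (inputs hypothetical): placeholders are filled with str.replace,
    # so any literal braces inside pred/gold/question pass through unchanged.
    #     judge.build_prompt(pred='4', gold='4', question='What is 2 + 2?')
    #     -> DEFAULT_PROMPT_TEMPLATE with {question}, {gold} and {pred} substituted.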

    def get_score(self, response: str) -> float:
        """
        Extract a score from the LLM response using the configured pattern and mapping.

        Args:
            response (str): The response from the LLM.

        Returns:
            float: The numeric score extracted from the response.
        """
        if response is None:
            return 0.0
        # Choose the extraction method based on score_type
        if self.score_type == 'numeric':
            return self._extract_numeric_score(response)
        elif self.score_type == 'pattern':
            return self._extract_pattern_score(response)

    def _extract_numeric_score(self, response: str) -> Optional[float]:
        """Extract a numeric score from the response using score_pattern."""
        match = re.search(self.score_pattern, response)
        if match:
            # Try to convert each captured group to float
            for group in match.groups():
                if group is not None:
                    try:
                        return float(group)
                    except (ValueError, TypeError):
                        continue
            # If no group converts, try the whole match
            try:
                return float(match.group(0))
            except (ValueError, TypeError):
                logger.warning(f'Failed to convert any extracted value to float from: {match.group(0)}')
        return None

    def _extract_pattern_score(self, response: str) -> float:
        """Use score_pattern to extract a categorical score and map it to a float."""
        match = re.search(self.score_pattern, response)
        if match:
            answer = match.group(0)
            return self.score_mapping.get(answer, 0.0)
        else:
            logger.warning(f"No match found for pattern '{self.score_pattern}' in response: {response}")
            return 0.0
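

# A minimal end-to-end usage sketch, assuming MODELSCOPE_SDK_TOKEN holds a valid key and
# the default ModelScope endpoint is reachable; the judge reply shown in the comments is
# illustrative, not the output of a real run.
if __name__ == '__main__':
    judge = LLMJudge(score_type='pattern')  # uses DEFAULT_PROMPT_TEMPLATE and the A/B mapping
    prompt = judge.build_prompt(
        pred='The capital of France is Paris.',
        gold='Paris',
        question='What is the capital of France?')
    response = judge(prompt)  # e.g. 'A' if the judge deems the prediction correct
    score = judge.get_score(response)  # 1.0 for 'A', 0.0 for 'B' via score_mapping
    print(score)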