import json
import os
import re
from collections import defaultdict
from functools import lru_cache
from typing import List

import pandas as pd
import plotly.graph_objects as go
from modelscope import AutoTokenizer
from plotly.subplots import make_subplots
from tqdm.contrib.concurrent import thread_map

from evalscope.third_party.thinkbench.tools.llm import request_url
from evalscope.third_party.thinkbench.tools.utils import extract_answer
from evalscope.utils.io_utils import dict_to_json, dump_jsonl_data, json_to_dict, jsonl_to_list

cur_path = os.path.dirname(os.path.abspath(__file__))


class EvalThink:

    def __init__(self, report_path, tokenizer_path, model_name, dataset_name, subsets,
                 split_strategies='llm', judge_config=None):
        self.report_path = report_path
        self.reformat_template = open(os.path.join(cur_path, 'resources/reformat_template.txt'), 'r').read()
        self.critique_template = open(os.path.join(cur_path, 'resources/critique_template.txt'), 'r').read()
        self.switch_tokens = ['alternatively', 'but wait', 'let me reconsider', 'another way',
                              'another approach', 'another method', 'another angle']
        self.subset_dict = defaultdict(lambda: defaultdict(list))
        self.think_end_token = '</think>'
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.model_name = model_name
        self.dataset_name = dataset_name
        self.subsets = subsets
        self.metrics = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
                        'token_efficiency', 'thought_num', 'accuracy']
        self.split_strategies = split_strategies  # split by llm, keywords, separator
        self.judge_config = judge_config

        self.model_parse_file_path = os.path.join(self.report_path, 'answer_index.jsonl')
        self.model_parse_dict = self.__init_parse_file()

    def __init_parse_file(self):
        if not os.path.exists(self.model_parse_file_path):
            return {}
        else:
            list_file = jsonl_to_list(self.model_parse_file_path)
            # convert to dict: prompt as key, answer_index as value
            return {item['prompt']: item['answer_index'] for item in list_file}

    def get_think_part(self, message: dict) -> str:
        if 'reasoning_content' in message and message['reasoning_content']:
            return message['reasoning_content']
        else:
            text = message['content']
            last_think_end = text.rfind(self.think_end_token)
            return text[:last_think_end]

    @lru_cache(maxsize=None)
    def cal_tokens(self, text: str):
        return len(self.tokenizer.encode(text, add_special_tokens=False))

    def process_choice(self, choice, problem):
        think_part = self.get_think_part(choice['message'])
        answer = choice['review']['gold']

        tokens = self.cal_tokens(think_part)
        switch_count = sum(think_part.lower().count(token) for token in self.switch_tokens)
        useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
        reflection_tokens = tokens - useful_tokens
        # score = choice['review']['result']
        score = 0 if useful_tokens == 0 else 1
        return tokens, switch_count, useful_tokens, reflection_tokens, score

    def process_item(self, item):
        problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
        results = []
        for choice in item['choices']:
            results.append(self.process_choice(choice, problem))
            break  # only process the first choice

        total_tokens, switch_counts, useful_tokens, reflection_tokens, scores = zip(*results)

        avg_tokens = sum(total_tokens) / len(total_tokens)
        avg_thought_num = sum(switch_counts) / len(switch_counts)
        avg_token_efficiency = sum(useful_tokens) / sum(total_tokens)
        avg_accuracy = sum(scores) / len(scores)
        avg_useful_tokens = sum(useful_tokens) / len(useful_tokens)
        avg_reflection_tokens = sum(reflection_tokens) / len(reflection_tokens)

        return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens
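
    # Illustrative sketch of the review records this class consumes, inferred from the
    # field accesses above; real evalscope review files may carry additional keys.
    #
    #   item = {
    #       'raw_input': {'question': '...'},                # some datasets use 'problem' instead
    #       'choices': [
    #           {
    #               'message': {'content': '<think> ... </think> final answer',
    #                           'reasoning_content': None},  # used directly when present and non-empty
    #               'review': {'gold': '...'},               # gold answer handed to the judge
    #           },
    #       ],
    #   }
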
    def split_by_llm(self, response, problem) -> List[str]:
        response = response.replace('\n', ' ')  # remove newline characters
        prompt = self.reformat_template.format(problem=problem, response=response)
        llm_response = request_url(self.judge_config, prompt)
        return llm_response.split('\n\n')

    def split_by_keywords(self, text) -> List[str]:
        pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, self.switch_tokens)))
        segments = re.split(pattern, text)
        # remove empty segments
        segments = [segment.strip() for segment in segments if segment.strip()]
        return segments if segments else [text]

    def split_by_separator(self, text) -> List[str]:
        return text.split('\n\n')

    def get_answer_index(self, response: List[str], problem: str, answer: str) -> int:
        tagged_response = ''
        for sdx, step in enumerate(response):
            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
        tagged_response = tagged_response.strip()

        prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
        if prompt in self.model_parse_dict:
            answer_index = self.model_parse_dict[prompt]
        else:
            llm_response = request_url(self.judge_config, prompt)
            if not llm_response:
                answer_index = -1
            else:
                answer_index = extract_answer(llm_response)
            dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
                            self.model_parse_file_path, dump_mode='append')

        try:
            answer_index = int(answer_index)
        except Exception:
            answer_index = -1

        return answer_index

    def get_first_correct(self, response: str, problem: str, answer: str) -> str:
        if self.split_strategies == 'llm':
            text_list = self.split_by_llm(response, problem)
        elif self.split_strategies == 'keywords':
            text_list = self.split_by_keywords(response)
        else:
            text_list = self.split_by_separator(response)

        answer_index = self.get_answer_index(text_list, problem, answer)

        if answer_index == -1:  # no correct answer found
            first_correct = ''
        else:
            first_correct = '\n\n'.join(text_list[:answer_index])

        return first_correct
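
    # Illustrative walk-through of the splitting/judging pipeline above. The sample text
    # is made up, and the exact index convention returned by the judge depends on
    # resources/critique_template.txt:
    #
    #   steps = self.split_by_separator('Step 1 ...\n\nSo the answer is 5.\n\nBut wait ...')
    #   # -> ['Step 1 ...', 'So the answer is 5.', 'But wait ...']
    #
    #   get_answer_index() wraps each step in <paragraph_k> tags, asks the judge model for
    #   the first paragraph that already reaches the gold answer, and caches the result in
    #   answer_index.jsonl; get_first_correct() keeps only the steps up to that index, and
    #   process_choice() counts everything after it as reflection tokens.
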
    def plot_metrics(self, results, output_dir):
        # Change layout to 2x3
        fig = make_subplots(rows=2, cols=3,
                            subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
                                            'Token Efficiency', 'Thought Num', 'Accuracy'),
                            shared_xaxes=True,
                            x_title='Subsets',
                            vertical_spacing=0.1,    # Decrease vertical spacing between subplots
                            horizontal_spacing=0.1)  # Decrease horizontal spacing between subplots

        metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
                         'token_efficiency', 'thought_num', 'accuracy']

        for i, metric in enumerate(metrics_order, start=1):
            y_values = [results[metric][subset] for subset in self.subsets]
            # Determine row and column for 2x3 layout
            row = (i - 1) // 3 + 1
            col = (i - 1) % 3 + 1

            fig.add_trace(
                go.Scatter(x=list(range(len(self.subsets))), y=y_values,
                           mode='lines+markers', name=metric.replace('_', ' ').title()),
                row=row, col=col
            )

            # Add annotations for each data point
            for j, y in enumerate(y_values):
                fig.add_annotation(
                    x=j, y=y,
                    text=f'{y:.2f}',
                    showarrow=False,
                    yshift=10,
                    row=row, col=col
                )

        fig.update_layout(
            height=800,   # Adjust height for 2x3 layout
            width=1200,   # Adjust width for 2x3 layout
            title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
        )

        for i in range(1, len(metrics_order) + 1):
            row = (i - 1) // 3 + 1
            col = (i - 1) % 3 + 1
            fig.update_xaxes(
                ticktext=self.subsets,
                tickvals=list(range(len(self.subsets))),
                row=row, col=col
            )
            fig.update_yaxes(title_text=metrics_order[i - 1].replace('_', ' ').title(), row=row, col=col)

        # Update y-axis ranges
        fig.update_yaxes(range=[500, 5000], row=1, col=1)  # Reasoning Tokens
        fig.update_yaxes(range=[0, 3000], row=1, col=2)    # First Correct Tokens
        fig.update_yaxes(range=[0, 3000], row=1, col=3)    # Reflection Tokens
        fig.update_yaxes(range=[0, 1], row=2, col=1)       # Token Efficiency
        fig.update_yaxes(range=[0, 13], row=2, col=2)      # Thought Num
        fig.update_yaxes(range=[0, 1], row=2, col=3)       # Accuracy

        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
        fig.write_image(output_path)
        print(f'save figure to: {output_path}')

    def filter_df(self, df, response_len: int = 8000, count: int = 10):
        def is_valid_row(row):
            return all(self.cal_tokens(choice['message']['content']) <= response_len
                       for choice in row['choices'])

        bools = df.apply(is_valid_row, axis=1)
        return df[bools].head(count)

    def evaluate(self, output_dir, max_tokens=8000, count=50, workers=128):
        for subset in self.subsets:
            review_path = os.path.join(self.report_path, 'reviews', self.model_name,
                                       f'{self.dataset_name}_{subset}.jsonl')
            review_df = pd.read_json(review_path, lines=True)
            review_df = self.filter_df(review_df, response_len=max_tokens, count=count)

            results = thread_map(
                self.process_item,
                (item for _, item in review_df.iterrows()),
                desc=f'Evaluating {subset}',
                total=len(review_df),
                max_workers=workers
            )

            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens = zip(*results)

            self.subset_dict[subset]['reasoning_tokens'] = sum(avg_tokens) / len(avg_tokens)
            self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
            self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
            self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
            self.subset_dict[subset]['first_correct_tokens'] = sum(avg_useful_tokens) / len(avg_useful_tokens)
            self.subset_dict[subset]['reflection_tokens'] = sum(avg_reflection_tokens) / len(avg_reflection_tokens)

        results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
                   for metric in self.metrics}

        self.plot_metrics(results, output_dir)
        # save results to json
        dict_to_json(results, os.path.join(self.report_path, 'think_eval_results.json'))
        return results


def run_task(config, output_dir='outputs', max_tokens=8000, count=50, workers=128):
    evaluator = EvalThink(**config)
    results = evaluator.evaluate(output_dir, max_tokens, count, workers)
    print(results)
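
# Shape of the per-model results dict returned by EvalThink.evaluate() and saved to
# think_eval_results.json, which combine_results() below reads back (the numbers are
# illustrative placeholders only):
#
#   {
#       'reasoning_tokens':     {'Level 1': 812.4, ..., 'Level 5': 3120.7},
#       'first_correct_tokens': {'Level 1': 401.2, ...},
#       'reflection_tokens':    {'Level 1': 411.2, ...},
#       'token_efficiency':     {'Level 1': 0.49, ...},
#       'thought_num':          {'Level 1': 2.1, ...},
#       'accuracy':             {'Level 1': 0.95, ...},
#   }
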
def combine_results(configs: List[dict], output_path: str):
    """
    Combine evaluation results from multiple model configs into one plot.
    All models' results for the same metric are shown in the same subplot for easy comparison.

    Args:
        configs: List of model config dicts containing model_name and report_path.
        output_path: Path of the combined comparison figure (PNG).
    """
    # Combine results from different runs
    combined_results = defaultdict(lambda: defaultdict(dict))
    for config in configs:
        model_name = config['model_name']
        report_path = config['report_path']
        # Results is a dict with metric as key and subset as value
        results = json_to_dict(os.path.join(report_path, 'think_eval_results.json'))
        combined_results[model_name] = results

    # Create a 2x3 subplot layout, one subplot per metric
    fig = make_subplots(rows=2, cols=3,
                        subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
                                        'Token Efficiency', 'Thought Num', 'Accuracy'),
                        shared_xaxes=True,
                        x_title='Subsets',
                        vertical_spacing=0.08,    # Decrease vertical spacing between subplots
                        horizontal_spacing=0.05)  # Decrease horizontal spacing between subplots

    metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
                     'token_efficiency', 'thought_num', 'accuracy']

    # Assign different colors for each model
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

    # Plot each metric in a separate subplot
    for i, metric in enumerate(metrics_order, start=1):
        row = (i - 1) // 3 + 1
        col = (i - 1) % 3 + 1

        # Get subsets from first model (assuming all models have same subsets)
        subsets = list(next(iter(combined_results.values()))[metric].keys())

        # Add all models' data for this metric to the same subplot
        for j, (model_name, results) in enumerate(combined_results.items()):
            y_values = [results[metric][subset] for subset in subsets]

            fig.add_trace(
                go.Scatter(x=subsets, y=y_values,
                           mode='lines+markers',
                           name=model_name,  # Just model name since metrics are shown in subplot titles
                           line=dict(color=colors[j % len(colors)]),
                           showlegend=(i == 1)),  # Only show legend for first metric
                row=row, col=col
            )

            # Add value annotations
            for k, y in enumerate(y_values):
                fig.add_annotation(
                    x=subsets[k], y=y,
                    text=f'{y:.2f}',
                    showarrow=False,
                    yshift=10,
                    font=dict(size=12, color=colors[j % len(colors)]),
                    row=row, col=col
                )

        # Update axis ranges and labels based on metric type
        # if metric == 'token_efficiency':
        #     fig.update_yaxes(range=[0.2, 0.7], row=row, col=col)
        # elif metric == 'accuracy':
        #     fig.update_yaxes(range=[0.8, 1], row=row, col=col)
        fig.update_yaxes(title_text=metric.replace('_', ' ').title(), row=row, col=col)

    # Update layout
    fig.update_layout(
        height=1000,  # Increase height
        width=1500,   # Increase width
        title=dict(text='Model Comparison Across Evaluation Metrics on MATH-500',
                   font=dict(size=22)),  # Larger title font
        font=dict(size=14),  # Larger overall font
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=1.02,
            xanchor='right',
            x=1,
            font=dict(size=14)  # Larger legend font
        )
    )

    # Save plot
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    fig.write_image(output_path)
    print(f'Model comparison plot saved to {output_path}')

    return combined_results


judge_config = dict(
    api_key='EMPTY',
    base_url='http://0.0.0.0:8801/v1',
    model_name='Qwen2.5-72B-Instruct',
)

distill_qwen_config = dict(
    report_path='../eval-scope/outputs/20250218_180219',
    model_name='DeepSeek-R1-Distill-Qwen-7B',
    tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

math_qwen_config = dict(
    report_path='../eval-scope/outputs/20250219_202358',
    model_name='Qwen2.5-Math-7B-Instruct',
    tokenizer_path='Qwen/Qwen2.5-Math-7B-Instruct',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)
r1_config = dict(
    report_path='../eval-scope/outputs/20250307_000404',
    model_name='deepseek-r1',
    tokenizer_path='deepseek-ai/DeepSeek-R1',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

qwq_preview_config = dict(
    report_path='../eval-scope/outputs/20250221_105911',
    model_name='qwq-32b-preview',
    tokenizer_path='Qwen/QwQ-32B-Preview',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

qwq_config = dict(
    report_path='../eval-scope/outputs/20250306_181550',
    model_name='QwQ-32B',
    tokenizer_path='Qwen/QwQ-32B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

distill_qwen_32b = dict(
    report_path='../eval-scope/outputs/20250306_235951',
    model_name='deepseek-r1-distill-qwen-32b',
    tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

qwen3_32b_think = dict(
    report_path='../eval-scope/outputs/20250428_151817',
    model_name='Qwen3-32B',
    tokenizer_path='Qwen/Qwen3-32B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

if __name__ == '__main__':
    # run_task(distill_qwen_config, count=80)
    # run_task(math_qwen_config)
    # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
    # run_task(r1_config, max_tokens=20000, count=200, workers=128)
    # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
    run_task(qwen3_32b_think, max_tokens=20000, count=200, workers=128)
    # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)

    # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b],
    #                 output_path='outputs/model_comparison_metrics.png')
    # combine_results([qwq_config, r1_config, distill_qwen_32b],
    #                 output_path='outputs/model_comparison_metrics_3models.png')
    # combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b],
    #                 output_path='outputs/model_comparison_metrics_6models.png')
    combine_results([qwq_config, r1_config, distill_qwen_32b, qwen3_32b_think],
                    output_path='outputs/model_comparison_metrics_4models.png')