#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Gradio UI + EvalScope 调度脚本(量产版) 关键改动 1. 独立 Stop Evaluation 按钮,queue=False,立即触发 `stop_eval()` 2. `stop_eval()` 使用 psutil 递归杀进程树并 wait(),杜绝僵尸 3. 所有生成器统一返回 4 个输出:output_text ‖ is_running ‖ run_button 更新 ‖ stop_button 更新 """ import time import os import glob import threading import subprocess import gradio as gr import psutil import signal import shlex import pathlib # ---------------- 全局进程句柄 ---------------- current_process = None should_stop = False # ---------------- 可选数据集 ---------------- EVAL_DATASETS = [ "arc", "bbh", "ifeval", "ceval", "cmmlu", "competition_math", "gsm8k", "hellaswag", "humaneval", "mmlu", "mmlu_redux", "mmlu_pro", "race", "trivia_qa", "truthful_qa", "tool_bench" ] PERF_DATASETS = [ "openqa", "flickr8k", "longalpaca", "line_by_line", "speed_benchmark" ] def toggle_dataset_file_visibility(ds): return gr.update(visible=(ds == "line_by_line")) # ---------------- perf 模式运行 ---------------- def run_perf( inputs, native, other, output_choices, api_url, api_token, api_provider, dataset, dataset_path, max_tokens, min_tokens, parallel_reqs, max_prompt_len, num_requests, model_override, extra_args ): global current_process if dataset == "line_by_line" and dataset_path is None: msg = "[❌] 请选择 line_by_line 数据集文件 (.txt)" yield msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False) return timestamp = time.strftime("%Y%m%d-%H%M%S") model_name = model_override.strip() or timestamp command = [ "evalscope", "perf", "--url", api_url.strip(), "--api", api_provider, "--model", model_name, "--dataset", dataset, "--max-tokens", str(int(max_tokens)), "--min-tokens", str(int(min_tokens)), "--parallel", str(int(parallel_reqs)), "--max-prompt-length", str(int(max_prompt_len)), "--number", str(int(num_requests)), "--api-key", api_token.strip(), ] if dataset == "line_by_line" and dataset_path: command += ["--dataset-path", dataset_path] if extra_args.strip(): command += shlex.split(extra_args.strip()) full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n" yield full_output, True, gr.update(interactive=False), gr.update(visible=True) try: current_process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, start_new_session=True, # 独立进程组,便于后续 killpg / psutil ) for line in current_process.stdout: if should_stop: break full_output += line yield full_output, True, gr.update(interactive=False), gr.update(visible=True) current_process.stdout.close() current_process.wait() except Exception as e: full_output += f"[Error] {e}\n" yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False) finally: current_process = None if dataset_path: pathlib.Path(dataset_path).unlink(missing_ok=True) full_output += "[Eval Finished]\n" # 自动启动可视化 if "Evaluation Report" in output_choices: vis_port = 7901 outputs_root = "./outputs" try: latest_output = max( glob.glob(os.path.join(outputs_root, "*")), key=os.path.getmtime ) except ValueError: latest_output = outputs_root vis_cmd = [ "evalscope", "app", "--outputs", outputs_root, "--server-name", "0.0.0.0", "--server-port", str(vis_port), ] threading.Thread( target=subprocess.Popen, args=(vis_cmd,), kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT}, daemon=True ).start() full_output += f"[Visualization 👉] http://localhost:{vis_port}\n" yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False) # ---------------- eval 模式运行 ---------------- def run_eval_tool( inputs, native, other, output_choices, api_url, api_token, api_provider, dataset, dataset_path, max_tokens, min_tokens, parallel_reqs, max_prompt_len, num_requests, model_override, extra_args ): global current_process if dataset == "line_by_line" and dataset_path is None: msg = "[❌] 请选择 line_by_line 数据集文件 (.txt)" yield msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False) return timestamp = time.strftime("%Y%m%d-%H%M%S") model_name = model_override.strip() or timestamp command = [ "evalscope", "eval", "--model", model_name, "--datasets", dataset ] if api_url.strip(): command += [ "--eval-type", "service", "--api-url", api_url.strip(), "--api-key", api_token.strip() ] if num_requests: command += ["--limit", str(int(num_requests))] if extra_args.strip(): command += shlex.split(extra_args.strip()) if dataset == "line_by_line" and dataset_path: command += ["--dataset-path", dataset_path] full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n" yield full_output, True, gr.update(interactive=False), gr.update(visible=True) try: current_process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, start_new_session=True ) for line in current_process.stdout: if should_stop: break full_output += line yield full_output, True, gr.update(interactive=False), gr.update(visible=True) current_process.stdout.close() current_process.wait() except Exception as e: full_output += f"[Error] {e}\n" yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False) finally: current_process = None if dataset_path: pathlib.Path(dataset_path).unlink(missing_ok=True) full_output += "[Eval Finished]\n" if "Evaluation Report" in output_choices: vis_port = 7901 outputs_root = "./outputs" try: latest_output = max( glob.glob(os.path.join(outputs_root, "*")), key=os.path.getmtime ) except ValueError: latest_output = outputs_root vis_cmd = [ "evalscope", "app", "--outputs", outputs_root, "--server-name", "0.0.0.0", "--server-port", str(vis_port), ] threading.Thread( target=subprocess.Popen, args=(vis_cmd,), kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT}, daemon=True ).start() full_output += f"[Visualization 👉] http://localhost:{vis_port}\n" yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False) # ---------------- 停止函数 ---------------- def stop_eval() -> str: """ 彻底终止 current_process 及其全部子孙进程: 1. SIGINT(优雅退出,3 秒宽限) 2. 仍存活则 SIGKILL 3. wait() 主进程,防止僵尸 """ global current_process, should_stop should_stop = True if not (current_process and current_process.poll() is None): return "[⚠️ 无活动 evalscope 进程]\n" try: parent = psutil.Process(current_process.pid) family = parent.children(recursive=True) + [parent] # 1) SIGINT for p in family: p.send_signal(signal.SIGINT) _, alive = psutil.wait_procs(family, timeout=3) # 2) SIGKILL for p in alive: p.kill() psutil.wait_procs(alive, timeout=3) # 3) reap current_process.wait(timeout=3) return "[✅ 已终止进程树 (SIGINT ➜ SIGKILL fallback)]\n" except Exception as exc: return f"[❌ 终止失败: {exc}]\n" finally: current_process = None # ---------------- 控制器(仅负责启动) ---------------- def toggle_run( inputs, native, other, output_choices, api_url, api_token, api_provider, dataset, dataset_file, max_tokens, min_tokens, parallel_reqs, max_prompt_len, num_requests, model_override, extra_args, is_running, run_mode ): global should_stop dataset_path = dataset_file.name if dataset_file else None if not inputs: msg = "[❌ 错误] 必须至少选择一个输入源(API、本地、基准或自定义)才能开始运行。\n" yield msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False) return should_stop = False if run_mode == "perf": yield from run_perf( inputs, native, other, output_choices, api_url, api_token, api_provider, dataset, dataset_path, max_tokens, min_tokens, parallel_reqs, max_prompt_len, num_requests, model_override, extra_args ) elif run_mode == "eval": yield from run_eval_tool( inputs, native, other, output_choices, api_url, api_token, api_provider, dataset, dataset_path, max_tokens, min_tokens, parallel_reqs, max_prompt_len, num_requests, model_override, extra_args ) elif run_mode == "app": info = "[⚠️ 当前为 app 模式,请手动打开 http://localhost:7901 查看报告]\n" yield info, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False) # ---------------- 输入源互斥逻辑 ---------------- def enforce_input_exclusive_and_toggle_fields(selected): order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"] group1 = {"API Models", "Local Models"} group2 = {"Benchmarks", "Custom Datasets"} def keep_only_one(group): filtered = [item for item in selected if item in group] return filtered[-1:] final_sel = set(selected) final_sel -= group1 final_sel |= set(keep_only_one(group1)) final_sel -= group2 final_sel |= set(keep_only_one(group2)) final_list = [itm for itm in order if itm in final_sel] input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list) api_field_update = gr.update(visible="API Models" in final_sel) return input_update, api_field_update # ---------------- UI 构建 ---------------- with gr.Blocks(title="EvalScope 全功能界面") as demo: is_running = gr.State(value=False) # ── 顶栏:模式选择 ───────────────────────────── with gr.Group(): with gr.Row(): mode_dropdown = gr.Dropdown( label="评测类型", choices=["eval", "perf", "app"], value="perf", info="eval: 智力评测;perf: 性能评测;app: 可视化" ) # ── 输入源选择 ──────────────────────────────── with gr.Group(): with gr.Row(): input_choices = gr.CheckboxGroup( label="选择输入源", choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"], interactive=True ) # ── API 参数 ───────────────────────────────── with gr.Column(visible=False) as api_fields: api_url_input = gr.Textbox(label="API 地址", placeholder="https://.../v1/chat/completions") api_token_input = gr.Textbox(label="Token 密钥", type="password", placeholder="sk-xxx") with gr.Accordion("运行参数(可选修改)", open=False): with gr.Row(): api_provider_dropdown = gr.Dropdown( label="API Provider", choices=["openai", "azure", "ollama", "gemini"], value="openai" ) dataset_dropdown = gr.Dropdown( label="评测数据集 (--dataset)", choices=PERF_DATASETS, value=PERF_DATASETS[0] ) dataset_file_input = gr.File( label="Line‑by‑line 数据集文件(txt)", file_types=[".txt"], # 可改为 ["text/plain"] visible=False # 默认隐藏,选了 line_by_line 时再显示 ) model_override_input = gr.Textbox(label="自定义模型名 (--model)", placeholder="llm-name") extra_args_input = gr.Textbox(label="额外 EvalScope 参数", placeholder="例如: --disable-cache --temperature 0.7") with gr.Row(): max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024) min_tokens_slider = gr.Slider(label="Min Tokens", minimum=0, maximum=4096, step=64, value=1024) with gr.Row(): parallel_slider = gr.Slider(label="并发请求数", minimum=1, maximum=100, step=1, value=1) num_req_slider = gr.Slider(label="请求条数", minimum=1, maximum=1000, step=1, value=100) max_prompt_len_slider = gr.Slider( label="最大 Prompt 长度", minimum=2048, maximum=262144, step=512, value=15360 ) # ── 本地/外部模块勾选 ────────────────────────── with gr.Row(): with gr.Column(): native_choices = gr.CheckboxGroup( label="启用本地模块", choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"] ) with gr.Column(): other_choices = gr.CheckboxGroup( label="启用外部后端", choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"] ) # ── 输出开关 ───────────────────────────────── output_choices = gr.CheckboxGroup( label="输出形式", choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"] ) # ── Run & Stop 按钮 ───────────────────────── run_button = gr.Button("Run Evaluation", variant="primary") stop_button = gr.Button("Stop Evaluation", variant="stop", visible=False) # ── 输出区域 ───────────────────────────────── output_text = gr.TextArea( label="执行结果", lines=20, interactive=False, show_copy_button=True ) # ── 逻辑绑定 ───────────────────────────────── input_choices.change( fn=enforce_input_exclusive_and_toggle_fields, inputs=input_choices, outputs=[input_choices, api_fields] ) mode_dropdown.change( lambda mode: gr.update( choices=EVAL_DATASETS if mode == "eval" else PERF_DATASETS, value=EVAL_DATASETS[0] if mode == "eval" else PERF_DATASETS[0] ), inputs=mode_dropdown, outputs=dataset_dropdown ) dataset_dropdown.change( toggle_dataset_file_visibility, inputs=dataset_dropdown, outputs=dataset_file_input ) # ---- Run 按钮(queue=True)---- run_button.click( fn=toggle_run, inputs=[ input_choices, native_choices, other_choices, output_choices, api_url_input, api_token_input, api_provider_dropdown, dataset_dropdown, dataset_file_input, max_tokens_slider, min_tokens_slider, parallel_slider, max_prompt_len_slider, num_req_slider, model_override_input, extra_args_input, is_running, mode_dropdown ], outputs=[output_text, is_running, run_button, stop_button], show_progress=True, queue=True ) # ---- Stop 按钮(queue=False)---- def stop_action(): msg = stop_eval() return msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False) stop_button.click( fn=stop_action, inputs=None, outputs=[output_text, is_running, run_button, stop_button], queue=False ) # ---------------- 入口 ---------------- if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7900)