"""Gradio front-end for the EvalScope CLI: launches `evalscope perf` / `evalscope eval`
runs, streams their output, and optionally starts the `evalscope app` visualization."""

import glob
import os
import signal
import subprocess
import threading
import time

import gradio as gr

# ---------------- Global process handle ----------------
current_process = None
should_stop = False

# ---------------- Selectable datasets ----------------
EVAL_DATASETS = [
    "arc", "bbh", "ceval", "cmmlu", "competition_math", "gsm8k",
    "hellaswag", "humaneval", "mmlu", "mmlu_pro", "race",
    "trivia_qa", "truthful_qa"
]
PERF_DATASETS = [
    "openqa", "flickr8k", "longalpaca", "random_dataset",
    "line_by_line", "custom", "speed_benchmark"
]


# ---------------- perf mode ----------------
def run_perf(
    inputs, native, other, output_choices,
    api_url, api_token, api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs, max_prompt_len,
    num_requests, model_override
):
    global current_process
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp
    command = [
        "evalscope", "perf",
        "--url", api_url.strip(),
        "--api", api_provider,
        "--model", model_name,
        "--dataset", dataset,
        "--max-tokens", str(int(max_tokens)),
        "--min-tokens", str(int(min_tokens)),
        "--parallel", str(int(parallel_reqs)),
        "--max-prompt-length", str(int(max_prompt_len)),
        "--number", str(int(num_requests)),
        "--api-key", api_token.strip(),
    ]
    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(value="Stop Evaluation")

    try:
        current_process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
            start_new_session=True
        )
        # Keep a local handle: stop_eval() may reset the global while we are streaming output.
        proc = current_process
        for line in proc.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")
        proc.stdout.close()
        proc.wait()
    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")
    finally:
        current_process = None

    full_output += "[Eval Finished]\n"

    if "Evaluation Report" in output_choices:
        vis_port = 7901
        outputs_root = "./outputs"
        try:
            # Most recent run directory (informational; the viewer is pointed at the whole outputs root).
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:
            latest_output = outputs_root
        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        # Launch the visualization server in the background without blocking the UI.
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
            daemon=True
        ).start()
        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")


# ---------------- eval mode ----------------
def run_eval_tool(
    inputs, native, other, output_choices,
    api_url, api_token, api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs, max_prompt_len,
    num_requests, model_override
):
    global current_process
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp
    command = [
        "evalscope", "eval",
        "--model", model_name,
        "--datasets", dataset
    ]
    if api_url.strip():
        command += [
            "--eval-type", "service",
            "--api-url", api_url.strip(),
            "--api-key", api_token.strip()
        ]
    if num_requests:
        command += ["--limit", str(int(num_requests))]

    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(value="Stop Evaluation")

    try:
        current_process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
            start_new_session=True
        )
        # Keep a local handle: stop_eval() may reset the global while we are streaming output.
        proc = current_process
        for line in proc.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")
        proc.stdout.close()
        proc.wait()
    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")
    finally:
        current_process = None

    full_output += "[Eval Finished]\n"

    if "Evaluation Report" in output_choices:
        vis_port = 7901
        outputs_root = "./outputs"
        try:
            # Most recent run directory (informational; the viewer is pointed at the whole outputs root).
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:
            latest_output = outputs_root
        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
            daemon=True
        ).start()
        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")


# ---------------- Stop handler ----------------
def stop_eval():
    global current_process, should_stop
    should_stop = True
    if current_process and current_process.poll() is None:
        try:
            # Signal the whole process group (the child was started in a new session).
            pgid = os.getpgid(current_process.pid)
            os.killpg(pgid, signal.SIGINT)
            time.sleep(2)
            if current_process.poll() is None:
                os.killpg(pgid, signal.SIGKILL)
            return "[✅ Termination signal sent (SIGINT → SIGKILL fallback)]\n"
        except Exception as e:
            return f"[❌ Failed to terminate: {e}]\n"
        finally:
            current_process = None
    else:
        return "[⚠️ No active evalscope process]\n"


# ---------------- Controller ----------------
def toggle_run(
    inputs, native, other, output_choices,
    api_url, api_token, api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs, max_prompt_len,
    num_requests, model_override, is_running, run_mode
):
    global should_stop
    if not inputs:
        msg = "[❌ Error] Select at least one input source (API, local, benchmark, or custom) before running.\n"
        yield msg, False, gr.update(value="Run Evaluation")
        return

    if not is_running:
        should_stop = False
        if run_mode == "perf":
            yield from run_perf(
                inputs, native, other, output_choices,
                api_url, api_token, api_provider, dataset,
                max_tokens, min_tokens, parallel_reqs, max_prompt_len,
                num_requests, model_override
            )
        elif run_mode == "eval":
            yield from run_eval_tool(
                inputs, native, other, output_choices,
                api_url, api_token, api_provider, dataset,
                max_tokens, min_tokens, parallel_reqs, max_prompt_len,
                num_requests, model_override
            )
        elif run_mode == "app":
            yield "[⚠️ app mode: open http://localhost:7901 manually to view the report]", False, gr.update(value="Run Evaluation")
    else:
        msg = stop_eval()
        yield msg, False, gr.update(value="Run Evaluation")


# ---------------- Input-source exclusivity ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
    order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"]
    group1 = {"API Models", "Local Models"}
    group2 = {"Benchmarks", "Custom Datasets"}

    def keep_only_one(group):
        # Keep at most one item from a mutually exclusive group.
        filtered = [item for item in selected if item in group]
        return filtered[-1:]

    final_sel = set(selected)
    final_sel -= group1
    final_sel |= set(keep_only_one(group1))
    final_sel -= group2
    final_sel |= set(keep_only_one(group2))

    final_list = [itm for itm in order if itm in final_sel]
    input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list)
    api_field_update = gr.update(visible="API Models" in final_sel)
    return input_update, api_field_update


# ---------------- UI ----------------
with gr.Blocks(title="EvalScope All-in-One UI") as demo:
    is_running = gr.State(value=False)

    with gr.Group():
        with gr.Row():
            mode_dropdown = gr.Dropdown(
                label="Evaluation Mode",
                choices=["eval", "perf", "app"],
                value="perf",
                info="eval: capability evaluation; perf: performance benchmarking; app: visualization"
            )

    with gr.Group():
        with gr.Row():
            input_choices = gr.CheckboxGroup(
                label="Input Sources",
                choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
                interactive=True
            )
        with gr.Column(visible=False) as api_fields:
            api_url_input = gr.Textbox(label="API URL", placeholder="https://.../v1/chat/completions")
            api_token_input = gr.Textbox(label="API Token", type="password", placeholder="sk-xxx")

    with gr.Accordion("Run Parameters (optional)", open=False):
        with gr.Row():
            api_provider_dropdown = gr.Dropdown(label="API Provider", choices=["openai", "azure", "ollama", "gemini"], value="openai")
            dataset_dropdown = gr.Dropdown(label="Dataset (--dataset)", choices=PERF_DATASETS, value=PERF_DATASETS[0])
            model_override_input = gr.Textbox(label="Custom Model Name (--model)", placeholder="my-llm")
        with gr.Row():
            max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024)
            min_tokens_slider = gr.Slider(label="Min Tokens", minimum=0, maximum=4096, step=64, value=1024)
        with gr.Row():
            parallel_slider = gr.Slider(label="Concurrent Requests", minimum=1, maximum=16, step=1, value=1)
            num_req_slider = gr.Slider(label="Number of Requests", minimum=1, maximum=1000, step=1, value=100)
            max_prompt_len_slider = gr.Slider(label="Max Prompt Length", minimum=2048, maximum=32768, step=512, value=15360)

    with gr.Row():
        with gr.Column():
            native_choices = gr.CheckboxGroup(label="Native Modules", choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"])
        with gr.Column():
            other_choices = gr.CheckboxGroup(label="External Backends", choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"])

    output_choices = gr.CheckboxGroup(label="Output Formats", choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"])

    run_button = gr.Button("Run Evaluation")
    output_text = gr.TextArea(label="Execution Output", lines=20, interactive=False, show_copy_button=True)

    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields]
    )

    mode_dropdown.change(
        lambda mode: gr.update(
            choices=EVAL_DATASETS if mode == "eval" else PERF_DATASETS,
            value=EVAL_DATASETS[0] if mode == "eval" else PERF_DATASETS[0]
        ),
        inputs=mode_dropdown,
        outputs=dataset_dropdown
    )

    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices, native_choices, other_choices, output_choices,
            api_url_input, api_token_input, api_provider_dropdown, dataset_dropdown,
            max_tokens_slider, min_tokens_slider, parallel_slider, max_prompt_len_slider,
            num_req_slider, model_override_input, is_running, mode_dropdown
        ],
        outputs=[output_text, is_running, run_button],
        show_progress=True
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7900)