import glob
import os
import subprocess
import threading
import time

import gradio as gr

# ---------------- Global process handle ----------------
current_process = None


# ---------------- Core run function ----------------
def run_eval(
    inputs, native, other, output_choices,
    api_url, api_token, api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests, model_override
):
    """
    1. Assemble the `evalscope perf` command dynamically.
    2. Stream the log output.
    3. (Optionally) launch the visualization report.

    Note: `inputs`, `native`, and `other` are accepted for signature
    parity with the UI but are not used by the perf command yet.
    """
    global current_process

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp

    command = [
        "evalscope", "perf",
        "--url", api_url.strip(),
        "--api", api_provider,
        "--model", model_name,
        "--dataset", dataset,
        "--max-tokens", str(int(max_tokens)),
        "--min-tokens", str(int(min_tokens)),
        "--parallel", str(int(parallel_reqs)),
        "--max-prompt-length", str(int(max_prompt_len)),
        "--number", str(int(num_requests)),
        "--api-key", api_token.strip(),
    ]

    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(value="Stop Evaluation")

    try:
        current_process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1  # line-buffered so logs stream as they arrive
        )
        for line in current_process.stdout:
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")
        current_process.stdout.close()
        current_process.wait()
    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")
        return  # skip the visualization step on failure
    finally:
        current_process = None

    full_output += "[Eval Finished]\n"

    # ---------- Visualization report ----------
    if "Evaluation Report" in output_choices:
        vis_port = 7861
        outputs_root = "./outputs"
        try:
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:  # no output directories yet
            latest_output = outputs_root
        full_output += f"[Latest output dir] {latest_output}\n"

        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        # Popen returns immediately; the thread just keeps the launch off
        # the event handler's path.
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
            daemon=True
        ).start()
        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")


# ---------------- Stop function ----------------
def stop_eval():
    global current_process
    if current_process and current_process.poll() is None:
        current_process.terminate()
        current_process = None
        return "[Stopped by user]\n"
    return "[No active process]\n"


# ---------------- Run/Stop controller ----------------
def toggle_run(
    inputs, native, other, output_choices,
    api_url, api_token, api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests, model_override,
    is_running
):
    if not is_running:
        yield from run_eval(
            inputs, native, other, output_choices,
            api_url, api_token, api_provider, dataset,
            max_tokens, min_tokens, parallel_reqs,
            max_prompt_len, num_requests, model_override
        )
    else:
        msg = stop_eval()
        yield msg, False, gr.update(value="Run Evaluation")


# ---------------- Mutual-exclusion logic ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
    group1 = {"API Models", "Local Models"}
    group2 = {"Benchmarks", "Custom Datasets"}

    def keep_only_one(group):
        filtered = [item for item in selected if item in group]
        return filtered[-1:]  # keep only the last-listed choice in the group

    # Rebuild the selection as a list (not a set) so the checkbox order
    # stays deterministic across calls.
    final_selection = (
        [item for item in selected if item not in group1 | group2]
        + keep_only_one(group1)
        + keep_only_one(group2)
    )

    show_api_fields = "API Models" in final_selection
    return (
        gr.update(value=final_selection),
        gr.update(visible=show_api_fields)  # gr.Row.update() was removed in Gradio 4.x
    )
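
# For reference, a representative command that run_eval assembles, assuming
# the UI defaults (sliders at their initial values, "openai"/"openqa"
# selected) and the placeholder URL/token from the textboxes below:
#
#   evalscope perf --url https://api.example.com/v1/chat --api openai \
#       --model 20240101-120000 --dataset openqa --max-tokens 1024 \
#       --min-tokens 1024 --parallel 1 --max-prompt-length 15360 \
#       --number 100 --api-key sk-xxx
#
# The model name falls back to the run timestamp when no override is given.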
# ---------------- Build the Gradio UI ----------------
with gr.Blocks(title="EvalScope Full-Featured Interface") as demo:
    is_running = gr.State(value=False)

    # ===== Input sources =====
    with gr.Group():
        with gr.Row():
            input_choices = gr.CheckboxGroup(
                label="Select input sources",
                choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
                interactive=True
            )

    # ===== API URL & token =====
    with gr.Row(visible=False) as api_fields:
        api_url_input = gr.Textbox(
            label="API URL",
            placeholder="https://api.example.com/v1/chat"
        )
        api_token_input = gr.Textbox(
            label="API token",
            type="password",
            placeholder="sk-xxx"
        )

    # ===== Local / external components =====
    with gr.Row():
        with gr.Column():
            native_choices = gr.CheckboxGroup(
                label="Enable local modules",
                choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"]
            )
        with gr.Column():
            other_choices = gr.CheckboxGroup(
                label="Enable external backends",
                choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"]
            )

    # ===== Run parameters =====
    with gr.Accordion("Run parameters (optional)", open=False):
        with gr.Row():
            api_provider_dropdown = gr.Dropdown(
                label="API Provider (--api)",
                choices=["openai", "azure", "ollama", "gemini"],
                value="openai"
            )
            dataset_dropdown = gr.Dropdown(
                label="Evaluation dataset (--dataset)",
                choices=["openqa", "gsm8k", "mmlu", "truthfulqa"],
                value="openqa"
            )
            model_override_input = gr.Textbox(
                label="Custom model name (--model); leave empty to use a timestamp",
                placeholder="e.g. my-llm-7b"
            )
        with gr.Row():
            max_tokens_slider = gr.Slider(
                label="Max Tokens (--max-tokens)",
                minimum=256, maximum=8192, step=256, value=1024
            )
            min_tokens_slider = gr.Slider(
                label="Min Tokens (--min-tokens)",
                minimum=0, maximum=4096, step=64, value=1024
            )
        with gr.Row():
            parallel_slider = gr.Slider(
                label="Concurrent requests (--parallel)",
                minimum=1, maximum=16, step=1, value=1
            )
            num_req_slider = gr.Slider(
                label="Number of requests (--number)",
                minimum=1, maximum=1000, step=1, value=100
            )
            max_prompt_len_slider = gr.Slider(
                label="Max prompt length (--max-prompt-length)",
                minimum=2048, maximum=32768, step=512, value=15360
            )

    # ===== Output formats =====
    output_choices = gr.CheckboxGroup(
        label="Output formats",
        choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
    )

    # ===== Control button & log =====
    run_button = gr.Button("Run Evaluation")
    output_text = gr.TextArea(
        label="Execution log",
        lines=20,
        interactive=False,
        show_copy_button=True
    )

    # ===== Wire up events =====
    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields]
    )

    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices, native_choices, other_choices, output_choices,
            api_url_input, api_token_input, api_provider_dropdown,
            dataset_dropdown, max_tokens_slider, min_tokens_slider,
            parallel_slider, max_prompt_len_slider, num_req_slider,
            model_override_input, is_running
        ],
        outputs=[output_text, is_running, run_button],
        show_progress=True
    )

if __name__ == "__main__":
    # queue() is required for streaming generator outputs on Gradio 3.x
    # (it is enabled by default on 4.x) and is harmless in both.
    demo.queue().launch(server_name="0.0.0.0", server_port=7900)
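
# ---------------- Usage ----------------
# A minimal sketch of how to run this script (the filename is an assumption):
#   python evalscope_ui.py
# The UI is then served at http://localhost:7900. If "Evaluation Report" is
# checked, the visualization app is started separately on
# http://localhost:7861 once an evaluation run finishes.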