import glob
import os
import subprocess
import time

import gradio as gr

# Global handle to the currently running evaluation subprocess (None when idle).
current_process = None


# ---------- Run EvalScope and (optionally) start the visualization server ----------
def run_eval(inputs, native, other, output_choices, api_url, api_token):
    """Run an EvalScope perf benchmark, streaming its output to the UI.

    1. Invokes ``evalscope perf ...`` as a subprocess and yields incremental
       output so the Gradio TextArea updates live.
    2. If the user ticked "Evaluation Report", starts the ``evalscope app``
       web visualization server in the background after the run finishes and
       appends the access URL to the output.

    Args:
        inputs, native, other: CheckboxGroup selections (currently unused by
            the command line; reserved for future options).
        output_choices: list of selected output forms; checked for
            "Evaluation Report".
        api_url: endpoint URL for the OpenAI-compatible API.
        api_token: API key for the endpoint.

    Yields:
        ``(output_text, is_running, button_update)`` tuples consumed by the
        Gradio ``click`` handler.
    """
    global current_process

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    command = [
        "evalscope", "perf",
        "--url", api_url.strip(),
        "--api", "openai",
        # Use the timestamp as the model name to avoid collisions.
        "--model", timestamp,
        "--dataset", "openqa",
        "--max-tokens", "1024",
        "--min-tokens", "1024",
        "--parallel", "1",
        "--max-prompt-length", "15360",
        "--number", "100",
        "--api-key", api_token.strip(),
    ]

    full_output = f"[Eval Started @ {timestamp}]\n"
    yield full_output, True, gr.update(value="Stop Evaluation")

    try:
        current_process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # line-buffered so output streams promptly
        )
        # Stream stdout line by line into the UI.
        for line in current_process.stdout:
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")
        current_process.stdout.close()
        current_process.wait()
    except Exception as e:
        # Bug fix: on failure, report the error and stop — previously the
        # code fell through, appended "[Eval Finished]" and could still
        # launch the visualization server.
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")
        return
    finally:
        current_process = None

    full_output += "[Eval Finished]\n"

    # ========== Visualization report ==========
    if "Evaluation Report" in output_choices:
        vis_port = 7861
        outputs_root = "./outputs"

        # EvalScope perf writes a timestamped directory under outputs_root.
        # The newest directory is located here for future use (the UI
        # currently only needs the root directory).
        try:
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime,
            )
        except ValueError:
            latest_output = outputs_root  # fallback: outputs/ may not exist yet

        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        # Popen is non-blocking, so no extra thread is needed to keep the UI
        # responsive (the original wrapped this call in a throwaway thread).
        subprocess.Popen(
            vis_cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        )
        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")


# ---------- Stop-button logic ----------
def stop_eval():
    """Terminate the running evaluation subprocess, if any.

    Returns:
        A status line to append to the output box.
    """
    global current_process
    if current_process and current_process.poll() is None:
        current_process.terminate()
        current_process = None
        return "[Stopped by user]\n"
    return "[No active process]\n"


# ---------- Run/Stop controller (must be a generator) ----------
def toggle_run(inputs, native, other, output_choices, api_url, api_token, is_running):
    """Dispatch the Run/Stop button: start an eval run or stop the active one.

    Must be a generator so Gradio can stream the incremental updates that
    ``run_eval`` yields.
    """
    if not is_running:
        # Start a run and forward its streamed updates.
        yield from run_eval(inputs, native, other, output_choices, api_url, api_token)
    else:
        # The user clicked Stop.
        msg = stop_eval()
        yield msg, False, gr.update(value="Run Evaluation")


# ---------- Mutual exclusion: keep only the last-picked option per group ----------
def enforce_input_exclusive_and_toggle_fields(selected):
    """Enforce one-of-a-group selection and toggle the API credential fields.

    Within each group (API/Local models, Benchmarks/Custom datasets) only the
    most recently selected option is kept. The API URL/token row is shown only
    when "API Models" remains selected.

    Returns:
        (CheckboxGroup update with the filtered selection,
         Row visibility update for the API fields).
    """
    group1 = {"API Models", "Local Models"}
    group2 = {"Benchmarks", "Custom Datasets"}

    def keep_only_one(group):
        # Keep at most the last item of `selected` belonging to `group`.
        filtered = [item for item in selected if item in group]
        return filtered[-1:]

    final_selection = set(selected)
    final_selection -= group1
    final_selection |= set(keep_only_one(group1))
    final_selection -= group2
    final_selection |= set(keep_only_one(group2))

    show_api_fields = "API Models" in final_selection
    return (
        gr.update(value=list(final_selection)),
        # Fix: gr.Row.update(...) is deprecated; gr.update works on any
        # component/layout in both Gradio 3.x and 4.x.
        gr.update(visible=show_api_fields),
    )


# ------------- Build the Gradio UI -------------
with gr.Blocks(title="EvalScope 全功能界面") as demo:
    is_running = gr.State(value=False)

    with gr.Group():
        with gr.Row():
            input_choices = gr.CheckboxGroup(
                label="选择输入源",
                choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
                interactive=True,
            )
        with gr.Row(visible=False) as api_fields:
            api_url_input = gr.Textbox(
                label="API 地址",
                placeholder="https://api.example.com/v1/chat",
            )
            api_token_input = gr.Textbox(
                label="Token 密钥",
                type="password",
                placeholder="sk-xxx",
            )
        with gr.Row():
            with gr.Column():
                native_choices = gr.CheckboxGroup(
                    label="启用本地模块",
                    choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"],
                )
            with gr.Column():
                other_choices = gr.CheckboxGroup(
                    label="启用外部后端",
                    choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"],
                )
        with gr.Row():
            output_choices = gr.CheckboxGroup(
                label="输出形式",
                choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"],
            )

    run_button = gr.Button("Run Evaluation")
    output_text = gr.TextArea(
        label="执行结果",
        lines=20,
        interactive=False,
        show_copy_button=True,
    )

    # Wire up input mutual exclusion.
    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields],
    )

    # Wire up Run/Stop.
    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices, native_choices, other_choices,
            output_choices, api_url_input, api_token_input, is_running,
        ],
        outputs=[output_text, is_running, run_button],
        show_progress=True,
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7900)