"""
evalscope_ui.py

End-to-end Gradio configuration panel for EvalScope.

NOTE(review): the original header read "EvalScope >= 5.35.0" while the UI
heading reads "Gradio@5.35.0"; the version pin most likely refers to Gradio --
confirm which package it is meant for.

Usage: python evalscope_ui.py
"""
import contextlib
import datetime
import json
import os
import subprocess
import sys
import tempfile

import gradio as gr
import yaml

# ---------- Constants ----------
DEFAULT_LIMIT = 5
DEFAULT_PARALLEL = 1
PORT = 7860  # keep in sync with the EXPOSE line of the Dockerfile
REPORT_DIR = "./reports"  # directory evaluation reports are written to
os.makedirs(REPORT_DIR, exist_ok=True)


# ---------- Helpers ----------
def _parse_json_field(value, field_name):
    """Return the parsed content of a JSON form field, or None when empty.

    Accepts either raw JSON text (what ``gr.Code`` / ``gr.Textbox`` deliver)
    or an already-parsed object (what ``gr.JSON`` delivers), so the callback
    works regardless of which component feeds it.

    Raises:
        ValueError: if *value* is a string that is not valid JSON.
    """
    if value is None:
        return None
    if isinstance(value, (str, bytes, bytearray)):
        if not value.strip():
            return None
        try:
            return json.loads(value)
        except json.JSONDecodeError as err:
            raise ValueError(f"{field_name} 不是合法的 JSON: {err}") from err
    # Already a dict / list: nothing to decode.
    return value


def _optional_int(value):
    """Convert a ``gr.Number`` value to int; a cleared field (None) stays None."""
    return None if value is None else int(value)


def _build_task_config(
    model, api_mode, api_url, api_key, local_device_map,
    datasets, limit, gen_cfg_json, ds_cfg_json, backend,
    stress_parallel, stress_number, stress_stream, extra_yaml_json,
):
    """Assemble the task-config dictionary from the raw form values."""
    task_cfg = {
        "model": (model or "").strip(),
        # str.split() with no argument already drops surrounding whitespace.
        "datasets": datasets.split() if datasets else [],
        # 0 / empty means "no limit", as in the original form handling.
        "limit": _optional_int(limit) or None,
        "backend": backend,
    }

    # ---------- Model / API details ----------
    if api_mode != "local":
        task_cfg["api"] = api_mode
        if api_url:
            task_cfg["url"] = api_url
        if api_key:
            task_cfg["api_key"] = api_key
    else:
        task_cfg["model_args"] = {"device_map": local_device_map or "auto"}

    # ---------- Advanced generation / dataset JSON ----------
    gen_cfg = _parse_json_field(gen_cfg_json, "generation_config")
    if gen_cfg:
        task_cfg["generation_config"] = gen_cfg
    ds_cfg = _parse_json_field(ds_cfg_json, "dataset_args")
    if ds_cfg:
        task_cfg["dataset_args"] = ds_cfg

    # ---------- Stress test ----------
    task_cfg["stress_test"] = {
        "parallel": _optional_int(stress_parallel),
        "number": _optional_int(stress_number),
        "stream": stress_stream,
    }

    # ---------- Extra fields merged verbatim (they override the form) ----------
    extra = _parse_json_field(extra_yaml_json, "extra TaskConfig")
    if extra:
        if not isinstance(extra, dict):
            raise ValueError("额外 TaskConfig 字段必须是 JSON 对象")
        task_cfg.update(extra)

    return task_cfg


def _build_command(yaml_path, report_path, save_wandb, save_swanlab, save_gradio):
    """Build the argv list used to launch EvalScope.

    ``sys.executable`` is used so the child runs in the same interpreter /
    virtualenv as this UI rather than whatever ``python`` is first on PATH.

    NOTE(review): the CLI flags below are carried over unchanged from the
    original script; verify them against the installed EvalScope version.
    """
    cmd = [
        sys.executable, "-m", "evalscope.run",
        "--task-cfg", yaml_path,
        "--report-path", report_path,
    ]
    # Optional third-party visualisation targets.
    if save_wandb:
        cmd += ["--wandb"]
    if save_swanlab:
        cmd += ["--swanlab"]
    if save_gradio:
        cmd += ["--gradio"]
    return cmd


# ---------- Core callback ----------
def run_eval(
    model, api_mode, api_url, api_key, local_device_map,   # model / API
    datasets, limit, gen_cfg_json, ds_cfg_json,            # datasets & params
    backend, save_wandb, save_swanlab, save_gradio,        # visualisation
    stress_parallel, stress_number, stress_stream,         # stress test
    extra_yaml_json                                        # advanced overrides
):
    """Run one EvalScope evaluation from the form values.

    Steps:
      * organise the form parameters into a task-config dictionary;
      * write it to a private temporary YAML file;
      * invoke ``evalscope.run`` in a subprocess and capture its output.

    The JSON fields (``gen_cfg_json``, ``ds_cfg_json``, ``extra_yaml_json``)
    may be JSON text or already-parsed objects.

    Returns:
        A human-readable status string: the report path plus the child's
        stdout on success, or an error description on failure. Exceptions are
        never propagated to the UI.
    """
    yaml_path = None
    try:
        task_cfg = _build_task_config(
            model, api_mode, api_url, api_key, local_device_map,
            datasets, limit, gen_cfg_json, ds_cfg_json, backend,
            stress_parallel, stress_number, stress_stream, extra_yaml_json,
        )

        # The config may contain the API key, so write it to an unpredictable,
        # owner-only temp file instead of a guessable name under /tmp.
        with tempfile.NamedTemporaryFile(
            "w", prefix="task_", suffix=".yaml", delete=False, encoding="utf-8"
        ) as f:
            yaml_path = f.name
            yaml.safe_dump(task_cfg, f, allow_unicode=True)

        # Output location of the report.
        ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        report_path = os.path.join(REPORT_DIR, f"report_{ts}.json")

        cmd = _build_command(
            yaml_path, report_path, save_wandb, save_swanlab, save_gradio
        )
        completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return f"✅ 评测完成!报告位于: {report_path}\n\n{completed.stdout}"

    except subprocess.CalledProcessError as e:
        return f"❌ EvalScope 执行失败\nSTDERR:\n{e.stderr}"
    except Exception as eg:
        # UI boundary: report the problem in the output box instead of crashing.
        return f"❌ 脚本内部异常: {eg}"
    finally:
        # Always remove the temp config so credentials do not linger on disk.
        if yaml_path is not None:
            with contextlib.suppress(OSError):
                os.remove(yaml_path)


# ---------- Gradio UI ----------
with gr.Blocks(title="EvalScope 全量配置面板") as demo:
    gr.Markdown("## EvalScope 评测配置界面(Gradio@5.35.0)")

    with gr.Tab("模型与 API"):
        model = gr.Textbox(label="模型 ID / 本地路径")
        api_mode = gr.Radio(
            ["openai", "dashscope", "local", "local_vllm"],
            value="local",
            label="API / 模式",
        )
        api_url = gr.Textbox(
            label="API URL(remote 模式必填)",
            placeholder="http://host:port/chat/completion",
        )
        api_key = gr.Textbox(label="API Key(可选)", type="password")
        local_device = gr.Textbox(label="device_map(local 模式)", value="auto")

    with gr.Tab("数据集与参数"):
        datasets = gr.Textbox(
            label="Datasets(空格分隔)", placeholder="gsm8k arc mmlu"
        )
        limit = gr.Number(label="limit", value=DEFAULT_LIMIT, precision=0)
        # gr.JSON is a display-only component; gr.Code gives the user an
        # editable JSON text area and passes its content to the callback as str.
        gen_cfg_json = gr.Code(
            label="generation_config(JSON)", language="json", value="{}"
        )
        ds_cfg_json = gr.Code(
            label="dataset_args(JSON)", language="json", value="{}"
        )

    with gr.Tab("后端与可视化"):
        backend = gr.Dropdown(
            ["native", "opencompass", "vlmevalkit", "ragas", "mteb"],
            value="native",
            label="Evaluation Backend",
        )
        save_wandb = gr.Checkbox(label="推送 WandB", value=False)
        save_swanlab = gr.Checkbox(label="推送 SwanLab", value=False)
        save_gradio = gr.Checkbox(label="生成本地 Gradio 报告", value=True)

    with gr.Tab("性能压测(可选)"):
        stress_parallel = gr.Number(
            label="并发 parallel", value=DEFAULT_PARALLEL, precision=0
        )
        stress_number = gr.Number(label="请求数 number", value=1000, precision=0)
        stress_stream = gr.Checkbox(label="开启 stream", value=True)

    with gr.Tab("高级配置 YAML/JSON 合并"):
        extra_yaml_json = gr.Code(
            label="额外 TaskConfig 字段", language="json", value="{}"
        )

    run_btn = gr.Button("🚀 运行 EvalScope")
    output = gr.Textbox(label="控制台输出 / 错误信息", lines=15)

    # The input order must match the positional parameters of run_eval.
    run_btn.click(
        run_eval,
        inputs=[
            model, api_mode, api_url, api_key, local_device,
            datasets, limit, gen_cfg_json, ds_cfg_json,
            backend, save_wandb, save_swanlab, save_gradio,
            stress_parallel, stress_number, stress_stream,
            extra_yaml_json,
        ],
        outputs=output,
    )

# Launch: bind to all interfaces so the panel is reachable from outside a container.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=PORT)