gradio-5.35.0/evalscope_ui.py

142 lines
5.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
evalscope_ui.py
End-to-end Gradio configuration panel (targeting EvalScope ≥5.35.0)
Usage: python evalscope_ui.py
"""
import json, subprocess, tempfile, os, datetime
import gradio as gr
# ---------- Constants ----------
# Initial value shown in the "limit" number field of the UI.
DEFAULT_LIMIT = 5
# Initial value shown in the stress-test "parallel" number field of the UI.
DEFAULT_PARALLEL = 1
PORT = 7860 # keep in sync with EXPOSE in the Dockerfile
REPORT_DIR = "./reports" # output directory for evaluation reports
# Runs at import time so the report directory exists before any evaluation starts.
os.makedirs(REPORT_DIR, exist_ok=True)
# ---------- 核心回调 ----------
def _coerce_json(value, field_name):
    """Normalise a JSON form value to a parsed Python object.

    Args:
        value: Either a JSON string (textbox-style input) or an already
            parsed object such as a dict/list (what ``gr.JSON`` hands to a
            callback).
        field_name: Name used in the error message when parsing fails.

    Returns:
        The parsed object, or ``None`` when the input is ``None``, a blank
        string, or an empty container, so the caller can skip the field.

    Raises:
        ValueError: If ``value`` is a string that is not valid JSON.
    """
    if value is None:
        return None
    if isinstance(value, (str, bytes, bytearray)):
        if not value.strip():
            return None
        try:
            return json.loads(value)
        except json.JSONDecodeError as err:
            raise ValueError(f"{field_name} 不是合法的 JSON: {err}") from err
    # Already parsed: json.loads() would raise TypeError on a dict or list.
    return value or None


def run_eval(
    model, api_mode, api_url, api_key, local_device_map,  # model / API
    datasets, limit, gen_cfg_json, ds_cfg_json,  # datasets & parameters
    backend, save_wandb, save_swanlab, save_gradio,  # visualisation outputs
    stress_parallel, stress_number, stress_stream,  # stress test
    extra_yaml_json  # advanced overrides
):
    """Assemble a task config from the form values and run EvalScope on it.

    The form values are collected into a dict, dumped to a temporary YAML
    file, and passed to ``python -m evalscope.run`` in a child process.

    Args:
        model: Model ID or local path; surrounding whitespace is stripped.
        api_mode: ``"local"`` selects ``model_args``; any other value is
            stored under ``"api"`` together with ``api_url`` / ``api_key``.
        api_url: Endpoint URL, only used when ``api_mode`` is not "local".
        api_key: API key, only used when ``api_mode`` is not "local".
        local_device_map: ``device_map`` for local mode; defaults to "auto".
        datasets: Whitespace-separated dataset names.
        limit: Per-dataset sample limit; a falsy value is stored as ``None``.
        gen_cfg_json: ``generation_config`` as a JSON string or parsed object.
        ds_cfg_json: ``dataset_args`` as a JSON string or parsed object.
        backend: Evaluation backend name.
        save_wandb: Append ``--wandb`` to the command line when true.
        save_swanlab: Append ``--swanlab`` to the command line when true.
        save_gradio: Append ``--gradio`` to the command line when true.
        stress_parallel: ``stress_test.parallel`` value.
        stress_number: ``stress_test.number`` value.
        stress_stream: ``stress_test.stream`` value.
        extra_yaml_json: Extra top-level config fields (JSON string or dict),
            merged last so they override everything built above.

    Returns:
        A human-readable status string: the report path plus the child's
        stdout on success, or an error description on failure. Exceptions
        are never propagated to the caller.
    """
    import sys  # local import: only needed to locate the running interpreter

    yaml_path = None
    try:
        task_cfg = {
            "model": (model or "").strip(),
            # str.split() with no argument already discards surrounding whitespace.
            "datasets": datasets.split() if datasets else [],
            "limit": limit or None,
            "backend": backend,
        }
        # ---------- Model / API details ----------
        if api_mode != "local":
            task_cfg["api"] = api_mode
            if api_url:
                task_cfg["url"] = api_url
            if api_key:
                task_cfg["api_key"] = api_key
        else:
            task_cfg["model_args"] = {"device_map": local_device_map or "auto"}
        # ---------- Advanced generation / dataset JSON ----------
        gen_cfg = _coerce_json(gen_cfg_json, "generation_config")
        if gen_cfg is not None:
            task_cfg["generation_config"] = gen_cfg
        ds_cfg = _coerce_json(ds_cfg_json, "dataset_args")
        if ds_cfg is not None:
            task_cfg["dataset_args"] = ds_cfg
        # ---------- Stress test ----------
        task_cfg["stress_test"] = {
            "parallel": stress_parallel,
            "number": stress_number,
            "stream": stress_stream,
        }
        # ---------- Extra fields, merged last so they can override the above ----------
        extra_cfg = _coerce_json(extra_yaml_json, "额外 TaskConfig 字段")
        if extra_cfg is not None:
            if not isinstance(extra_cfg, dict):
                raise ValueError(
                    f"额外 TaskConfig 字段必须是 JSON 对象(dict),实际为 {type(extra_cfg).__name__}"
                )
            task_cfg.update(extra_cfg)
        # ---------- Write a temporary YAML file and run ----------
        import yaml  # PyYAML, imported lazily as before; a missing package is reported below

        ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        # The config may hold an API key, so use a private, uniquely named
        # temp file instead of a predictable world-readable path in /tmp.
        with tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8",
            prefix=f"task_{ts}_", suffix=".yaml", delete=False,
        ) as tmp:
            yaml_path = tmp.name
            yaml.safe_dump(task_cfg, tmp, allow_unicode=True)
        # Re-create the report directory in case it was removed after start-up.
        os.makedirs(REPORT_DIR, exist_ok=True)
        report_path = os.path.join(REPORT_DIR, f"report_{ts}.json")
        # NOTE(review): CLI flag names are passed through unchanged from the
        # previous version; confirm them against the installed EvalScope.
        cmd = [
            # Run under the same interpreter/venv that serves this UI.
            sys.executable or "python", "-m", "evalscope.run",
            "--task-cfg", yaml_path,
            "--report-path", report_path,
        ]
        # Optional third-party visualisation targets
        if save_wandb:
            cmd.append("--wandb")
        if save_swanlab:
            cmd.append("--swanlab")
        if save_gradio:
            cmd.append("--gradio")
        completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return f"✅ 评测完成!报告位于: {report_path}\n\n{completed.stdout}"
    except subprocess.CalledProcessError as e:
        return f"❌ EvalScope 执行失败\nSTDERR:\n{e.stderr}"
    except Exception as eg:
        # UI boundary: report the failure in the output box instead of raising.
        return f"❌ 脚本内部异常: {eg}"
    finally:
        if yaml_path is not None:
            try:
                os.remove(yaml_path)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is not fatal
# ---------- Gradio UI ----------
# Layout: one tab per configuration group, then a run button and an output
# box shared by all tabs. Every input is passed positionally to run_eval, so
# the order of the `inputs` list below must match run_eval's parameter order.
with gr.Blocks(title="EvalScope 全量配置面板") as demo:
    gr.Markdown("## EvalScope 评测配置界面(Gradio@5.35.0)")
    with gr.Tab("模型与 API"):
        model = gr.Textbox(label="模型 ID / 本地路径")
        api_mode = gr.Radio(
            ["openai", "dashscope", "local", "local_vllm"],
            value="local",
            label="API / 模式",
        )
        api_url = gr.Textbox(
            label="API URL(remote 模式必填)",
            placeholder="http://host:port/chat/completion",
        )
        api_key = gr.Textbox(label="API Key(可选)", type="password")
        local_device = gr.Textbox(label="device_map(local 模式)", value="auto")
    with gr.Tab("数据集与参数"):
        datasets = gr.Textbox(label="Datasets(空格分隔)", placeholder="gsm8k arc mmlu")
        limit = gr.Number(label="limit", value=DEFAULT_LIMIT, precision=0)
        # gr.JSON only displays JSON and cannot be typed into, so the JSON
        # fields are plain multi-line textboxes; run_eval parses the strings
        # and skips a field that is left empty.
        gen_cfg_json = gr.Textbox(
            label="generation_config(JSON)",
            value="",
            lines=4,
            placeholder='{"temperature": 0.0}',
        )
        ds_cfg_json = gr.Textbox(
            label="dataset_args(JSON)",
            value="",
            lines=4,
            placeholder="{}",
        )
    with gr.Tab("后端与可视化"):
        backend = gr.Dropdown(
            ["native", "opencompass", "vlmevalkit", "ragas", "mteb"],
            value="native",
            label="Evaluation Backend",
        )
        save_wandb = gr.Checkbox(label="推送 WandB", value=False)
        save_swanlab = gr.Checkbox(label="推送 SwanLab", value=False)
        save_gradio = gr.Checkbox(label="生成本地 Gradio 报告", value=True)
    with gr.Tab("性能压测(可选)"):
        stress_parallel = gr.Number(label="并发 parallel", value=DEFAULT_PARALLEL, precision=0)
        stress_number = gr.Number(label="请求数 number", value=1000, precision=0)
        stress_stream = gr.Checkbox(label="开启 stream", value=True)
    with gr.Tab("高级配置 YAML/JSON 合并"):
        extra_yaml_json = gr.Textbox(
            label="额外 TaskConfig 字段",
            value="",
            lines=6,
            placeholder="{}",
        )
    run_btn = gr.Button("🚀 运行 EvalScope")
    output = gr.Textbox(label="控制台输出 / 错误信息", lines=15)
    run_btn.click(
        run_eval,
        inputs=[
            model, api_mode, api_url, api_key, local_device,
            datasets, limit, gen_cfg_json, ds_cfg_json,
            backend, save_wandb, save_swanlab, save_gradio,
            stress_parallel, stress_number, stress_stream,
            extra_yaml_json,
        ],
        outputs=output,
    )
# Entry point: start the Gradio server when the file is run as a script.
if __name__ == "__main__":
    # 0.0.0.0 binds every network interface; the port comes from PORT.
    launch_options = {"server_name": "0.0.0.0", "server_port": PORT}
    demo.launch(**launch_options)