evalscope_v0.17.0/gradio_ui.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Gradio UI+EvalScope 调度脚本(量产版)
关键改动
1. 独立 StopEvaluation 按钮queue=False立即触发 `stop_eval()`
2. `stop_eval()` 使用 psutil 递归杀进程树并 wait(),杜绝僵尸
3. 所有生成器统一返回 4 个输出output_text ‖ is_running ‖ run_button 更新 ‖ stop_button 更新
"""
import time
import os
import glob
import threading
import subprocess
import gradio as gr
import psutil
import signal
import shlex
import pathlib
# ---------------- Global process handle ----------------
current_process = None
should_stop = False
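# current_process holds the running evalscope subprocess (None when idle);
# should_stop is a cooperative flag polled by the stdout-streaming loops below.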
# ---------------- Selectable datasets ----------------
EVAL_DATASETS = [
"arc", "bbh", "ceval", "cmmlu", "competition_math", "gsm8k",
"hellaswag", "humaneval", "mmlu", "mmlu_pro", "race",
"trivia_qa", "truthful_qa"
]
PERF_DATASETS = [
"openqa", "flickr8k", "longalpaca",
"line_by_line", "speed_benchmark"
]
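# NOTE: these identifiers are assumed to match the dataset names accepted by the
# EvalScope v0.17.0 CLI; consult `evalscope eval --help` and
# `evalscope perf --help` on your installation for the authoritative list.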
def toggle_dataset_file_visibility(ds):
return gr.update(visible=(ds == "line_by_line"))
# ---------------- perf mode runner ----------------
def run_perf(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
dataset_path,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override,
extra_args
):
global current_process
if dataset == "line_by_line" and dataset_path is None:
msg = "[❌] 请选择 line_by_line 数据集文件 (.txt)"
yield msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
return
timestamp = time.strftime("%Y%m%d-%H%M%S")
model_name = model_override.strip() or timestamp
command = [
"evalscope", "perf",
"--url", api_url.strip(),
"--api", api_provider,
"--model", model_name,
"--dataset", dataset,
"--max-tokens", str(int(max_tokens)),
"--min-tokens", str(int(min_tokens)),
"--parallel", str(int(parallel_reqs)),
"--max-prompt-length", str(int(max_prompt_len)),
"--number", str(int(num_requests)),
"--api-key", api_token.strip(),
]
if dataset == "line_by_line" and dataset_path:
command += ["--dataset-path", dataset_path]
if extra_args.strip():
command += shlex.split(extra_args.strip())
full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
try:
current_process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
            start_new_session=True,  # own session/process group, so killpg / psutil can clean up the whole tree later
)
for line in current_process.stdout:
if should_stop:
break
full_output += line
yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
current_process.stdout.close()
current_process.wait()
except Exception as e:
full_output += f"[Error] {e}\n"
yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
finally:
current_process = None
if dataset_path:
pathlib.Path(dataset_path).unlink(missing_ok=True)
full_output += "[Eval Finished]\n"
    # Auto-launch the visualization report viewer
if "Evaluation Report" in output_choices:
vis_port = 7901
outputs_root = "./outputs"
        try:
            # newest run directory; surfaced in the status line below
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:  # no runs yet
            latest_output = outputs_root
vis_cmd = [
"evalscope", "app",
"--outputs", outputs_root,
"--server-name", "0.0.0.0",
"--server-port", str(vis_port),
]
threading.Thread(
target=subprocess.Popen,
args=(vis_cmd,),
kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
daemon=True
).start()
full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
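# For reference, run_perf() assembles a command of this shape (values shown are
# the UI defaults and are illustrative only):
#   evalscope perf --url https://host/v1/chat/completions --api openai \
#       --model my-model --dataset openqa --max-tokens 1024 --min-tokens 1024 \
#       --parallel 1 --max-prompt-length 15360 --number 100 --api-key sk-xxx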
# ---------------- eval mode runner ----------------
def run_eval_tool(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
dataset_path,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override, extra_args
):
global current_process
if dataset == "line_by_line" and dataset_path is None:
msg = "[❌] 请选择 line_by_line 数据集文件 (.txt)"
yield msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
return
timestamp = time.strftime("%Y%m%d-%H%M%S")
model_name = model_override.strip() or timestamp
command = [
"evalscope", "eval",
"--model", model_name,
"--datasets", dataset
]
if api_url.strip():
command += [
"--eval-type", "service",
"--api-url", api_url.strip(),
"--api-key", api_token.strip()
]
if num_requests:
command += ["--limit", str(int(num_requests))]
if extra_args.strip():
command += shlex.split(extra_args.strip())
if dataset == "line_by_line" and dataset_path:
command += ["--dataset-path", dataset_path]
full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
try:
current_process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
start_new_session=True
)
for line in current_process.stdout:
if should_stop:
break
full_output += line
yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
current_process.stdout.close()
current_process.wait()
except Exception as e:
full_output += f"[Error] {e}\n"
yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
finally:
current_process = None
if dataset_path:
pathlib.Path(dataset_path).unlink(missing_ok=True)
full_output += "[Eval Finished]\n"
if "Evaluation Report" in output_choices:
vis_port = 7901
outputs_root = "./outputs"
        try:
            # newest run directory; surfaced in the status line below
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:  # no runs yet
            latest_output = outputs_root
vis_cmd = [
"evalscope", "app",
"--outputs", outputs_root,
"--server-name", "0.0.0.0",
"--server-port", str(vis_port),
]
threading.Thread(
target=subprocess.Popen,
args=(vis_cmd,),
kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
daemon=True
).start()
full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
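# For reference, run_eval_tool() assembles a command of this shape (values
# illustrative; --eval-type/--api-url/--api-key are added only when an API URL
# is supplied):
#   evalscope eval --model my-model --datasets gsm8k \
#       --eval-type service --api-url https://host/v1/chat/completions \
#       --api-key sk-xxx --limit 100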
# ---------------- Stop handler ----------------
def stop_eval() -> str:
"""
彻底终止 current_process 及其全部子孙进程:
1. SIGINT优雅退出3 秒宽限)
2. 仍存活则 SIGKILL
3. wait() 主进程,防止僵尸
"""
global current_process, should_stop
should_stop = True
if not (current_process and current_process.poll() is None):
return "[⚠️ 无活动 evalscope 进程]\n"
    try:
        parent = psutil.Process(current_process.pid)
        family = parent.children(recursive=True) + [parent]
        # 1) SIGINT: ask every process to exit gracefully
        for p in family:
            try:
                p.send_signal(signal.SIGINT)
            except psutil.NoSuchProcess:
                pass  # exited between enumeration and signaling
        _, alive = psutil.wait_procs(family, timeout=3)
        # 2) SIGKILL whatever survived the grace period
        for p in alive:
            p.kill()
        psutil.wait_procs(alive, timeout=3)
        # 3) reap the main process so it cannot become a zombie
        current_process.wait(timeout=3)
        return "[✅ Process tree terminated (SIGINT ➜ SIGKILL fallback)]\n"
    except Exception as exc:
        return f"[❌ Failed to terminate: {exc}]\n"
finally:
current_process = None
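# Note: psutil.wait_procs(family, timeout=3) returns a (gone, alive) pair, so
# only processes that ignored SIGINT get SIGKILLed. Enumerating descendants of
# current_process.pid (instead of signaling our own process group) keeps the
# Gradio server itself out of the blast radius.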
# ---------------- Controller (launch only) ----------------
def toggle_run(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset, dataset_file,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override,
extra_args,
is_running,
run_mode
):
global should_stop
dataset_path = dataset_file.name if dataset_file else None
if not inputs:
msg = "[❌ 错误] 必须至少选择一个输入源API、本地、基准或自定义才能开始运行。\n"
yield msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
return
should_stop = False
if run_mode == "perf":
yield from run_perf(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
dataset_path,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override,
extra_args
)
elif run_mode == "eval":
yield from run_eval_tool(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
dataset_path,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override,
extra_args
)
elif run_mode == "app":
info = "[⚠️ 当前为 app 模式,请手动打开 http://localhost:7901 查看报告]\n"
yield info, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
# ---------------- Input-source exclusivity logic ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"]
group1 = {"API Models", "Local Models"}
group2 = {"Benchmarks", "Custom Datasets"}
def keep_only_one(group):
filtered = [item for item in selected if item in group]
return filtered[-1:]
final_sel = set(selected)
final_sel -= group1
final_sel |= set(keep_only_one(group1))
final_sel -= group2
final_sel |= set(keep_only_one(group2))
final_list = [itm for itm in order if itm in final_sel]
input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list)
api_field_update = gr.update(visible="API Models" in final_sel)
return input_update, api_field_update
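# Example: with both "API Models" and "Local Models" checked, keep_only_one()
# retains only the last entry of that group in the selection list, so the value
# collapses back to a single model source (the Benchmarks / Custom Datasets
# pair behaves the same way), and the API credential fields stay visible only
# while "API Models" survives the filtering.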
# ---------------- UI construction ----------------
with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
is_running = gr.State(value=False)
    # ── Top bar: mode selection ─────────────────────────────
with gr.Group():
with gr.Row():
            mode_dropdown = gr.Dropdown(
                label="Evaluation type",
                choices=["eval", "perf", "app"],
                value="perf",
                info="eval: capability benchmarks; perf: performance testing; app: visualization"
            )
    # ── Input source selection ────────────────────────────────
with gr.Group():
with gr.Row():
            input_choices = gr.CheckboxGroup(
                label="Select input sources",
                choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
                interactive=True
            )
    # ── API parameters ─────────────────────────────────
with gr.Column(visible=False) as api_fields:
        api_url_input = gr.Textbox(label="API URL", placeholder="https://.../v1/chat/completions")
        api_token_input = gr.Textbox(label="API token", type="password", placeholder="sk-xxx")
    with gr.Accordion("Run parameters (optional)", open=False):
with gr.Row():
api_provider_dropdown = gr.Dropdown(
label="API Provider",
choices=["openai", "azure", "ollama", "gemini"],
value="openai"
)
            dataset_dropdown = gr.Dropdown(
                label="Evaluation dataset (--dataset)",
                choices=PERF_DATASETS,
                value=PERF_DATASETS[0]
            )
            dataset_file_input = gr.File(
                label="line_by_line dataset file (.txt)",
                file_types=[".txt"],  # could also be ["text/plain"]
                visible=False  # hidden by default; shown once line_by_line is selected
            )
        model_override_input = gr.Textbox(label="Custom model name (--model)", placeholder="llm-name")
        extra_args_input = gr.Textbox(label="Extra EvalScope arguments", placeholder="e.g. --disable-cache --temperature 0.7")
with gr.Row():
max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024)
min_tokens_slider = gr.Slider(label="Min Tokens", minimum=0, maximum=4096, step=64, value=1024)
with gr.Row():
            parallel_slider = gr.Slider(label="Concurrent requests", minimum=1, maximum=100, step=1, value=1)
            num_req_slider = gr.Slider(label="Number of requests", minimum=1, maximum=1000, step=1, value=100)
        max_prompt_len_slider = gr.Slider(
            label="Max prompt length", minimum=2048, maximum=262144, step=512, value=15360
        )
    # ── Local / external module toggles ──────────────────────────
with gr.Row():
with gr.Column():
            native_choices = gr.CheckboxGroup(
                label="Enable local modules",
                choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"]
            )
with gr.Column():
            other_choices = gr.CheckboxGroup(
                label="Enable external backends",
                choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"]
            )
    # ── Output toggles ─────────────────────────────────
    output_choices = gr.CheckboxGroup(
        label="Output formats",
        choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
    )
    # ── Run & Stop buttons ─────────────────────────
run_button = gr.Button("Run Evaluation", variant="primary")
stop_button = gr.Button("Stop Evaluation", variant="stop", visible=False)
    # ── Output area ─────────────────────────────────
    output_text = gr.TextArea(
        label="Execution output", lines=20, interactive=False, show_copy_button=True
    )
    # ── Event bindings ─────────────────────────────────
input_choices.change(
fn=enforce_input_exclusive_and_toggle_fields,
inputs=input_choices,
outputs=[input_choices, api_fields]
)
mode_dropdown.change(
lambda mode: gr.update(
choices=EVAL_DATASETS if mode == "eval" else PERF_DATASETS,
value=EVAL_DATASETS[0] if mode == "eval" else PERF_DATASETS[0]
),
inputs=mode_dropdown,
outputs=dataset_dropdown
)
dataset_dropdown.change(
toggle_dataset_file_visibility,
inputs=dataset_dropdown,
outputs=dataset_file_input
)
    # ---- Run button (queue=True) ----
run_button.click(
fn=toggle_run,
inputs=[
input_choices, native_choices, other_choices,
output_choices,
api_url_input, api_token_input,
api_provider_dropdown, dataset_dropdown, dataset_file_input,
max_tokens_slider, min_tokens_slider, parallel_slider,
max_prompt_len_slider, num_req_slider,
model_override_input,
extra_args_input,
is_running,
mode_dropdown
],
outputs=[output_text, is_running, run_button, stop_button],
show_progress=True,
queue=True
)
    # ---- Stop button (queue=False) ----
def stop_action():
msg = stop_eval()
return msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
stop_button.click(
fn=stop_action,
inputs=None,
outputs=[output_text, is_running, run_button, stop_button],
queue=False
)
# ---------------- Entry point ----------------
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7900)
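# Launch with `python gradio_ui.py`, then open http://localhost:7900.
# When "Evaluation Report" is checked, the report viewer started via
# `evalscope app` is served separately on port 7901.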