#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Gradio UI + EvalScope dispatch script (production version)

Key changes
1. Dedicated Stop Evaluation button bound with queue=False, so `stop_eval()` fires immediately
2. `stop_eval()` uses psutil to recursively kill the process tree and wait() on it, leaving no zombies
3. Every generator yields the same 4 outputs: output_text, is_running, run_button update, stop_button update
"""

import glob
import os
import pathlib
import shlex
import signal
import subprocess
import threading
import time

import gradio as gr
import psutil

# ---------------- Global process handle ----------------
current_process = None
should_stop = False
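# Both globals are shared between the streaming run_perf / run_eval_tool
# generators and the stop_eval callback, so a Stop click can reach the
# subprocess started by a still-running generator.
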
# ---------------- Selectable datasets ----------------
EVAL_DATASETS = [
    "arc", "bbh", "ceval", "cmmlu", "competition_math", "gsm8k",
    "hellaswag", "humaneval", "mmlu", "mmlu_pro", "race",
    "trivia_qa", "truthful_qa"
]
PERF_DATASETS = [
    "openqa", "flickr8k", "longalpaca", "random_dataset",
    "line_by_line", "custom", "speed_benchmark"
]

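# The mode dropdown defined below switches the dataset dropdown between these
# two lists: "eval" mode offers EVAL_DATASETS, "perf" mode offers PERF_DATASETS.
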
def toggle_dataset_file_visibility(ds):
    # Show the dataset file picker only when the line_by_line dataset is selected
    return gr.update(visible=(ds == "line_by_line"))

# ---------------- perf mode ----------------
def run_perf(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    dataset_path,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override,
    extra_args
):
    global current_process

    if dataset == "line_by_line" and dataset_path is None:
        msg = "[❌] Please select a line_by_line dataset file (.txt)"
        yield msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
        return

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp

    # Slider values arrive as numbers from Gradio; cast to int before stringifying
    command = [
        "evalscope", "perf",
        "--url", api_url.strip(),
        "--api", api_provider,
        "--model", model_name,
        "--dataset", dataset,
        "--max-tokens", str(int(max_tokens)),
        "--min-tokens", str(int(min_tokens)),
        "--parallel", str(int(parallel_reqs)),
        "--max-prompt-length", str(int(max_prompt_len)),
        "--number", str(int(num_requests)),
        "--api-key", api_token.strip(),
    ]

    if dataset == "line_by_line" and dataset_path:
        command += ["--dataset-path", dataset_path]

    if extra_args.strip():
        command += shlex.split(extra_args.strip())

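    # For reference, an assembled command looks roughly like this (illustrative
    # placeholder values based on the UI defaults, not taken from a real run):
    #   evalscope perf --url https://example.com/v1/chat/completions --api openai \
    #       --model my-model --dataset openqa --max-tokens 1024 --min-tokens 1024 \
    #       --parallel 1 --max-prompt-length 15360 --number 100 --api-key sk-xxx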
    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(interactive=False), gr.update(visible=True)

    try:
        current_process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,                # line-buffered in text mode, so output streams line by line
            start_new_session=True,   # own process group, so killpg / psutil can reap the whole tree later
        )

        # Stream subprocess output into the UI until EOF or a stop request
        for line in current_process.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(interactive=False), gr.update(visible=True)

        current_process.stdout.close()
        current_process.wait()

    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
        return  # skip the "finished" message and visualization launch on failure

    finally:
        current_process = None
        if dataset_path:
            pathlib.Path(dataset_path).unlink(missing_ok=True)

    full_output += "[Eval Finished]\n"

    # Auto-start the visualization app
    if "Evaluation Report" in output_choices:
        vis_port = 7901
        outputs_root = "./outputs"
        try:
            # Most recent output directory; informational only, the app serves the whole outputs root
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:
            latest_output = outputs_root

        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
            daemon=True
        ).start()

        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)

# ---------------- eval mode ----------------
def run_eval_tool(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    dataset_path,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override, extra_args
):
    global current_process

    if dataset == "line_by_line" and dataset_path is None:
        msg = "[❌] Please select a line_by_line dataset file (.txt)"
        yield msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
        return

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp

    command = [
        "evalscope", "eval",
        "--model", model_name,
        "--datasets", dataset
    ]
    if api_url.strip():
        command += [
            "--eval-type", "service",
            "--api-url", api_url.strip(),
            "--api-key", api_token.strip()
        ]
    if num_requests:
        command += ["--limit", str(int(num_requests))]

    if extra_args.strip():
        command += shlex.split(extra_args.strip())

    if dataset == "line_by_line" and dataset_path:
        command += ["--dataset-path", dataset_path]

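    # Note on the command above: when an API endpoint is provided, the run is
    # dispatched as a service evaluation (--eval-type service); otherwise the
    # flag is omitted and evalscope falls back to its default eval type.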
    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(interactive=False), gr.update(visible=True)

    try:
        current_process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,                # line-buffered in text mode
            start_new_session=True    # own process group for clean termination in stop_eval()
        )

        for line in current_process.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(interactive=False), gr.update(visible=True)

        current_process.stdout.close()
        current_process.wait()

    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
        return  # skip the "finished" message and visualization launch on failure

    finally:
        current_process = None
        if dataset_path:
            pathlib.Path(dataset_path).unlink(missing_ok=True)

    full_output += "[Eval Finished]\n"

if "Evaluation Report" in output_choices:
|
||
vis_port = 7901
|
||
outputs_root = "./outputs"
|
||
try:
|
||
latest_output = max(
|
||
glob.glob(os.path.join(outputs_root, "*")),
|
||
key=os.path.getmtime
|
||
)
|
||
except ValueError:
|
||
latest_output = outputs_root
|
||
|
||
vis_cmd = [
|
||
"evalscope", "app",
|
||
"--outputs", outputs_root,
|
||
"--server-name", "0.0.0.0",
|
||
"--server-port", str(vis_port),
|
||
]
|
||
threading.Thread(
|
||
target=subprocess.Popen,
|
||
args=(vis_cmd,),
|
||
kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
|
||
daemon=True
|
||
).start()
|
||
|
||
full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
|
||
|
||
yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
|
||
|
||
# ---------------- Stop function ----------------
def stop_eval() -> str:
    """
    Fully terminate current_process and all of its descendants:
    1. SIGINT (graceful shutdown, 3-second grace period)
    2. SIGKILL anything still alive
    3. wait() on the main process to avoid zombies
    """
    global current_process, should_stop
    should_stop = True

    if not (current_process and current_process.poll() is None):
        return "[⚠️ No active evalscope process]\n"

    try:
        parent = psutil.Process(current_process.pid)
        family = parent.children(recursive=True) + [parent]

        # 1) SIGINT
        for p in family:
            p.send_signal(signal.SIGINT)
        _, alive = psutil.wait_procs(family, timeout=3)

        # 2) SIGKILL
        for p in alive:
            p.kill()
        psutil.wait_procs(alive, timeout=3)

        # 3) reap
        current_process.wait(timeout=3)
        return "[✅ Process tree terminated (SIGINT ➜ SIGKILL fallback)]\n"

    except Exception as exc:
        return f"[❌ Termination failed: {exc}]\n"

    finally:
        current_process = None

# ---------------- Controller (start only) ----------------
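# toggle_run validates the input-source selection, resolves the uploaded dataset
# file to a path, resets the stop flag, then delegates to run_perf or
# run_eval_tool and re-yields their (output_text, is_running, run_button,
# stop_button) updates unchanged.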
def toggle_run(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset, dataset_file,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override,
    extra_args,
    is_running,
    run_mode
):
    global should_stop

    dataset_path = dataset_file.name if dataset_file else None

    if not inputs:
        msg = "[❌ Error] At least one input source (API, local, benchmark, or custom) must be selected before running.\n"
        yield msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
        return

    should_stop = False
    if run_mode == "perf":
        yield from run_perf(
            inputs, native, other, output_choices,
            api_url, api_token,
            api_provider, dataset,
            dataset_path,
            max_tokens, min_tokens, parallel_reqs,
            max_prompt_len, num_requests,
            model_override,
            extra_args
        )
    elif run_mode == "eval":
        yield from run_eval_tool(
            inputs, native, other, output_choices,
            api_url, api_token,
            api_provider, dataset,
            dataset_path,
            max_tokens, min_tokens, parallel_reqs,
            max_prompt_len, num_requests,
            model_override,
            extra_args
        )
    elif run_mode == "app":
        info = "[⚠️ app mode selected; open http://localhost:7901 manually to view the report]\n"
        yield info, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)

# ---------------- Input-source exclusivity logic ----------------
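# Within each pair (API Models / Local Models, and Benchmarks / Custom Datasets)
# at most one option is kept (the last entry of the current selection that falls
# in the group), so the two choices in a pair stay mutually exclusive. The API
# fields are shown only while "API Models" remains selected.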
def enforce_input_exclusive_and_toggle_fields(selected):
    order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"]
    group1 = {"API Models", "Local Models"}
    group2 = {"Benchmarks", "Custom Datasets"}

    def keep_only_one(group):
        filtered = [item for item in selected if item in group]
        return filtered[-1:]

    final_sel = set(selected)
    final_sel -= group1
    final_sel |= set(keep_only_one(group1))
    final_sel -= group2
    final_sel |= set(keep_only_one(group2))

    final_list = [itm for itm in order if itm in final_sel]
    input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list)
    api_field_update = gr.update(visible="API Models" in final_sel)
    return input_update, api_field_update

# ---------------- UI construction ----------------
with gr.Blocks(title="EvalScope Full-Featured UI") as demo:
    is_running = gr.State(value=False)

    # ── Top bar: mode selection ─────────────────────────────
    with gr.Group():
        with gr.Row():
            mode_dropdown = gr.Dropdown(
                label="Evaluation type",
                choices=["eval", "perf", "app"],
                value="perf",
                info="eval: capability evaluation; perf: performance benchmark; app: visualization"
            )

    # ── Input source selection ────────────────────────────────
    with gr.Group():
        with gr.Row():
            input_choices = gr.CheckboxGroup(
                label="Select input sources",
                choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
                interactive=True
            )

    # ── API parameters ─────────────────────────────────
    with gr.Column(visible=False) as api_fields:
        api_url_input = gr.Textbox(label="API URL", placeholder="https://.../v1/chat/completions")
        api_token_input = gr.Textbox(label="API token", type="password", placeholder="sk-xxx")
        with gr.Accordion("Run parameters (optional)", open=False):
            with gr.Row():
                api_provider_dropdown = gr.Dropdown(
                    label="API Provider",
                    choices=["openai", "azure", "ollama", "gemini"],
                    value="openai"
                )
                dataset_dropdown = gr.Dropdown(
                    label="Evaluation dataset (--dataset)",
                    choices=PERF_DATASETS,
                    value=PERF_DATASETS[0]
                )
                dataset_file_input = gr.File(
                    label="Line-by-line dataset file (txt)",
                    file_types=[".txt"],   # could also be ["text/plain"]
                    visible=False          # hidden by default; shown when line_by_line is selected
                )
            model_override_input = gr.Textbox(label="Custom model name (--model)", placeholder="llm-name")
            extra_args_input = gr.Textbox(label="Extra EvalScope arguments", placeholder="e.g. --disable-cache --temperature 0.7")
            with gr.Row():
                max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024)
                min_tokens_slider = gr.Slider(label="Min Tokens", minimum=0, maximum=4096, step=64, value=1024)
            with gr.Row():
                parallel_slider = gr.Slider(label="Concurrent requests", minimum=1, maximum=100, step=1, value=1)
                num_req_slider = gr.Slider(label="Number of requests", minimum=1, maximum=1000, step=1, value=100)
            max_prompt_len_slider = gr.Slider(
                label="Max prompt length", minimum=2048, maximum=262144, step=512, value=15360
            )

    # ── Local / external module toggles ──────────────────────────
    with gr.Row():
        with gr.Column():
            native_choices = gr.CheckboxGroup(
                label="Enable local modules",
                choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"]
            )
        with gr.Column():
            other_choices = gr.CheckboxGroup(
                label="Enable external backends",
                choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"]
            )

    # ── Output switches ─────────────────────────────────
    output_choices = gr.CheckboxGroup(
        label="Output formats",
        choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
    )

    # ── Run & Stop buttons ─────────────────────────
    run_button = gr.Button("Run Evaluation", variant="primary")
    stop_button = gr.Button("Stop Evaluation", variant="stop", visible=False)

    # ── Output area ─────────────────────────────────
    output_text = gr.TextArea(
        label="Execution output", lines=20, interactive=False, show_copy_button=True
    )

    # ── Event bindings ─────────────────────────────────
    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields]
    )

    mode_dropdown.change(
        lambda mode: gr.update(
            choices=EVAL_DATASETS if mode == "eval" else PERF_DATASETS,
            value=EVAL_DATASETS[0] if mode == "eval" else PERF_DATASETS[0]
        ),
        inputs=mode_dropdown,
        outputs=dataset_dropdown
    )

    dataset_dropdown.change(
        toggle_dataset_file_visibility,
        inputs=dataset_dropdown,
        outputs=dataset_file_input
    )

    # ---- Run button (queue=True) ----
    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices, native_choices, other_choices,
            output_choices,
            api_url_input, api_token_input,
            api_provider_dropdown, dataset_dropdown, dataset_file_input,
            max_tokens_slider, min_tokens_slider, parallel_slider,
            max_prompt_len_slider, num_req_slider,
            model_override_input,
            extra_args_input,
            is_running,
            mode_dropdown
        ],
        outputs=[output_text, is_running, run_button, stop_button],
        show_progress=True,
        queue=True
    )

    # ---- Stop button (queue=False) ----
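    # queue=False lets the stop request bypass the event queue, so it is handled
    # immediately instead of waiting behind the still-running evaluation generator.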
    def stop_action():
        msg = stop_eval()
        return msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)

    stop_button.click(
        fn=stop_action,
        inputs=None,
        outputs=[output_text, is_running, run_button, stop_button],
        queue=False
    )

# ---------------- Entry point ----------------
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7900)
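    # Note: streaming generator outputs rely on the Gradio queue. Recent Gradio
    # versions enable it by default; on older versions an explicit demo.queue()
    # before launch may be required (assumption based on general Gradio behavior,
    # not verified against the version this script targets).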