hailin 2025-07-18 15:40:28 +08:00
parent 063f21a336
commit 8360f8875f
2 changed files with 511 additions and 73 deletions


@@ -1,3 +1,14 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Gradio UI + EvalScope orchestration script (production version).
Key changes:
1. Standalone Stop Evaluation button (queue=False) that triggers `stop_eval()` immediately.
2. `stop_eval()` uses psutil to recursively kill the whole process tree, then wait()s it, eliminating zombies.
3. Every generator now yields 4 outputs: output_text, is_running, run_button update, stop_button update.
"""
import time
import os
import glob
@@ -17,8 +28,10 @@ EVAL_DATASETS = [
"hellaswag", "humaneval", "mmlu", "mmlu_pro", "race",
"trivia_qa", "truthful_qa"
]
PERF_DATASETS = ["openqa", "flickr8k", "longalpaca", "random_dataset", "line_by_line", "custom", "speed_benchmark"]
PERF_DATASETS = [
"openqa", "flickr8k", "longalpaca", "random_dataset",
"line_by_line", "custom", "speed_benchmark"
]
# ---------------- perf mode runner ----------------
def run_perf(
@@ -49,32 +62,37 @@ def run_perf(
]
full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
yield full_output, True, gr.update(value="Stop Evaluation")
yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
try:
current_process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
text=True, bufsize=1, start_new_session=True
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
start_new_session=True, # independent process group, so later killpg / psutil cleanup reaches every child
)
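# Stream stdout line by line; should_stop (set by the Stop button) breaks the loop.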
for line in current_process.stdout:
if should_stop:
break
full_output += line
yield full_output, True, gr.update(value="Stop Evaluation")
yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
current_process.stdout.close()
current_process.wait()
except Exception as e:
full_output += f"[Error] {e}\n"
yield full_output, False, gr.update(value="Run Evaluation")
yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
finally:
current_process = None
full_output += "[Eval Finished]\n"
# auto-launch the visualization server
if "Evaluation Report" in output_choices:
vis_port = 7901
outputs_root = "./outputs"
@@ -101,7 +119,7 @@ def run_perf(
full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
yield full_output, False, gr.update(value="Run Evaluation")
yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
# ---------------- eval mode runner ----------------
def run_eval_tool(
@@ -132,26 +150,30 @@ def run_eval_tool(
command += ["--limit", str(int(num_requests))]
full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
yield full_output, True, gr.update(value="Stop Evaluation")
yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
try:
current_process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
text=True, bufsize=1, start_new_session=True
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
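# independent process group, as in run_perf, so stop_eval() can kill the whole tree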
start_new_session=True
)
for line in current_process.stdout:
if should_stop:
break
full_output += line
yield full_output, True, gr.update(value="Stop Evaluation")
yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
current_process.stdout.close()
current_process.wait()
except Exception as e:
full_output += f"[Error] {e}\n"
yield full_output, False, gr.update(value="Run Evaluation")
yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
finally:
current_process = None
@@ -184,15 +206,15 @@ def run_eval_tool(
full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
yield full_output, False, gr.update(value="Run Evaluation")
yield full_output, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
# ---------------- stop function ----------------
def stop_eval():
def stop_eval() -> str:
"""
Terminate current_process and every descendant process:
1. Send SIGINT (Ctrl-C) first to attempt a graceful exit.
2. Escalate to SIGKILL for anything still alive after 3 seconds.
3. Finally wait() on the main process to prevent zombies.
1. SIGINT (graceful exit, 10-second grace period)
2. SIGKILL anything still alive
3. wait() on the main process to prevent zombies
"""
global current_process, should_stop
should_stop = True
@@ -202,34 +224,29 @@ def stop_eval():
try:
parent = psutil.Process(current_process.pid)
family = parent.children(recursive=True) + [parent] # the whole process tree
family = parent.children(recursive=True) + [parent]
# ── 1) Attempt graceful termination ─────────────
# 1) SIGINT
for p in family:
p.send_signal(signal.SIGINT)
_, alive = psutil.wait_procs(family, timeout=3)
# allow a 10-second grace period
_, alive = psutil.wait_procs(family, timeout=10)
# ── 2) Force-kill anything still alive ──────────
# 2) SIGKILL
for p in alive:
p.kill()
psutil.wait_procs(alive, timeout=10)
# ── 3) Reap the zombie and close handles ────────
current_process.wait(timeout=10)
psutil.wait_procs(alive, timeout=3)
# 3) reap
current_process.wait(timeout=3)
return "[✅ 已终止进程树 (SIGINT ➜ SIGKILL fallback)]\n"
except Exception as e:
return f"[❌ 终止失败: {e}]\n"
except Exception as exc:
return f"[❌ 终止失败: {exc}]\n"
finally:
current_process = None
# ---------------- controller ----------------
# ---------------- controller (launch only) ----------------
def toggle_run(
inputs, native, other, output_choices,
api_url, api_token,
@@ -244,10 +261,9 @@ def toggle_run(
if not inputs:
msg = "[❌ 错误] 必须至少选择一个输入源API、本地、基准或自定义才能开始运行。\n"
yield msg, False, gr.update(value="Run Evaluation")
yield msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
return
if not is_running:
should_stop = False
if run_mode == "perf":
yield from run_perf(
@@ -268,11 +284,8 @@ def toggle_run(
model_override
)
elif run_mode == "app":
yield "[⚠️ 当前为 app 模式,请手动打开 http://localhost:7901 查看报告]", False, gr.update(value="Run Evaluation")
else:
msg = stop_eval()
yield msg, False, gr.update(value="Run Evaluation")
info = "[⚠️ 当前为 app 模式,请手动打开 http://localhost:7901 查看报告]\n"
yield info, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
# ---------------- input-source exclusivity logic ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
@@ -299,6 +312,7 @@ def enforce_input_exclusive_and_toggle_fields(selected):
with gr.Blocks(title="EvalScope 全功能界面") as demo:
is_running = gr.State(value=False)
# ── Top bar: mode selection ─────────────────────
with gr.Group():
with gr.Row():
mode_dropdown = gr.Dropdown(
@@ -308,6 +322,7 @@ with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
info="eval: 智力评测perf: 性能评测app: 可视化"
)
# ── Input source selection ──────────────────────
with gr.Group():
with gr.Row():
input_choices = gr.CheckboxGroup(
@@ -316,13 +331,22 @@ with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
interactive=True
)
# ── API parameters ──────────────────────────────
with gr.Column(visible=False) as api_fields:
api_url_input = gr.Textbox(label="API URL", placeholder="https://.../v1/chat/completions")
api_token_input = gr.Textbox(label="API token", type="password", placeholder="sk-xxx")
with gr.Accordion("Run parameters (optional)", open=False):
with gr.Row():
api_provider_dropdown = gr.Dropdown(label="API Provider", choices=["openai", "azure", "ollama", "gemini"], value="openai")
dataset_dropdown = gr.Dropdown(label="Dataset (--dataset)", choices=PERF_DATASETS, value=PERF_DATASETS[0])
api_provider_dropdown = gr.Dropdown(
label="API Provider",
choices=["openai", "azure", "ollama", "gemini"],
value="openai"
)
dataset_dropdown = gr.Dropdown(
label="评测数据集 (--dataset)",
choices=PERF_DATASETS,
value=PERF_DATASETS[0]
)
model_override_input = gr.Textbox(label="Model name override (--model)", placeholder="llm-name")
with gr.Row():
max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024)
@@ -330,18 +354,39 @@ with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
with gr.Row():
parallel_slider = gr.Slider(label="Concurrent requests", minimum=1, maximum=100, step=1, value=1)
num_req_slider = gr.Slider(label="Number of requests", minimum=1, maximum=1000, step=1, value=100)
max_prompt_len_slider = gr.Slider(label="Max prompt length", minimum=2048, maximum=262144, step=512, value=15360)
max_prompt_len_slider = gr.Slider(
label="Max prompt length", minimum=2048, maximum=262144, step=512, value=15360
)
# ── Local / external module toggles ─────────────
with gr.Row():
with gr.Column():
native_choices = gr.CheckboxGroup(label="Local modules", choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"])
native_choices = gr.CheckboxGroup(
label="启用本地模块",
choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"]
)
with gr.Column():
other_choices = gr.CheckboxGroup(label="External backends", choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"])
other_choices = gr.CheckboxGroup(
label="启用外部后端",
choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"]
)
output_choices = gr.CheckboxGroup(label="Output formats", choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"])
run_button = gr.Button("Run Evaluation")
output_text = gr.TextArea(label="Execution log", lines=20, interactive=False, show_copy_button=True)
# ── Output toggles ──────────────────────────────
output_choices = gr.CheckboxGroup(
label="输出形式",
choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
)
# ── Run & Stop buttons ──────────────────────────
run_button = gr.Button("Run Evaluation", variant="primary")
stop_button = gr.Button("Stop Evaluation", variant="stop", visible=False)
# ── Output area ─────────────────────────────────
output_text = gr.TextArea(
label="执行结果", lines=20, interactive=False, show_copy_button=True
)
# ── Event wiring ────────────────────────────────
input_choices.change(
fn=enforce_input_exclusive_and_toggle_fields,
inputs=input_choices,
@@ -357,6 +402,7 @@ with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
outputs=dataset_dropdown
)
# ---- Run button (queue=True) ----
run_button.click(
fn=toggle_run,
inputs=[
@@ -370,9 +416,23 @@ with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
is_running,
mode_dropdown
],
outputs=[output_text, is_running, run_button],
show_progress=True
outputs=[output_text, is_running, run_button, stop_button],
show_progress=True,
queue=True
)
# ---- Stop button (queue=False) ----
def stop_action():
msg = stop_eval()
return msg, False, gr.update(value="Run Evaluation", interactive=True), gr.update(visible=False)
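# queue=False lets the Stop click bypass the queue and fire while the Run generator is still streaming.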
stop_button.click(
fn=stop_action,
inputs=None,
outputs=[output_text, is_running, run_button, stop_button],
queue=False
)
# ---------------- entry point ----------------
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7900)

gradio_ui_2025_7_18.py (new file, 378 lines)

@@ -0,0 +1,378 @@
import time
import os
import glob
import threading
import subprocess
import gradio as gr
import psutil
import signal
# ---------------- global process handle ----------------
current_process = None
should_stop = False
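# current_process holds the Popen handle of the active evalscope run;
# should_stop is a cooperative flag polled by the stdout-streaming loops.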
# ---------------- available datasets ----------------
EVAL_DATASETS = [
"arc", "bbh", "ceval", "cmmlu", "competition_math", "gsm8k",
"hellaswag", "humaneval", "mmlu", "mmlu_pro", "race",
"trivia_qa", "truthful_qa"
]
PERF_DATASETS = ["openqa", "flickr8k", "longalpaca", "random_dataset", "line_by_line", "custom", "speed_benchmark"]
# ---------------- perf mode runner ----------------
def run_perf(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override
):
global current_process
timestamp = time.strftime("%Y%m%d-%H%M%S")
model_name = model_override.strip() or timestamp
command = [
"evalscope", "perf",
"--url", api_url.strip(),
"--api", api_provider,
"--model", model_name,
"--dataset", dataset,
"--max-tokens", str(int(max_tokens)),
"--min-tokens", str(int(min_tokens)),
"--parallel", str(int(parallel_reqs)),
"--max-prompt-length", str(int(max_prompt_len)),
"--number", str(int(num_requests)),
"--api-key", api_token.strip(),
]
full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
yield full_output, True, gr.update(value="Stop Evaluation")
try:
current_process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
text=True, bufsize=1, start_new_session=True
)
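# Stream stdout line by line so the log updates live; should_stop breaks the loop.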
for line in current_process.stdout:
if should_stop:
break
full_output += line
yield full_output, True, gr.update(value="Stop Evaluation")
current_process.stdout.close()
current_process.wait()
except Exception as e:
full_output += f"[Error] {e}\n"
yield full_output, False, gr.update(value="Run Evaluation")
finally:
current_process = None
full_output += "[Eval Finished]\n"
if "Evaluation Report" in output_choices:
vis_port = 7901
outputs_root = "./outputs"
try:
latest_output = max(
glob.glob(os.path.join(outputs_root, "*")),
key=os.path.getmtime
)
except ValueError:
latest_output = outputs_root
vis_cmd = [
"evalscope", "app",
"--outputs", outputs_root,
"--server-name", "0.0.0.0",
"--server-port", str(vis_port),
]
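# Launch the report viewer in a background daemon thread so it never blocks the UI.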
threading.Thread(
target=subprocess.Popen,
args=(vis_cmd,),
kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
daemon=True
).start()
full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
yield full_output, False, gr.update(value="Run Evaluation")
# ---------------- eval mode runner ----------------
def run_eval_tool(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override
):
global current_process
timestamp = time.strftime("%Y%m%d-%H%M%S")
model_name = model_override.strip() or timestamp
command = [
"evalscope", "eval",
"--model", model_name,
"--datasets", dataset
]
if api_url.strip():
command += [
"--eval-type", "service",
"--api-url", api_url.strip(),
"--api-key", api_token.strip()
]
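# Optionally cap evaluated samples per dataset via --limit (reusing the request-count slider).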
if num_requests:
command += ["--limit", str(int(num_requests))]
full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
yield full_output, True, gr.update(value="Stop Evaluation")
try:
current_process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
text=True, bufsize=1, start_new_session=True
)
for line in current_process.stdout:
if should_stop:
break
full_output += line
yield full_output, True, gr.update(value="Stop Evaluation")
current_process.stdout.close()
current_process.wait()
except Exception as e:
full_output += f"[Error] {e}\n"
yield full_output, False, gr.update(value="Run Evaluation")
finally:
current_process = None
full_output += "[Eval Finished]\n"
if "Evaluation Report" in output_choices:
vis_port = 7901
outputs_root = "./outputs"
try:
latest_output = max(
glob.glob(os.path.join(outputs_root, "*")),
key=os.path.getmtime
)
except ValueError:
latest_output = outputs_root
vis_cmd = [
"evalscope", "app",
"--outputs", outputs_root,
"--server-name", "0.0.0.0",
"--server-port", str(vis_port),
]
threading.Thread(
target=subprocess.Popen,
args=(vis_cmd,),
kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
daemon=True
).start()
full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
yield full_output, False, gr.update(value="Run Evaluation")
# ---------------- stop function ----------------
def stop_eval():
"""
Terminate current_process and every descendant process:
1. Send SIGINT (Ctrl-C) first to attempt a graceful exit.
2. Escalate to SIGKILL for anything still alive after 3 seconds.
3. Finally wait() on the main process to prevent zombies.
"""
global current_process, should_stop
should_stop = True
if not (current_process and current_process.poll() is None):
return "[⚠️ 无活动 evalscope 进程]\n"
try:
parent = psutil.Process(current_process.pid)
family = parent.children(recursive=True) + [parent] # the whole process tree
# ── 1) Attempt graceful termination ─────────────
for p in family:
p.send_signal(signal.SIGINT)
# allow a 3-second grace period
_, alive = psutil.wait_procs(family, timeout=3)
# ── 2) Force-kill anything still alive ──────────
for p in alive:
p.kill()
psutil.wait_procs(alive, timeout=3)
# ── 3) Reap the zombie and close handles ────────
current_process.wait(timeout=3)
return "[✅ 已终止进程树 (SIGINT ➜ SIGKILL fallback)]\n"
except Exception as e:
return f"[❌ 终止失败: {e}]\n"
finally:
current_process = None
# ---------------- controller ----------------
def toggle_run(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override,
is_running,
run_mode
):
global should_stop
if not inputs:
msg = "[❌ 错误] 必须至少选择一个输入源API、本地、基准或自定义才能开始运行。\n"
yield msg, False, gr.update(value="Run Evaluation")
return
if not is_running:
should_stop = False
if run_mode == "perf":
yield from run_perf(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override
)
elif run_mode == "eval":
yield from run_eval_tool(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override
)
elif run_mode == "app":
yield "[⚠️ 当前为 app 模式,请手动打开 http://localhost:7901 查看报告]", False, gr.update(value="Run Evaluation")
else:
msg = stop_eval()
yield msg, False, gr.update(value="Run Evaluation")
# ---------------- input-source exclusivity logic ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"]
group1 = {"API Models", "Local Models"}
group2 = {"Benchmarks", "Custom Datasets"}
def keep_only_one(group):
filtered = [item for item in selected if item in group]
return filtered[-1:]
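# Rebuild the selection: clear each exclusive group, then re-add its most recent pick.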
final_sel = set(selected)
final_sel -= group1
final_sel |= set(keep_only_one(group1))
final_sel -= group2
final_sel |= set(keep_only_one(group2))
final_list = [itm for itm in order if itm in final_sel]
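# Only push a new value when the selection actually changed, avoiding redundant change events.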
input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list)
api_field_update = gr.update(visible="API Models" in final_sel)
return input_update, api_field_update
# ---------------- UI construction ----------------
with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
is_running = gr.State(value=False)
with gr.Group():
with gr.Row():
mode_dropdown = gr.Dropdown(
label="评测类型",
choices=["eval", "perf", "app"],
value="perf",
info="eval: 智力评测perf: 性能评测app: 可视化"
)
with gr.Group():
with gr.Row():
input_choices = gr.CheckboxGroup(
label="选择输入源",
choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
interactive=True
)
with gr.Column(visible=False) as api_fields:
api_url_input = gr.Textbox(label="API URL", placeholder="https://.../v1/chat/completions")
api_token_input = gr.Textbox(label="API token", type="password", placeholder="sk-xxx")
with gr.Accordion("Run parameters (optional)", open=False):
with gr.Row():
api_provider_dropdown = gr.Dropdown(label="API Provider", choices=["openai", "azure", "ollama", "gemini"], value="openai")
dataset_dropdown = gr.Dropdown(label="Dataset (--dataset)", choices=PERF_DATASETS, value=PERF_DATASETS[0])
model_override_input = gr.Textbox(label="Model name override (--model)", placeholder="llm-name")
with gr.Row():
max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024)
min_tokens_slider = gr.Slider(label="Min Tokens", minimum=0, maximum=4096, step=64, value=1024)
with gr.Row():
parallel_slider = gr.Slider(label="Concurrent requests", minimum=1, maximum=100, step=1, value=1)
num_req_slider = gr.Slider(label="Number of requests", minimum=1, maximum=1000, step=1, value=100)
max_prompt_len_slider = gr.Slider(label="Max prompt length", minimum=2048, maximum=262144, step=512, value=15360)
with gr.Row():
with gr.Column():
native_choices = gr.CheckboxGroup(label="Local modules", choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"])
with gr.Column():
other_choices = gr.CheckboxGroup(label="External backends", choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"])
output_choices = gr.CheckboxGroup(label="Output formats", choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"])
run_button = gr.Button("Run Evaluation")
output_text = gr.TextArea(label="Execution log", lines=20, interactive=False, show_copy_button=True)
input_choices.change(
fn=enforce_input_exclusive_and_toggle_fields,
inputs=input_choices,
outputs=[input_choices, api_fields]
)
mode_dropdown.change(
lambda mode: gr.update(
choices=EVAL_DATASETS if mode == "eval" else PERF_DATASETS,
value=EVAL_DATASETS[0] if mode == "eval" else PERF_DATASETS[0]
),
inputs=mode_dropdown,
outputs=dataset_dropdown
)
run_button.click(
fn=toggle_run,
inputs=[
input_choices, native_choices, other_choices,
output_choices,
api_url_input, api_token_input,
api_provider_dropdown, dataset_dropdown,
max_tokens_slider, min_tokens_slider, parallel_slider,
max_prompt_len_slider, num_req_slider,
model_override_input,
is_running,
mode_dropdown
],
outputs=[output_text, is_running, run_button],
show_progress=True
)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7900)