This commit is contained in:
hailin 2025-07-08 17:07:58 +08:00
parent 0fd04efbec
commit 8b8949779d
2 changed files with 317 additions and 39 deletions

214
gradio_ui.ok.py Normal file
View File

@ -0,0 +1,214 @@
import time
import os
import glob
import threading
import subprocess
import gradio as gr
# Handle to the currently running `evalscope perf` subprocess (None when idle).
current_process = None


# Run EvalScope and (optionally) start the visualization server.
def run_eval(inputs, native, other, output_choices, api_url, api_token):
    """Run an ``evalscope perf`` benchmark and stream its log to the UI.

    Generator used as a Gradio event handler. Every yield is a 3-tuple of
    ``(log_text, is_running, button_label_update)`` matching the outputs
    ``[output_text, is_running, run_button]``.

    1. Spawns ``evalscope perf`` against *api_url* with *api_token*.
    2. Streams the subprocess stdout line by line into the log textbox.
    3. If "Evaluation Report" is among *output_choices*, launches the
       ``evalscope app`` web visualization server in a daemon thread and
       appends its URL to the log.

    Parameters
    ----------
    inputs, native, other : list[str]
        Checkbox selections from the UI (currently not used by the command).
    output_choices : list[str]
        Output options; "Evaluation Report" enables the visualization server.
    api_url, api_token : str
        Target endpoint URL and API key for the benchmark.
    """
    global current_process
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    command = [
        "evalscope", "perf",
        "--url", api_url.strip(),
        "--api", "openai",
        "--model", timestamp,  # use the timestamp as the model name to avoid collisions
        "--dataset", "openqa",
        "--max-tokens", "1024",
        "--min-tokens", "1024",
        "--parallel", "1",
        "--max-prompt-length", "15360",
        "--number", "100",
        "--api-key", api_token.strip(),
    ]
    full_output = f"[Eval Started @ {timestamp}]\n"
    yield full_output, True, gr.update(value="Stop Evaluation")
    try:
        current_process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1
        )
        # Stream subprocess output into the UI as it is produced.
        for line in current_process.stdout:
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")
        current_process.stdout.close()
        current_process.wait()
    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")
        # Fix: stop here on failure. Previously execution fell through,
        # appended "[Eval Finished]" and could launch the visualization
        # server for a run that never produced any output.
        return
    finally:
        current_process = None
    full_output += "[Eval Finished]\n"
    # ========== Visualization report ==========
    # (The previous glob over ./outputs for the newest run directory was
    # dead code — `evalscope app` only needs the outputs root — so it was
    # removed.)
    if "Evaluation Report" in output_choices:
        vis_port = 7861
        outputs_root = "./outputs"
        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        # Launch in a daemon thread so the Gradio event loop is not blocked.
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL,
                    "stderr": subprocess.STDOUT},
            daemon=True
        ).start()
        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
    yield full_output, False, gr.update(value="Run Evaluation")
# Stop-button handler.
def stop_eval():
    """Terminate the running benchmark subprocess, if there is one.

    Returns a one-line log message describing the outcome.
    """
    global current_process
    proc = current_process
    # Nothing to do when no process was started or it already exited.
    if proc is None or proc.poll() is not None:
        return "[No active process]\n"
    proc.terminate()
    current_process = None
    return "[Stopped by user]\n"
# Run/Stop controller (must be a generator so run_eval can stream).
def toggle_run(inputs, native, other, output_choices,
               api_url, api_token, is_running):
    """Dispatch the Run/Stop button based on the *is_running* state."""
    if is_running:
        # User pressed Stop: terminate the process and reset the button.
        yield stop_eval(), False, gr.update(value="Run Evaluation")
    else:
        # User pressed Run: delegate to the streaming eval generator.
        yield from run_eval(inputs, native, other,
                            output_choices, api_url, api_token)
# Mutual exclusivity: within each group, keep only the last-selected option.
def enforce_input_exclusive_and_toggle_fields(selected):
    """Enforce one-choice-per-group on the input CheckboxGroup.

    "API Models"/"Local Models" are mutually exclusive, as are
    "Benchmarks"/"Custom Datasets"; the most recently selected member of
    each group wins. Also toggles the visibility of the API URL/token row.

    Parameters
    ----------
    selected : list[str]
        Current CheckboxGroup value, in click order.

    Returns
    -------
    tuple
        (update for the CheckboxGroup value, visibility update for the
        API-fields row).
    """
    group1 = {"API Models", "Local Models"}
    group2 = {"Benchmarks", "Custom Datasets"}

    def last_choice(group):
        # The most recently clicked option in *group*, or None.
        picked = [item for item in selected if item in group]
        return picked[-1] if picked else None

    keep1 = last_choice(group1)
    keep2 = last_choice(group2)
    # Fix: build a list preserving the user's click order. The previous
    # set-based version returned list(set(...)), which shuffled the
    # displayed checkbox order nondeterministically.
    final_selection = [
        item for item in selected
        if (item not in group1 or item == keep1)
        and (item not in group2 or item == keep2)
    ]
    show_api_fields = "API Models" in final_selection
    return (
        gr.update(value=final_selection),
        # Fix: gr.Row.update() was removed in Gradio 4; the generic
        # gr.update() applies to any output component on both 3.x and 4.x.
        gr.update(visible=show_api_fields),
    )
# ------------- Build the Gradio UI -------------
with gr.Blocks(title="EvalScope 全功能界面") as demo:
    # Tracks whether an evaluation subprocess is currently running.
    is_running = gr.State(value=False)
    with gr.Group():
        with gr.Row():
            # Input-source selection; the two pairs are made mutually
            # exclusive by enforce_input_exclusive_and_toggle_fields.
            input_choices = gr.CheckboxGroup(
                label="选择输入源",
                choices=["API Models", "Local Models",
                         "Benchmarks", "Custom Datasets"],
                interactive=True
            )
        # API endpoint fields; hidden until "API Models" is selected.
        with gr.Row(visible=False) as api_fields:
            api_url_input = gr.Textbox(
                label="API 地址",
                placeholder="https://api.example.com/v1/chat"
            )
            api_token_input = gr.Textbox(
                label="Token 密钥",
                type="password",
                placeholder="sk-xxx"
            )
        with gr.Row():
            with gr.Column():
                # Local EvalScope modules (not yet wired into run_eval).
                native_choices = gr.CheckboxGroup(
                    label="启用本地模块",
                    choices=["Model Adapter", "Data Adapter",
                             "Evaluator", "Perf Monitor"]
                )
            with gr.Column():
                # External eval backends (not yet wired into run_eval).
                other_choices = gr.CheckboxGroup(
                    label="启用外部后端",
                    choices=["OpenCompass", "VLMEvalKit",
                             "RAGAS", "MTEB/CMTEB"]
                )
        with gr.Row():
            # "Evaluation Report" triggers the visualization server.
            output_choices = gr.CheckboxGroup(
                label="输出形式",
                choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
            )
        run_button = gr.Button("Run Evaluation")
        output_text = gr.TextArea(
            label="执行结果",
            lines=20,
            interactive=False,
            show_copy_button=True
        )
    # Bind input mutual exclusion and API-field visibility toggling.
    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields]
    )
    # Bind Run/Stop: toggle_run streams (log, is_running, button-label)
    # tuples back into the three outputs below.
    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices, native_choices, other_choices,
            output_choices, api_url_input, api_token_input, is_running
        ],
        outputs=[output_text, is_running, run_button],
        show_progress=True
    )

if __name__ == "__main__":
    # Listen on all interfaces; port 7900 avoids Gradio's default 7860
    # and the visualization server on 7861.
    demo.launch(server_name="0.0.0.0", server_port=7900)

View File

@ -5,35 +5,44 @@ import threading
import subprocess import subprocess
import gradio as gr import gradio as gr
# 全局变量:当前子进程 # ---------------- 全局进程句柄 ----------------
current_process = None current_process = None
# ⬇️⬇️⬇️ 运行 EvalScope 并(可选)启动可视化服务 ⬇️⬇️⬇️ # ---------------- 核心运行函数 ----------------
def run_eval(inputs, native, other, output_choices, api_url, api_token): def run_eval(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override
):
""" """
1. 调用 `evalscope perf ` 跑基准测试 1. 动态拼装 evalscope perf 命令
2. 若用户勾选 Evaluation Report测试完成后后台启动 2. 流式打印日志
`evalscope app` Web 可视化服务并在文本框追加访问链接 3. 可选启动可视化报告
""" """
global current_process global current_process
timestamp = time.strftime("%Y%m%d-%H%M%S") timestamp = time.strftime("%Y%m%d-%H%M%S")
model_name = model_override.strip() or timestamp
command = [ command = [
"evalscope", "perf", "evalscope", "perf",
"--url", api_url.strip(), "--url", api_url.strip(),
"--api", "openai", "--api", api_provider,
"--model", timestamp, # 以时间戳当模型名,避免冲突 "--model", model_name,
"--dataset", "openqa", "--dataset", dataset,
"--max-tokens", "1024", "--max-tokens", str(int(max_tokens)),
"--min-tokens", "1024", "--min-tokens", str(int(min_tokens)),
"--parallel", "1", "--parallel", str(int(parallel_reqs)),
"--max-prompt-length", "15360", "--max-prompt-length", str(int(max_prompt_len)),
"--number", "100", "--number", str(int(num_requests)),
"--api-key", api_token.strip(), "--api-key", api_token.strip(),
] ]
full_output = f"[Eval Started @ {timestamp}]\n" full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
yield full_output, True, gr.update(value="Stop Evaluation") yield full_output, True, gr.update(value="Stop Evaluation")
try: try:
@ -42,7 +51,6 @@ def run_eval(inputs, native, other, output_choices, api_url, api_token):
text=True, bufsize=1 text=True, bufsize=1
) )
# 实时流式输出
for line in current_process.stdout: for line in current_process.stdout:
full_output += line full_output += line
yield full_output, True, gr.update(value="Stop Evaluation") yield full_output, True, gr.update(value="Stop Evaluation")
@ -59,19 +67,17 @@ def run_eval(inputs, native, other, output_choices, api_url, api_token):
full_output += "[Eval Finished]\n" full_output += "[Eval Finished]\n"
# ========== 可视化报告 ========== # ---------- 可视化报告 ----------
if "Evaluation Report" in output_choices: if "Evaluation Report" in output_choices:
vis_port = 7861 vis_port = 7861
outputs_root = "./outputs" outputs_root = "./outputs"
# ⬇️ EvalScope perf 会在 outputs_root 下生成 timestamp 目录
# 这里额外取最新目录备用(目前 UI 只需要根目录)
try: try:
latest_output = max( latest_output = max(
glob.glob(os.path.join(outputs_root, "*")), glob.glob(os.path.join(outputs_root, "*")),
key=os.path.getmtime key=os.path.getmtime
) )
except ValueError: except ValueError:
latest_output = outputs_root # 保险:若 outputs 还不存在 latest_output = outputs_root
vis_cmd = [ vis_cmd = [
"evalscope", "app", "evalscope", "app",
@ -79,8 +85,6 @@ def run_eval(inputs, native, other, output_choices, api_url, api_token):
"--server-name", "0.0.0.0", "--server-name", "0.0.0.0",
"--server-port", str(vis_port), "--server-port", str(vis_port),
] ]
# 后台线程启动,不阻塞 UI
threading.Thread( threading.Thread(
target=subprocess.Popen, target=subprocess.Popen,
args=(vis_cmd,), args=(vis_cmd,),
@ -94,7 +98,7 @@ def run_eval(inputs, native, other, output_choices, api_url, api_token):
yield full_output, False, gr.update(value="Run Evaluation") yield full_output, False, gr.update(value="Run Evaluation")
# ⬇️⬇️⬇️ 停止按钮逻辑 ⬇️⬇️⬇️ # ---------------- 停止函数 ----------------
def stop_eval(): def stop_eval():
global current_process global current_process
if current_process and current_process.poll() is None: if current_process and current_process.poll() is None:
@ -104,20 +108,31 @@ def stop_eval():
return "[No active process]\n" return "[No active process]\n"
# ⬇️⬇️⬇️ Run/Stop 控制器(必须是 generator ⬇️⬇️⬇️ # ---------------- Run/Stop 控制器 ----------------
def toggle_run(inputs, native, other, output_choices, def toggle_run(
api_url, api_token, is_running): inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override,
is_running
):
if not is_running: if not is_running:
# 开始跑 yield from run_eval(
yield from run_eval(inputs, native, other, inputs, native, other, output_choices,
output_choices, api_url, api_token) api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override
)
else: else:
# 用户点 Stop
msg = stop_eval() msg = stop_eval()
yield msg, False, gr.update(value="Run Evaluation") yield msg, False, gr.update(value="Run Evaluation")
# ⬇️⬇️⬇️ 互斥逻辑:同组保留最后一个选项 ⬇️⬇️⬇️ # ---------------- 互斥逻辑 ----------------
def enforce_input_exclusive_and_toggle_fields(selected): def enforce_input_exclusive_and_toggle_fields(selected):
group1 = {"API Models", "Local Models"} group1 = {"API Models", "Local Models"}
group2 = {"Benchmarks", "Custom Datasets"} group2 = {"Benchmarks", "Custom Datasets"}
@ -140,10 +155,11 @@ def enforce_input_exclusive_and_toggle_fields(selected):
) )
# ------------- 构建 Gradio UI ------------- # ---------------- 构建 Gradio UI ----------------
with gr.Blocks(title="EvalScope 全功能界面") as demo: with gr.Blocks(title="EvalScope 全功能界面") as demo:
is_running = gr.State(value=False) is_running = gr.State(value=False)
# ===== 输入源 =====
with gr.Group(): with gr.Group():
with gr.Row(): with gr.Row():
input_choices = gr.CheckboxGroup( input_choices = gr.CheckboxGroup(
@ -153,6 +169,7 @@ with gr.Blocks(title="EvalScope 全功能界面") as demo:
interactive=True interactive=True
) )
# ===== API 地址 & Token =====
with gr.Row(visible=False) as api_fields: with gr.Row(visible=False) as api_fields:
api_url_input = gr.Textbox( api_url_input = gr.Textbox(
label="API 地址", label="API 地址",
@ -164,6 +181,7 @@ with gr.Blocks(title="EvalScope 全功能界面") as demo:
placeholder="sk-xxx" placeholder="sk-xxx"
) )
# ===== 本地/外部组件 =====
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
native_choices = gr.CheckboxGroup( native_choices = gr.CheckboxGroup(
@ -178,12 +196,53 @@ with gr.Blocks(title="EvalScope 全功能界面") as demo:
"RAGAS", "MTEB/CMTEB"] "RAGAS", "MTEB/CMTEB"]
) )
# ===== 运行参数 =====
with gr.Accordion("运行参数(可选修改)", open=False):
with gr.Row(): with gr.Row():
api_provider_dropdown = gr.Dropdown(
label="API Provider (--api)",
choices=["openai", "azure", "ollama", "gemini"],
value="openai"
)
dataset_dropdown = gr.Dropdown(
label="评测数据集 (--dataset)",
choices=["openqa", "gsm8k", "mmlu", "truthfulqa"],
value="openqa"
)
model_override_input = gr.Textbox(
label="自定义模型名 (--model),留空则使用时间戳",
placeholder="e.g. my-llm-7b"
)
with gr.Row():
max_tokens_slider = gr.Slider(
label="Max Tokens (--max-tokens)",
minimum=256, maximum=8192, step=256, value=1024
)
min_tokens_slider = gr.Slider(
label="Min Tokens (--min-tokens)",
minimum=0, maximum=4096, step=64, value=1024
)
with gr.Row():
parallel_slider = gr.Slider(
label="并发请求数 (--parallel)",
minimum=1, maximum=16, step=1, value=1
)
num_req_slider = gr.Slider(
label="请求条数 (--number)",
minimum=1, maximum=1000, step=1, value=100
)
max_prompt_len_slider = gr.Slider(
label="最大 Prompt 长度 (--max-prompt-length)",
minimum=2048, maximum=32768, step=512, value=15360
)
# ===== 输出形式 =====
output_choices = gr.CheckboxGroup( output_choices = gr.CheckboxGroup(
label="输出形式", label="输出形式",
choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"] choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
) )
# ===== 控制按钮 & 日志 =====
run_button = gr.Button("Run Evaluation") run_button = gr.Button("Run Evaluation")
output_text = gr.TextArea( output_text = gr.TextArea(
label="执行结果", label="执行结果",
@ -192,19 +251,24 @@ with gr.Blocks(title="EvalScope 全功能界面") as demo:
show_copy_button=True show_copy_button=True
) )
# 绑定输入互斥 # ===== 绑定事件 =====
input_choices.change( input_choices.change(
fn=enforce_input_exclusive_and_toggle_fields, fn=enforce_input_exclusive_and_toggle_fields,
inputs=input_choices, inputs=input_choices,
outputs=[input_choices, api_fields] outputs=[input_choices, api_fields]
) )
# 绑定 Run/Stop
run_button.click( run_button.click(
fn=toggle_run, fn=toggle_run,
inputs=[ inputs=[
input_choices, native_choices, other_choices, input_choices, native_choices, other_choices,
output_choices, api_url_input, api_token_input, is_running output_choices,
api_url_input, api_token_input,
api_provider_dropdown, dataset_dropdown,
max_tokens_slider, min_tokens_slider, parallel_slider,
max_prompt_len_slider, num_req_slider,
model_override_input,
is_running
], ],
outputs=[output_text, is_running, run_button], outputs=[output_text, is_running, run_button],
show_progress=True show_progress=True