evalscope_v0.17.0/gradio_ui.ok.py

215 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import time
import os
import glob
import threading
import subprocess
import gradio as gr
# Module-level handle to the currently running evaluation subprocess.
# It is None while no evaluation is in flight; run_eval() assigns it and
# stop_eval() reads it to terminate the run.
current_process = None
# ⬇️⬇️⬇️ Run EvalScope and (optionally) start the visualization service ⬇️⬇️⬇️
def run_eval(inputs, native, other, output_choices, api_url, api_token):
    """Run ``evalscope perf`` and stream its console output to the UI.

    This is a generator. Every yielded item is a
    ``(log_text, is_running, button_update)`` tuple.

    1. Launch ``evalscope perf ...`` against ``api_url`` and yield the
       accumulated log after every line the child process prints.
    2. If "Evaluation Report" is in ``output_choices``, start the
       ``evalscope app`` web visualization in the background once the run
       has ended and append its URL to the log.

    Args:
        inputs: Input-source selection from the UI (not used here).
        native: Native-module selection from the UI (not used here).
        other: External-backend selection from the UI (not used here).
        output_choices: Selected output forms; only "Evaluation Report"
            is acted upon.
        api_url: Endpoint passed to ``--url`` after stripping whitespace.
        api_token: Key passed to ``--api-key`` after stripping whitespace.

    Yields:
        tuple: The full log so far, whether a run is still in progress, and
        a ``gr.update`` carrying the button label for that state.
    """
    global current_process

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    command = [
        "evalscope", "perf",
        "--url", api_url.strip(),
        "--api", "openai",
        "--model", timestamp,  # use the timestamp as model name to avoid clashes
        "--dataset", "openqa",
        "--max-tokens", "1024",
        "--min-tokens", "1024",
        "--parallel", "1",
        "--max-prompt-length", "15360",
        "--number", "100",
        "--api-key", api_token.strip(),
    ]

    full_output = f"[Eval Started @ {timestamp}]\n"
    yield full_output, True, gr.update(value="Stop Evaluation")

    # Work through a local handle. The module-level ``current_process`` can
    # be reset by stop_eval() while this generator is still draining the
    # pipe, and dereferencing the global at that point would fail on None.
    proc = None
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1,
        )
        current_process = proc

        # Stream the child's output line by line as it arrives.
        for line in proc.stdout:
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")

        proc.stdout.close()
        proc.wait()
    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")
    finally:
        # Reached with a live child only on an error or when the generator
        # is closed early; do not leave the benchmark running unattended.
        if proc is not None and proc.poll() is None:
            proc.terminate()
        # Only clear the global if it still refers to *this* run.
        if current_process is proc:
            current_process = None

    full_output += "[Eval Finished]\n"

    # ========== Visualization report ==========
    if "Evaluation Report" in output_choices:
        vis_port = 7861
        outputs_root = "./outputs"
        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        # Spawn from a daemon thread so a launch failure cannot break the
        # generator that is feeding the UI.
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL,
                    "stderr": subprocess.STDOUT},
            daemon=True,
        ).start()
        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")
# ⬇️⬇️⬇️ Stop-button logic ⬇️⬇️⬇️
def stop_eval():
    """Ask the running evaluation subprocess, if any, to terminate.

    Returns:
        str: The "[Stopped by user]" message when a live process was
        signalled, otherwise the "[No active process]" message (each
        followed by a newline).
    """
    proc = current_process
    if proc is not None and proc.poll() is None:
        proc.terminate()
        # Do not reset ``current_process`` here: run_eval() is still reading
        # the child's stdout through that global and clears it itself in its
        # ``finally`` clause. Resetting it from this side made run_eval()
        # fail with an AttributeError on None right after a stop.
        return "[Stopped by user]\n"
    return "[No active process]\n"
# ⬇️⬇️⬇️ Run/Stop controller (must be a generator) ⬇️⬇️⬇️
def toggle_run(inputs, native, other, output_choices,
               api_url, api_token, is_running):
    """Dispatch a click on the Run/Stop button.

    Yields ``(log_text, is_running, button_update)`` tuples. When
    ``is_running`` is truthy the click is treated as "Stop"; otherwise a
    new evaluation is started and everything ``run_eval`` yields is relayed.
    """
    if is_running:
        # A run is in flight, so this click means "Stop".
        stop_message = stop_eval()
        yield stop_message, False, gr.update(value="Run Evaluation")
        return

    # Idle: start a run and forward its stream unchanged.
    yield from run_eval(inputs, native, other,
                        output_choices, api_url, api_token)
# ⬇️⬇️⬇️ Mutual exclusion: keep only the last option within each group ⬇️⬇️⬇️
def enforce_input_exclusive_and_toggle_fields(selected):
    """Make the input-source checkboxes behave like two radio groups.

    Within each of the groups {"API Models", "Local Models"} and
    {"Benchmarks", "Custom Datasets"} only the entry that appears last in
    ``selected`` is kept. Entries outside both groups pass through.

    Args:
        selected: Currently ticked values of the checkbox group; ``None`` is
            treated as an empty selection.

    Returns:
        tuple: An update carrying the pruned selection for the checkbox
        group, and an update that shows the API field row only while
        "API Models" is selected.
    """
    exclusive_groups = (
        {"API Models", "Local Models"},
        {"Benchmarks", "Custom Datasets"},
    )

    # De-duplicate but keep the incoming order: the "last one wins" rule
    # depends on it, and a round-trip through ``set`` would discard it.
    selected = list(dict.fromkeys(selected or []))

    grouped = set()    # every value that belongs to some exclusive group
    survivors = set()  # the last-listed member of each group
    for group in exclusive_groups:
        grouped |= group
        members = [item for item in selected if item in group]
        survivors.update(members[-1:])

    final_selection = [
        item for item in selected
        if item not in grouped or item in survivors
    ]
    show_api_fields = "API Models" in final_selection

    # ``gr.update`` is used for the row as well: the per-component
    # ``gr.Row.update`` class method is not available on Gradio 4 and later.
    return (
        gr.update(value=final_selection),
        gr.update(visible=show_api_fields),
    )
# ------------- Build the Gradio UI -------------
# NOTE(review): the pasted source had lost all indentation. The nesting below
# (in particular which rows sit inside the gr.Group) is reconstructed and
# should be confirmed against the original layout.
with gr.Blocks(title="EvalScope 全功能界面") as demo:
    # Per-session flag: True while an evaluation is running. toggle_run reads
    # it to decide whether a button click means "Run" or "Stop".
    is_running = gr.State(value=False)
    with gr.Group():
        # Input-source selection.
        with gr.Row():
            input_choices = gr.CheckboxGroup(
                label="选择输入源",
                choices=["API Models", "Local Models",
                         "Benchmarks", "Custom Datasets"],
                interactive=True
            )
        # API endpoint and token; hidden until "API Models" is ticked.
        with gr.Row(visible=False) as api_fields:
            api_url_input = gr.Textbox(
                label="API 地址",
                placeholder="https://api.example.com/v1/chat"
            )
            api_token_input = gr.Textbox(
                label="Token 密钥",
                type="password",
                placeholder="sk-xxx"
            )
        # Native modules (left column) and external backends (right column).
        with gr.Row():
            with gr.Column():
                native_choices = gr.CheckboxGroup(
                    label="启用本地模块",
                    choices=["Model Adapter", "Data Adapter",
                             "Evaluator", "Perf Monitor"]
                )
            with gr.Column():
                other_choices = gr.CheckboxGroup(
                    label="启用外部后端",
                    choices=["OpenCompass", "VLMEvalKit",
                             "RAGAS", "MTEB/CMTEB"]
                )
        # Output forms; run_eval only acts on "Evaluation Report".
        with gr.Row():
            output_choices = gr.CheckboxGroup(
                label="输出形式",
                choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
            )
        # One button serves as both Run and Stop; its label is swapped by the
        # updates that toggle_run yields.
        run_button = gr.Button("Run Evaluation")
        output_text = gr.TextArea(
            label="执行结果",
            lines=20,
            interactive=False,
            show_copy_button=True
        )
    # Wire up the mutual-exclusion rule for the input checkboxes.
    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields]
    )
    # Wire up Run/Stop.
    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices, native_choices, other_choices,
            output_choices, api_url_input, api_token_input, is_running
        ],
        outputs=[output_text, is_running, run_button],
        show_progress=True
    )
if __name__ == "__main__":
    # Serve the UI on every network interface, port 7900.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7900}
    demo.launch(**launch_options)