# evalscope_v0.17.0/gradio_ui_old.py

import time
import os
import glob
import threading
import subprocess
import gradio as gr
import psutil
import signal
# ---------------- Global process state ----------------
current_process = None   # handle of the evalscope subprocess currently running, if any
should_stop = False      # set by stop_eval() so the output-streaming loops exit early
# ---------------- Core run functions ----------------
def run_perf(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override
):
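    """Run `evalscope perf` as a subprocess and stream its output to the UI.

    Yields (log_text, is_running, button_update) tuples so Gradio can refresh the
    log area, the running flag, and the Run/Stop button label as lines arrive.
    """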
global current_process
timestamp = time.strftime("%Y%m%d-%H%M%S")
model_name = model_override.strip() or timestamp
command = [
"evalscope", "perf",
"--url", api_url.strip(),
"--api", api_provider,
"--model", model_name,
"--dataset", dataset,
"--max-tokens", str(int(max_tokens)),
"--min-tokens", str(int(min_tokens)),
"--parallel", str(int(parallel_reqs)),
"--max-prompt-length", str(int(max_prompt_len)),
"--number", str(int(num_requests)),
"--api-key", api_token.strip(),
]
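    # Illustrative example of the assembled command (actual values come from the UI inputs):
    #   evalscope perf --url <api_url> --api openai --model my-llm-7b --dataset openqa \
    #     --max-tokens 1024 --min-tokens 1024 --parallel 1 --max-prompt-length 15360 \
    #     --number 100 --api-key <token>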
full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
yield full_output, True, gr.update(value="Stop Evaluation")
try:
current_process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
text=True, bufsize=1, start_new_session=True
)
for line in current_process.stdout:
if should_stop:
break
full_output += line
yield full_output, True, gr.update(value="Stop Evaluation")
current_process.stdout.close()
current_process.wait()
    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")
        return  # skip the completion message and viewer launch after a failure
finally:
current_process = None
full_output += "[Eval Finished]\n"
if "Evaluation Report" in output_choices:
vis_port = 7901
outputs_root = "./outputs"
try:
latest_output = max(
glob.glob(os.path.join(outputs_root, "*")),
key=os.path.getmtime
)
except ValueError:
latest_output = outputs_root
vis_cmd = [
"evalscope", "app",
"--outputs", outputs_root,
"--server-name", "0.0.0.0",
"--server-port", str(vis_port),
]
threading.Thread(
target=subprocess.Popen,
args=(vis_cmd,),
kwargs={"stdout": subprocess.DEVNULL,
"stderr": subprocess.STDOUT},
daemon=True
).start()
full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
yield full_output, False, gr.update(value="Run Evaluation")
# ---------------- Stop function ----------------
def stop_eval():
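    """Stop the running evalscope subprocess.

    Sends SIGINT to the whole process group (created via start_new_session=True),
    waits two seconds, then falls back to SIGKILL if the process is still alive.
    """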
global current_process, should_stop
should_stop = True
if current_process and current_process.poll() is None:
try:
pgid = os.getpgid(current_process.pid)
            os.killpg(pgid, signal.SIGINT)  # graceful stop for the whole process group
time.sleep(2)
if current_process.poll() is None:
                os.killpg(pgid, signal.SIGKILL)  # force kill if SIGINT was ignored
            return "[✅ Termination signal sent (SIGINT → SIGKILL fallback)]\n"
except Exception as e:
return f"[❌ 终止失败: {e}]\n"
finally:
current_process = None
else:
return "[⚠️ 无活动 evalscope 进程]\n"
# ---------------- Run/Stop controller ----------------
def toggle_run(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override,
is_running,
    run_mode  # selected mode from the dropdown: "eval", "perf", or "app"
):
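    """Single handler for the Run/Stop button.

    When idle, dispatches to run_perf or run_eval_tool according to run_mode;
    when a run is in progress, asks stop_eval() to terminate it. Note that the
    "app" choice of the mode dropdown is not dispatched here.
    """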
global should_stop
if not inputs:
msg = "[❌ 错误] 必须至少选择一个输入源API、本地、基准或自定义才能开始运行。\n"
yield msg, False, gr.update(value="Run Evaluation")
return
if not is_running:
should_stop = False
if run_mode == "perf":
yield from run_perf(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override
)
elif run_mode == "eval":
yield from run_eval_tool(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override
)
else:
msg = stop_eval()
yield msg, False, gr.update(value="Run Evaluation")
# ---------------- Input mutual-exclusion logic ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
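    """Enforce at most one choice per input-source group and toggle the API fields.

    From {"API Models", "Local Models"} and from {"Benchmarks", "Custom Datasets"},
    only the last entry of the current selection is kept. The API URL/token column
    is made visible only when "API Models" remains selected.
    """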
order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"]
group1 = {"API Models", "Local Models"}
group2 = {"Benchmarks", "Custom Datasets"}
def keep_only_one(group):
filtered = [item for item in selected if item in group]
return filtered[-1:]
final_sel = set(selected)
final_sel -= group1
final_sel |= set(keep_only_one(group1))
final_sel -= group2
final_sel |= set(keep_only_one(group2))
final_list = [itm for itm in order if itm in final_sel]
input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list)
show_api_fields = "API Models" in final_sel
    api_field_update = gr.update(visible=show_api_fields)  # show the API fields only when "API Models" is selected
return input_update, api_field_update
def run_eval_tool(
inputs, native, other, output_choices,
api_url, api_token,
api_provider, dataset,
max_tokens, min_tokens, parallel_reqs,
max_prompt_len, num_requests,
model_override
):
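    """Run `evalscope eval` as a subprocess and stream its output to the UI.

    Mirrors run_perf, but builds an `evalscope eval` command; when an API URL is
    given, the evaluation targets the remote service (--eval-type service).
    """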
global current_process
timestamp = time.strftime("%Y%m%d-%H%M%S")
model_name = model_override.strip() or timestamp
command = [
"evalscope", "eval",
"--model", model_name,
"--datasets", dataset
]
if api_url.strip():
command += [
"--eval-type", "service",
"--api-url", api_url.strip(),
"--api-key", api_token.strip()
]
if num_requests:
command += ["--limit", str(int(num_requests))]
full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
yield full_output, True, gr.update(value="Stop Evaluation")
try:
current_process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
text=True, bufsize=1, start_new_session=True
)
for line in current_process.stdout:
if should_stop:
break
full_output += line
yield full_output, True, gr.update(value="Stop Evaluation")
current_process.stdout.close()
current_process.wait()
    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")
        return  # skip the completion message and viewer launch after a failure
finally:
current_process = None
full_output += "[Eval Finished]\n"
if "Evaluation Report" in output_choices:
vis_port = 7901
outputs_root = "./outputs"
try:
latest_output = max(
glob.glob(os.path.join(outputs_root, "*")),
key=os.path.getmtime
)
except ValueError:
latest_output = outputs_root
vis_cmd = [
"evalscope", "app",
"--outputs", outputs_root,
"--server-name", "0.0.0.0",
"--server-port", str(vis_port),
]
threading.Thread(
target=subprocess.Popen,
args=(vis_cmd,),
kwargs={"stdout": subprocess.DEVNULL,
"stderr": subprocess.STDOUT},
daemon=True
).start()
full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
yield full_output, False, gr.update(value="Run Evaluation")
# ---------------- Build the Gradio UI ----------------
with gr.Blocks(title="EvalScope All-in-One UI") as demo:
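    # Tracks whether an evalscope subprocess is currently running; drives the Run/Stop button label.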
is_running = gr.State(value=False)
with gr.Group():
with gr.Row():
mode_dropdown = gr.Dropdown(
label="评测类型",
info="eval: 智力评测perf: 推理性能app: Web 可视化",
choices=["eval", "perf", "app"],
value="perf"
)
    # ===== Input sources =====
with gr.Group():
with gr.Row():
input_choices = gr.CheckboxGroup(
label="选择输入源",
choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
interactive=True
)
    # ===== API endpoint & run parameters (shown/hidden together) =====
with gr.Column(visible=False) as api_fields:
api_url_input = gr.Textbox(
label="API 地址",
placeholder="https://ai.aiszaiai.com/v1/chat/completions"
)
api_token_input = gr.Textbox(
label="Token 密钥",
type="password",
placeholder="sk-xxx"
)
with gr.Accordion("运行参数(可选修改)", open=False):
with gr.Row():
api_provider_dropdown = gr.Dropdown(
label="API Provider (--api)",
choices=["openai", "azure", "ollama", "gemini"],
value="openai"
)
dataset_dropdown = gr.Dropdown(
label="评测数据集 (--dataset)",
choices=["openqa", "flickr8k", "longalpaca", "random_dataset", "line_by_line", "custom", "speed_benchmark"],
value="openqa"
)
model_override_input = gr.Textbox(
label="自定义模型名 (--model),留空则使用时间戳",
placeholder="e.g. my-llm-7b"
)
with gr.Row():
max_tokens_slider = gr.Slider(
label="Max Tokens (--max-tokens)",
minimum=256, maximum=8192, step=256, value=1024
)
min_tokens_slider = gr.Slider(
label="Min Tokens (--min-tokens)",
minimum=0, maximum=4096, step=64, value=1024
)
with gr.Row():
parallel_slider = gr.Slider(
label="并发请求数 (--parallel)",
minimum=1, maximum=16, step=1, value=1
)
num_req_slider = gr.Slider(
label="请求条数 (--number)",
minimum=1, maximum=1000, step=1, value=100
)
max_prompt_len_slider = gr.Slider(
label="最大 Prompt 长度 (--max-prompt-length)",
minimum=2048, maximum=32768, step=512, value=15360
)
    # ===== Local / external components =====
with gr.Row():
with gr.Column():
native_choices = gr.CheckboxGroup(
label="启用本地模块",
choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"]
)
with gr.Column():
other_choices = gr.CheckboxGroup(
label="启用外部后端",
choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"]
)
    # ===== Output formats =====
output_choices = gr.CheckboxGroup(
label="输出形式",
choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
)
    # ===== Control button & log =====
run_button = gr.Button("Run Evaluation")
output_text = gr.TextArea(
label="执行结果",
lines=20,
interactive=False,
show_copy_button=True
)
    # ===== Event bindings =====
input_choices.change(
fn=enforce_input_exclusive_and_toggle_fields,
inputs=input_choices,
        outputs=[input_choices, api_fields]  # only the checkbox group and the API field column are updated
)
run_button.click(
fn=toggle_run,
inputs=[
input_choices, native_choices, other_choices,
output_choices,
api_url_input, api_token_input,
api_provider_dropdown, dataset_dropdown,
max_tokens_slider, min_tokens_slider, parallel_slider,
max_prompt_len_slider, num_req_slider,
model_override_input,
is_running,
            mode_dropdown  # selected run mode ("eval" / "perf" / "app")
],
outputs=[output_text, is_running, run_button],
show_progress=True
)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7900)
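    # Usage note: run this file directly and open http://<host>:7900 in a browser;
    # when "Evaluation Report" output is selected, the result viewer is served
    # separately on port 7901.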