# evalscope_v0.17.0/gradio_ui.py
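"""Gradio front end for EvalScope.

Builds a single-page UI that assembles and launches `evalscope perf` /
`evalscope eval` subprocesses, streams their output into a text area, and
optionally starts the `evalscope app` visualization server on port 7901.
"""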

import time
import os
import glob
import threading
import subprocess
import gradio as gr
import psutil
import signal
# ---------------- Global process handle ----------------
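# should_stop is a cooperative flag: stop_eval() sets it and the
# output-streaming loops in run_perf / run_eval_tool check it.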
current_process = None
should_stop = False
# ---------------- Available datasets ----------------
EVAL_DATASETS = [
    "arc", "bbh", "ceval", "cmmlu", "competition_math", "gsm8k",
    "hellaswag", "humaneval", "mmlu", "mmlu_pro", "race",
    "trivia_qa", "truthful_qa"
]
PERF_DATASETS = ["openqa", "flickr8k", "longalpaca", "random_dataset", "line_by_line", "custom", "speed_benchmark"]
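# The dataset dropdown switches between these two lists when the run mode
# changes (see mode_dropdown.change below).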
# ---------------- perf mode runner ----------------
def run_perf(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override
):
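    """Build and stream an `evalscope perf` run.

    Yields (log_text, is_running, button_update) tuples so Gradio can refresh
    the output area and the Run/Stop button while the subprocess is alive.
    """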
    global current_process
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp
    command = [
        "evalscope", "perf",
        "--url", api_url.strip(),
        "--api", api_provider,
        "--model", model_name,
        "--dataset", dataset,
        "--max-tokens", str(int(max_tokens)),
        "--min-tokens", str(int(min_tokens)),
        "--parallel", str(int(parallel_reqs)),
        "--max-prompt-length", str(int(max_prompt_len)),
        "--number", str(int(num_requests)),
        "--api-key", api_token.strip(),
    ]
    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(value="Stop Evaluation")
    try:
        current_process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1, start_new_session=True
        )
        for line in current_process.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")
        current_process.stdout.close()
        current_process.wait()
    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")
    finally:
        current_process = None
    full_output += "[Eval Finished]\n"
    if "Evaluation Report" in output_choices:
        vis_port = 7901
        outputs_root = "./outputs"
        try:
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:
            latest_output = outputs_root
        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
            daemon=True
        ).start()
        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
    yield full_output, False, gr.update(value="Run Evaluation")
# ---------------- eval mode runner ----------------
def run_eval_tool(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override
):
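    """Build and stream an `evalscope eval` run.

    Shares run_perf's signature so toggle_run can dispatch to either runner
    with the same argument list; parameters that only apply to perf mode are
    accepted but unused here.
    """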
    global current_process
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp
    command = [
        "evalscope", "eval",
        "--model", model_name,
        "--datasets", dataset
    ]
    if api_url.strip():
        command += [
            "--eval-type", "service",
            "--api-url", api_url.strip(),
            "--api-key", api_token.strip()
        ]
    if num_requests:
        command += ["--limit", str(int(num_requests))]
    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(value="Stop Evaluation")
    try:
        current_process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1, start_new_session=True
        )
        for line in current_process.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")
        current_process.stdout.close()
        current_process.wait()
    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")
    finally:
        current_process = None
    full_output += "[Eval Finished]\n"
    if "Evaluation Report" in output_choices:
        vis_port = 7901
        outputs_root = "./outputs"
        try:
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:
            latest_output = outputs_root
        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
            daemon=True
        ).start()
        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
    yield full_output, False, gr.update(value="Run Evaluation")
# ---------------- Stop handler ----------------
def stop_eval():
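    """Terminate the running evalscope process group, if any.

    Sends SIGINT to the process group, waits two seconds, and escalates to
    SIGKILL if the process is still alive. Returns a status message for the
    output area.
    """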
    global current_process, should_stop
    should_stop = True
    if current_process and current_process.poll() is None:
        try:
            pgid = os.getpgid(current_process.pid)
            os.killpg(pgid, signal.SIGINT)
            time.sleep(2)
            if current_process.poll() is None:
                os.killpg(pgid, signal.SIGKILL)
            return "[✅ Termination signal sent (SIGINT → SIGKILL fallback)]\n"
        except Exception as e:
            return f"[❌ Failed to terminate: {e}]\n"
        finally:
            current_process = None
    else:
        return "[⚠️ No active evalscope process]\n"
# ---------------- Controller ----------------
def toggle_run(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override,
    is_running,
    run_mode
):
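    """Handle the Run/Stop button.

    Starts the runner matching run_mode when idle, or asks stop_eval() to
    terminate the active run when one is already in progress.
    """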
    global should_stop
    if not inputs:
        msg = "[❌ Error] Select at least one input source (API, Local, Benchmark, or Custom) before running.\n"
        yield msg, False, gr.update(value="Run Evaluation")
        return
    if not is_running:
        should_stop = False
        if run_mode == "perf":
            yield from run_perf(
                inputs, native, other, output_choices,
                api_url, api_token,
                api_provider, dataset,
                max_tokens, min_tokens, parallel_reqs,
                max_prompt_len, num_requests,
                model_override
            )
        elif run_mode == "eval":
            yield from run_eval_tool(
                inputs, native, other, output_choices,
                api_url, api_token,
                api_provider, dataset,
                max_tokens, min_tokens, parallel_reqs,
                max_prompt_len, num_requests,
                model_override
            )
        elif run_mode == "app":
            yield "[⚠️ app mode: open http://localhost:7901 manually to view the report]", False, gr.update(value="Run Evaluation")
    else:
        msg = stop_eval()
        yield msg, False, gr.update(value="Run Evaluation")
# ---------------- Input-source mutual-exclusion logic ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
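    """Keep at most one selection per group (model source vs. dataset source)
    and show the API credential fields only when "API Models" is selected.
    """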
    order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"]
    group1 = {"API Models", "Local Models"}
    group2 = {"Benchmarks", "Custom Datasets"}

    def keep_only_one(group):
        filtered = [item for item in selected if item in group]
        return filtered[-1:]

    final_sel = set(selected)
    final_sel -= group1
    final_sel |= set(keep_only_one(group1))
    final_sel -= group2
    final_sel |= set(keep_only_one(group2))
    final_list = [itm for itm in order if itm in final_sel]
    input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list)
    api_field_update = gr.update(visible="API Models" in final_sel)
    return input_update, api_field_update
# ---------------- UI construction ----------------
with gr.Blocks(title="EvalScope Full-Featured UI") as demo:
    is_running = gr.State(value=False)
    with gr.Group():
        with gr.Row():
            mode_dropdown = gr.Dropdown(
                label="Evaluation mode",
                choices=["eval", "perf", "app"],
                value="perf",
                info="eval: capability evaluation; perf: performance evaluation; app: visualization"
            )
    with gr.Group():
        with gr.Row():
            input_choices = gr.CheckboxGroup(
                label="Select input sources",
                choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
                interactive=True
            )
    with gr.Column(visible=False) as api_fields:
        api_url_input = gr.Textbox(label="API URL", placeholder="https://.../v1/chat/completions")
        api_token_input = gr.Textbox(label="API token", type="password", placeholder="sk-xxx")
    with gr.Accordion("Run parameters (optional)", open=False):
        with gr.Row():
            api_provider_dropdown = gr.Dropdown(label="API Provider", choices=["openai", "azure", "ollama", "gemini"], value="openai")
            dataset_dropdown = gr.Dropdown(label="Evaluation dataset (--dataset)", choices=PERF_DATASETS, value=PERF_DATASETS[0])
            model_override_input = gr.Textbox(label="Custom model name (--model)", placeholder="my-llm")
        with gr.Row():
            max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024)
            min_tokens_slider = gr.Slider(label="Min Tokens", minimum=0, maximum=4096, step=64, value=1024)
        with gr.Row():
            parallel_slider = gr.Slider(label="Parallel requests", minimum=1, maximum=16, step=1, value=1)
            num_req_slider = gr.Slider(label="Number of requests", minimum=1, maximum=1000, step=1, value=100)
            max_prompt_len_slider = gr.Slider(label="Max prompt length", minimum=2048, maximum=32768, step=512, value=15360)
    with gr.Row():
        with gr.Column():
            native_choices = gr.CheckboxGroup(label="Enable native modules", choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"])
        with gr.Column():
            other_choices = gr.CheckboxGroup(label="Enable external backends", choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"])
    output_choices = gr.CheckboxGroup(label="Output formats", choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"])
    run_button = gr.Button("Run Evaluation")
    output_text = gr.TextArea(label="Run output", lines=20, interactive=False, show_copy_button=True)
    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields]
    )
    mode_dropdown.change(
        lambda mode: gr.update(
            choices=EVAL_DATASETS if mode == "eval" else PERF_DATASETS,
            value=EVAL_DATASETS[0] if mode == "eval" else PERF_DATASETS[0]
        ),
        inputs=mode_dropdown,
        outputs=dataset_dropdown
    )
    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices, native_choices, other_choices,
            output_choices,
            api_url_input, api_token_input,
            api_provider_dropdown, dataset_dropdown,
            max_tokens_slider, min_tokens_slider, parallel_slider,
            max_prompt_len_slider, num_req_slider,
            model_override_input,
            is_running,
            mode_dropdown
        ],
        outputs=[output_text, is_running, run_button],
        show_progress=True
    )
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7900)