# Listing metadata carried over from the file viewer: 279 lines, 8.5 KiB, Python.
import glob
import os
import subprocess
import threading
import time

import gradio as gr


# ---------------- Global process handle ----------------
current_process = None


# ---------------- Core run function ----------------
def run_eval(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override
):
    """Run ``evalscope perf`` as a subprocess and stream its log to the UI.

    Steps:
      1. Assemble the ``evalscope perf`` command from the form values.
      2. Stream the subprocess output line by line.
      3. Optionally start the ``evalscope app`` report viewer.

    Args:
        inputs: Input-source checkbox selection. Accepted so the signature
            matches the event wiring; not used when building the command.
        native: Local-module checkbox selection; not used here.
        other: External-backend checkbox selection; not used here.
        output_choices: Selected output forms. The report viewer is started
            when it contains ``"Evaluation Report"``.
        api_url: Value for ``--url`` (surrounding whitespace is stripped).
        api_token: Value for ``--api-key`` (surrounding whitespace is
            stripped). It is masked in the echoed command line.
        api_provider: Value for ``--api``.
        dataset: Value for ``--dataset``.
        max_tokens: Value for ``--max-tokens`` (coerced to ``int``).
        min_tokens: Value for ``--min-tokens`` (coerced to ``int``).
        parallel_reqs: Value for ``--parallel`` (coerced to ``int``).
        max_prompt_len: Value for ``--max-prompt-length`` (coerced to ``int``).
        num_requests: Value for ``--number`` (coerced to ``int``).
        model_override: Value for ``--model``; when blank, the start
            timestamp is used instead.

    Yields:
        ``(log_text, is_running, button_update)`` tuples: the accumulated
        log, the new running flag, and a ``gr.update`` that relabels the
        Run/Stop button.
    """
    global current_process

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = (model_override or "").strip() or timestamp
    api_key = (api_token or "").strip()

    command = [
        "evalscope", "perf",
        "--url", (api_url or "").strip(),
        "--api", api_provider,
        "--model", model_name,
        "--dataset", dataset,
        "--max-tokens", str(int(max_tokens)),
        "--min-tokens", str(int(min_tokens)),
        "--parallel", str(int(parallel_reqs)),
        "--max-prompt-length", str(int(max_prompt_len)),
        "--number", str(int(num_requests)),
        "--api-key",
    ]
    # The echoed command goes into the UI log, so the secret is masked there;
    # only the real argv receives the actual key.
    shown_command = command + ["****" if api_key else ""]
    command.append(api_key)

    full_output = (
        f"[Eval Started @ {timestamp}]\nCmd: {' '.join(shown_command)}\n"
    )
    yield full_output, True, gr.update(value="Stop Evaluation")

    # Work on a local handle: stop_eval() may rebind the global while this
    # generator is still reading from the pipe.
    proc = None
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1
        )
        current_process = proc

        for line in proc.stdout:
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")

        return_code = proc.wait()
        if return_code != 0:
            full_output += f"[Exit code {return_code}]\n"

    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")

    finally:
        if proc is not None:
            # Reached with a live child when the generator is closed early
            # or an error interrupted streaming: do not leave it running.
            if proc.poll() is None:
                proc.terminate()
                try:
                    proc.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    proc.kill()
                    proc.wait()
            if proc.stdout is not None:
                proc.stdout.close()
        # Only clear the shared handle if it still refers to this run.
        if current_process is proc:
            current_process = None

    full_output += "[Eval Finished]\n"

    # ---------- Visualization report ----------
    if "Evaluation Report" in (output_choices or ()):
        vis_port = 7861
        outputs_root = "./outputs"
        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        # NOTE(review): every run starts another viewer on the same port;
        # later instances presumably fail to bind -- confirm and track it.
        try:
            # Popen returns immediately, so no helper thread is needed.
            subprocess.Popen(
                vis_cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.STDOUT,
            )
        except OSError as e:
            full_output += f"[Visualization Error] {e}\n"
        else:
            full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")


# ---------------- Stop function ----------------
def stop_eval():
    """Ask the running evaluation subprocess, if any, to terminate.

    The module-level ``current_process`` handle is read once into a local so
    the check and the ``terminate()`` call act on the same object. The handle
    is deliberately not cleared here: ``run_eval`` is still reading from the
    process and resets the handle itself once the process has ended.

    Returns:
        ``"[Stopped by user]\\n"`` when a live process was signalled,
        otherwise ``"[No active process]\\n"``.
    """
    proc = current_process  # single snapshot of the shared handle
    if proc is None or proc.poll() is not None:
        return "[No active process]\n"
    proc.terminate()
    return "[Stopped by user]\n"


# ---------------- Run/Stop controller ----------------
def toggle_run(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override,
    is_running
):
    """Dispatch a click on the Run/Stop button.

    When ``is_running`` is truthy the click means "stop": ``stop_eval`` is
    called and its message is yielded together with the idle button label.
    Otherwise every form value is forwarded to ``run_eval`` and its updates
    are relayed unchanged.

    Yields:
        ``(log_text, is_running, button_update)`` tuples.
    """
    if is_running:
        # Stop request: report the outcome and switch the button back.
        yield stop_eval(), False, gr.update(value="Run Evaluation")
        return

    yield from run_eval(
        inputs, native, other, output_choices,
        api_url, api_token,
        api_provider, dataset,
        max_tokens, min_tokens, parallel_reqs,
        max_prompt_len, num_requests,
        model_override
    )


# ---------------- Mutual-exclusion logic ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
    """Keep at most one choice per exclusive group and toggle the API row.

    Two groups are mutually exclusive internally: model sources
    (``"API Models"`` / ``"Local Models"``) and data sources
    (``"Benchmarks"`` / ``"Custom Datasets"``). Within a group, the item
    that appears last in ``selected`` is kept. The order of the surviving
    items is preserved, so the "last one wins" rule stays meaningful when
    the returned value is fed back into the next change event.

    Args:
        selected: Currently checked values; ``None`` is treated as empty.

    Returns:
        A pair of updates: the corrected checkbox value, and the visibility
        of the API fields row (shown only when ``"API Models"`` is kept).
    """
    exclusive_groups = (
        {"API Models", "Local Models"},
        {"Benchmarks", "Custom Datasets"},
    )

    # De-duplicate while keeping the incoming order.
    remaining = list(dict.fromkeys(selected or []))

    for group in exclusive_groups:
        members = [item for item in remaining if item in group]
        if len(members) > 1:
            keeper = members[-1]
            remaining = [
                item for item in remaining
                if item not in group or item == keeper
            ]

    show_api_fields = "API Models" in remaining
    return (
        gr.update(value=remaining),
        # Generic gr.update, matching the rest of the file, instead of the
        # per-component Row.update helper.
        gr.update(visible=show_api_fields),
    )


# ---------------- Build the Gradio UI ----------------
# NOTE(review): the listing this was recovered from had lost its indentation;
# the nesting below is reconstructed from the context-manager structure and
# section comments -- confirm against the rendered layout.
with gr.Blocks(title="EvalScope 全功能界面") as demo:
    # Per-session flag fed to and updated by the Run/Stop handler.
    is_running = gr.State(value=False)

    # ===== Input sources =====
    with gr.Group():
        with gr.Row():
            input_choices = gr.CheckboxGroup(
                choices=[
                    "API Models",
                    "Local Models",
                    "Benchmarks",
                    "Custom Datasets",
                ],
                label="选择输入源",
                interactive=True,
            )

    # ===== API URL & token (row starts hidden) =====
    with gr.Row(visible=False) as api_fields:
        api_url_input = gr.Textbox(
            placeholder="https://api.example.com/v1/chat",
            label="API 地址",
        )
        api_token_input = gr.Textbox(
            placeholder="sk-xxx",
            type="password",
            label="Token 密钥",
        )

    # ===== Local / external components =====
    with gr.Row():
        with gr.Column():
            native_choices = gr.CheckboxGroup(
                choices=[
                    "Model Adapter",
                    "Data Adapter",
                    "Evaluator",
                    "Perf Monitor",
                ],
                label="启用本地模块",
            )
        with gr.Column():
            other_choices = gr.CheckboxGroup(
                choices=[
                    "OpenCompass",
                    "VLMEvalKit",
                    "RAGAS",
                    "MTEB/CMTEB",
                ],
                label="启用外部后端",
            )

    # ===== Run parameters =====
    with gr.Accordion("运行参数(可选修改)", open=False):
        with gr.Row():
            api_provider_dropdown = gr.Dropdown(
                choices=["openai", "azure", "ollama", "gemini"],
                value="openai",
                label="API Provider (--api)",
            )
            dataset_dropdown = gr.Dropdown(
                choices=["openqa", "gsm8k", "mmlu", "truthfulqa"],
                value="openqa",
                label="评测数据集 (--dataset)",
            )
            model_override_input = gr.Textbox(
                placeholder="e.g. my-llm-7b",
                label="自定义模型名 (--model),留空则使用时间戳",
            )
        with gr.Row():
            max_tokens_slider = gr.Slider(
                minimum=256,
                maximum=8192,
                step=256,
                value=1024,
                label="Max Tokens (--max-tokens)",
            )
            min_tokens_slider = gr.Slider(
                minimum=0,
                maximum=4096,
                step=64,
                value=1024,
                label="Min Tokens (--min-tokens)",
            )
        with gr.Row():
            parallel_slider = gr.Slider(
                minimum=1,
                maximum=16,
                step=1,
                value=1,
                label="并发请求数 (--parallel)",
            )
            num_req_slider = gr.Slider(
                minimum=1,
                maximum=1000,
                step=1,
                value=100,
                label="请求条数 (--number)",
            )
            max_prompt_len_slider = gr.Slider(
                minimum=2048,
                maximum=32768,
                step=512,
                value=15360,
                label="最大 Prompt 长度 (--max-prompt-length)",
            )

    # ===== Output forms =====
    output_choices = gr.CheckboxGroup(
        choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"],
        label="输出形式",
    )

    # ===== Control button & log =====
    run_button = gr.Button("Run Evaluation")
    output_text = gr.TextArea(
        label="执行结果",
        interactive=False,
        show_copy_button=True,
        lines=20,
    )

    # ===== Event bindings =====
    # Re-validate the input-source selection and show/hide the API row.
    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields],
    )

    # The input order must match the positional parameters of toggle_run.
    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices,
            native_choices,
            other_choices,
            output_choices,
            api_url_input,
            api_token_input,
            api_provider_dropdown,
            dataset_dropdown,
            max_tokens_slider,
            min_tokens_slider,
            parallel_slider,
            max_prompt_len_slider,
            num_req_slider,
            model_override_input,
            is_running,
        ],
        outputs=[output_text, is_running, run_button],
        show_progress=True,
    )
if __name__ == "__main__":
    # The Run/Stop handler is a generator that streams log updates, so the
    # request queue is enabled explicitly before launching.
    # NOTE(review): with the default queue concurrency a "Stop" click is
    # presumably processed only after the running generator finishes --
    # confirm and raise the concurrency for the installed Gradio version.
    demo.queue().launch(server_name="0.0.0.0", server_port=7900)