This commit is contained in:
parent 8360f8875f
commit fcc5990439
@@ -1,59 +0,0 @@
########################
# 1️⃣ Build stage
########################
FROM python:3.10-slim AS builder
ENV DEBIAN_FRONTEND=noninteractive

# System dependencies: compiling C/C++ extensions & pulling sources with git
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        git \
        curl && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /build

# Copy the requirements directory first to exploit the Docker layer cache
COPY evalscope.0.17.0/requirements ./evalscope/requirements

# Upgrade pip & pre-install common build tools
RUN pip install --upgrade pip setuptools wheel

# Install every dependency into the /install directory (✳️ the key step)
RUN pip install --no-cache-dir --prefix=/install \
    -r ./evalscope/requirements/framework.txt \
    -r ./evalscope/requirements/opencompass.txt \
    -r ./evalscope/requirements/vlmeval.txt \
    -r ./evalscope/requirements/aigc.txt \
    -r ./evalscope/requirements/app.txt \
    -r ./evalscope/requirements/dev.txt \
    -r ./evalscope/requirements/docs.txt \
    -r ./evalscope/requirements/perf.txt \
    -r ./evalscope/requirements/rag.txt

# Install evalscope itself (non-editable, to reduce later COPYs)
COPY evalscope.0.17.0/ ./evalscope
RUN pip install --no-cache-dir --prefix=/install ./evalscope

# Only bring along the entry script
COPY gradio_ui.py .

########################
# 2️⃣ Runtime stage
########################
FROM python:3.10-slim AS runtime
ENV DEBIAN_FRONTEND=noninteractive

# Inject the builder-stage artifacts under /usr/local:
# /install/bin may hold executables; site-packages sit in /install/lib/…
COPY --from=builder /install /usr/local
COPY --from=builder /build/gradio_ui.py /app/gradio_ui.py

WORKDIR /app
EXPOSE 7900 7901

# Optional: disable the pip cache entirely so any later installs at runtime leave no junk
ENV PIP_NO_CACHE_DIR=1

CMD ["python3", "gradio_ui.py"]
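The build stage above installs every dependency with pip's --prefix=/install so the whole tree can be moved to the runtime stage in a single COPY. A minimal sketch of why that works, runnable inside the runtime image (assumes Python 3.10 as in the base image):

import sys
import sysconfig

# pip install --prefix=/install lays packages out as
#   /install/lib/python3.10/site-packages  (plus /install/bin for scripts).
# COPY --from=builder /install /usr/local overlays that tree onto the
# interpreter's default prefix, so everything lands on sys.path unchanged.
print(sysconfig.get_path("purelib"))   # /usr/local/lib/python3.10/site-packages
print(any("site-packages" in p for p in sys.path))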
gradio_ui.py (24 changed lines)
@@ -17,6 +17,7 @@ import subprocess
 import gradio as gr
 import psutil
 import signal
+import shlex

 # ---------------- Global process handle ----------------
 current_process = None
@@ -40,7 +41,8 @@ def run_perf(
     api_provider, dataset,
     max_tokens, min_tokens, parallel_reqs,
     max_prompt_len, num_requests,
-    model_override
+    model_override,
+    extra_args
 ):
     global current_process
@@ -61,6 +63,10 @@ def run_perf(
         "--api-key", api_token.strip(),
     ]

+    if extra_args.strip():
+        command += shlex.split(extra_args.strip())
+
     full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
     yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
@@ -128,7 +134,8 @@ def run_eval_tool(
     api_provider, dataset,
     max_tokens, min_tokens, parallel_reqs,
     max_prompt_len, num_requests,
-    model_override
+    model_override, extra_args
 ):
     global current_process
@@ -149,6 +156,10 @@ def run_eval_tool(
     if num_requests:
         command += ["--limit", str(int(num_requests))]

+    if extra_args.strip():
+        command += shlex.split(extra_args.strip())
+
     full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
     yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
@@ -254,6 +265,7 @@ def toggle_run(
     max_tokens, min_tokens, parallel_reqs,
     max_prompt_len, num_requests,
     model_override,
+    extra_args,
     is_running,
     run_mode
 ):
@@ -272,7 +284,8 @@ def toggle_run(
             api_provider, dataset,
             max_tokens, min_tokens, parallel_reqs,
             max_prompt_len, num_requests,
-            model_override
+            model_override,
+            extra_args
         )
     elif run_mode == "eval":
         yield from run_eval_tool(
@@ -281,7 +294,8 @@ def toggle_run(
             api_provider, dataset,
             max_tokens, min_tokens, parallel_reqs,
             max_prompt_len, num_requests,
-            model_override
+            model_override,
+            extra_args
         )
     elif run_mode == "app":
         info = "[⚠️ app mode is active; open http://localhost:7901 manually to view the report]\n"
@@ -348,6 +362,7 @@ with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
                     value=PERF_DATASETS[0]
                 )
                 model_override_input = gr.Textbox(label="Custom model name (--model)", placeholder="llm-name")
+                extra_args_input = gr.Textbox(label="Extra EvalScope arguments", placeholder="e.g. --disable-cache --temperature 0.7")
             with gr.Row():
                 max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024)
                 min_tokens_slider = gr.Slider(label="Min Tokens", minimum=0, maximum=4096, step=64, value=1024)
@@ -413,6 +428,7 @@ with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
             max_tokens_slider, min_tokens_slider, parallel_slider,
             max_prompt_len_slider, num_req_slider,
             model_override_input,
+            extra_args_input,
             is_running,
             mode_dropdown
         ],
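The change threads a free-form extra_args string from the new textbox into both command builders, tokenized with shlex.split so shell-style quoting is respected. A minimal standalone sketch (the command prefix and flag values here are illustrative, not from this repo):

import shlex

command = ["evalscope", "perf", "--url", "http://localhost:8000/v1"]
extra_args = '  --disable-cache --temperature 0.7  '

if extra_args.strip():
    # shlex.split tokenizes like a POSIX shell, so quoted values with
    # spaces arrive as single argv entries
    command += shlex.split(extra_args.strip())

print(command)
# ['evalscope', 'perf', '--url', 'http://localhost:8000/v1',
#  '--disable-cache', '--temperature', '0.7']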
@@ -1,378 +0,0 @@
import time
import os
import glob
import threading
import subprocess
import gradio as gr
import psutil
import signal

# ---------------- Global process handle ----------------
current_process = None
should_stop = False

# ---------------- Available datasets ----------------
EVAL_DATASETS = [
    "arc", "bbh", "ceval", "cmmlu", "competition_math", "gsm8k",
    "hellaswag", "humaneval", "mmlu", "mmlu_pro", "race",
    "trivia_qa", "truthful_qa"
]

PERF_DATASETS = ["openqa", "flickr8k", "longalpaca", "random_dataset", "line_by_line", "custom", "speed_benchmark"]

# ---------------- perf mode runner ----------------
def run_perf(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override
):
    global current_process

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp

    command = [
        "evalscope", "perf",
        "--url", api_url.strip(),
        "--api", api_provider,
        "--model", model_name,
        "--dataset", dataset,
        "--max-tokens", str(int(max_tokens)),
        "--min-tokens", str(int(min_tokens)),
        "--parallel", str(int(parallel_reqs)),
        "--max-prompt-length", str(int(max_prompt_len)),
        "--number", str(int(num_requests)),
        "--api-key", api_token.strip(),
    ]

    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(value="Stop Evaluation")

    try:
        current_process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1, start_new_session=True
        )

        for line in current_process.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")

        current_process.stdout.close()
        current_process.wait()

    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")

    finally:
        current_process = None

    full_output += "[Eval Finished]\n"

    if "Evaluation Report" in output_choices:
        vis_port = 7901
        outputs_root = "./outputs"
        try:
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:
            latest_output = outputs_root

        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
            daemon=True
        ).start()

        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")

# ---------------- eval mode runner ----------------
def run_eval_tool(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override
):
    global current_process

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp

    command = [
        "evalscope", "eval",
        "--model", model_name,
        "--datasets", dataset
    ]
    if api_url.strip():
        command += [
            "--eval-type", "service",
            "--api-url", api_url.strip(),
            "--api-key", api_token.strip()
        ]
    if num_requests:
        command += ["--limit", str(int(num_requests))]

    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(value="Stop Evaluation")

    try:
        current_process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1, start_new_session=True
        )

        for line in current_process.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")

        current_process.stdout.close()
        current_process.wait()

    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")

    finally:
        current_process = None

    full_output += "[Eval Finished]\n"

    if "Evaluation Report" in output_choices:
        vis_port = 7901
        outputs_root = "./outputs"
        try:
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:
            latest_output = outputs_root

        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT},
            daemon=True
        ).start()

        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")

# ---------------- Stop function ----------------
def stop_eval():
    """
    Fully terminate current_process and all of its descendants:
    1. Send SIGINT (Ctrl-C) first to attempt a graceful exit
    2. Escalate anything still alive after 3 seconds to SIGKILL
    3. Finally wait() on the main process to prevent zombies
    """
    global current_process, should_stop
    should_stop = True

    if not (current_process and current_process.poll() is None):
        return "[⚠️ No active evalscope process]\n"

    try:
        parent = psutil.Process(current_process.pid)
        family = parent.children(recursive=True) + [parent]  # the whole process tree

        # ── 1) Attempt graceful termination ──────────────────────
        for p in family:
            p.send_signal(signal.SIGINT)

        # Allow a 3-second grace period
        _, alive = psutil.wait_procs(family, timeout=3)

        # ── 2) Force-kill whatever is still alive ────────────────
        for p in alive:
            p.kill()
        psutil.wait_procs(alive, timeout=3)

        # ── 3) Reap the zombie and make sure handles are closed ────────────
        current_process.wait(timeout=3)

        return "[✅ Process tree terminated (SIGINT ➜ SIGKILL fallback)]\n"

    except Exception as e:
        return f"[❌ Termination failed: {e}]\n"

    finally:
        current_process = None


# ---------------- Controller ----------------
def toggle_run(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override,
    is_running,
    run_mode
):
    global should_stop

    if not inputs:
        msg = "[❌ Error] At least one input source (API, local, benchmark, or custom) must be selected before running.\n"
        yield msg, False, gr.update(value="Run Evaluation")
        return

    if not is_running:
        should_stop = False
        if run_mode == "perf":
            yield from run_perf(
                inputs, native, other, output_choices,
                api_url, api_token,
                api_provider, dataset,
                max_tokens, min_tokens, parallel_reqs,
                max_prompt_len, num_requests,
                model_override
            )
        elif run_mode == "eval":
            yield from run_eval_tool(
                inputs, native, other, output_choices,
                api_url, api_token,
                api_provider, dataset,
                max_tokens, min_tokens, parallel_reqs,
                max_prompt_len, num_requests,
                model_override
            )
        elif run_mode == "app":
            yield "[⚠️ app mode is active; open http://localhost:7901 manually to view the report]", False, gr.update(value="Run Evaluation")
    else:
        msg = stop_eval()
        yield msg, False, gr.update(value="Run Evaluation")


# ---------------- Input-source mutual exclusion ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
    order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"]
    group1 = {"API Models", "Local Models"}
    group2 = {"Benchmarks", "Custom Datasets"}

    def keep_only_one(group):
        filtered = [item for item in selected if item in group]
        return filtered[-1:]

    final_sel = set(selected)
    final_sel -= group1
    final_sel |= set(keep_only_one(group1))
    final_sel -= group2
    final_sel |= set(keep_only_one(group2))

    final_list = [itm for itm in order if itm in final_sel]
    input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list)
    api_field_update = gr.update(visible="API Models" in final_sel)
    return input_update, api_field_update

# ---------------- UI construction ----------------
with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
    is_running = gr.State(value=False)

    with gr.Group():
        with gr.Row():
            mode_dropdown = gr.Dropdown(
                label="Evaluation type",
                choices=["eval", "perf", "app"],
                value="perf",
                info="eval: capability evaluation; perf: performance benchmarking; app: visualization"
            )

    with gr.Group():
        with gr.Row():
            input_choices = gr.CheckboxGroup(
                label="Select input sources",
                choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
                interactive=True
            )

    with gr.Column(visible=False) as api_fields:
        api_url_input = gr.Textbox(label="API URL", placeholder="https://.../v1/chat/completions")
        api_token_input = gr.Textbox(label="API token", type="password", placeholder="sk-xxx")
        with gr.Accordion("Run parameters (optional)", open=False):
            with gr.Row():
                api_provider_dropdown = gr.Dropdown(label="API Provider", choices=["openai", "azure", "ollama", "gemini"], value="openai")
                dataset_dropdown = gr.Dropdown(label="Evaluation dataset (--dataset)", choices=PERF_DATASETS, value=PERF_DATASETS[0])
                model_override_input = gr.Textbox(label="Custom model name (--model)", placeholder="llm-name")
            with gr.Row():
                max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024)
                min_tokens_slider = gr.Slider(label="Min Tokens", minimum=0, maximum=4096, step=64, value=1024)
            with gr.Row():
                parallel_slider = gr.Slider(label="Concurrent requests", minimum=1, maximum=100, step=1, value=1)
                num_req_slider = gr.Slider(label="Number of requests", minimum=1, maximum=1000, step=1, value=100)
                max_prompt_len_slider = gr.Slider(label="Max prompt length", minimum=2048, maximum=262144, step=512, value=15360)

    with gr.Row():
        with gr.Column():
            native_choices = gr.CheckboxGroup(label="Enable local modules", choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"])
        with gr.Column():
            other_choices = gr.CheckboxGroup(label="Enable external backends", choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"])

    output_choices = gr.CheckboxGroup(label="Output formats", choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"])
    run_button = gr.Button("Run Evaluation")
    output_text = gr.TextArea(label="Execution log", lines=20, interactive=False, show_copy_button=True)

    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields]
    )

    mode_dropdown.change(
        lambda mode: gr.update(
            choices=EVAL_DATASETS if mode == "eval" else PERF_DATASETS,
            value=EVAL_DATASETS[0] if mode == "eval" else PERF_DATASETS[0]
        ),
        inputs=mode_dropdown,
        outputs=dataset_dropdown
    )

    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices, native_choices, other_choices,
            output_choices,
            api_url_input, api_token_input,
            api_provider_dropdown, dataset_dropdown,
            max_tokens_slider, min_tokens_slider, parallel_slider,
            max_prompt_len_slider, num_req_slider,
            model_override_input,
            is_running,
            mode_dropdown
        ],
        outputs=[output_text, is_running, run_button],
        show_progress=True
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7900)
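stop_eval above walks the whole process tree with psutil rather than signalling a process group. A standalone sketch of the same SIGINT-then-SIGKILL pattern (POSIX assumed; the sleep child merely stands in for evalscope):

import signal
import subprocess

import psutil

proc = subprocess.Popen(["sleep", "60"], start_new_session=True)

parent = psutil.Process(proc.pid)
family = parent.children(recursive=True) + [parent]

for p in family:                     # 1) polite SIGINT to every member
    p.send_signal(signal.SIGINT)

_, alive = psutil.wait_procs(family, timeout=3)   # 3-second grace period

for p in alive:                      # 2) SIGKILL whatever ignored it
    p.kill()
psutil.wait_procs(alive, timeout=3)

proc.wait(timeout=3)                 # 3) reap the zombie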
gradio_ui_old.py (402 changed lines)
@@ -1,402 +0,0 @@
import time
import os
import glob
import threading
import subprocess
import gradio as gr
import psutil
import signal

# ---------------- Global process handle ----------------
current_process = None
should_stop = False

# ---------------- Core run function ----------------
def run_perf(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override
):
    global current_process

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp

    command = [
        "evalscope", "perf",
        "--url", api_url.strip(),
        "--api", api_provider,
        "--model", model_name,
        "--dataset", dataset,
        "--max-tokens", str(int(max_tokens)),
        "--min-tokens", str(int(min_tokens)),
        "--parallel", str(int(parallel_reqs)),
        "--max-prompt-length", str(int(max_prompt_len)),
        "--number", str(int(num_requests)),
        "--api-key", api_token.strip(),
    ]

    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(value="Stop Evaluation")

    try:
        current_process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1, start_new_session=True
        )

        for line in current_process.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")

        current_process.stdout.close()
        current_process.wait()

    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")

    finally:
        current_process = None

    full_output += "[Eval Finished]\n"

    if "Evaluation Report" in output_choices:
        vis_port = 7901
        outputs_root = "./outputs"
        try:
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:
            latest_output = outputs_root

        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL,
                    "stderr": subprocess.STDOUT},
            daemon=True
        ).start()

        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")


# ---------------- Stop function ----------------
def stop_eval():
    global current_process, should_stop
    should_stop = True

    if current_process and current_process.poll() is None:
        try:
            pgid = os.getpgid(current_process.pid)
            os.killpg(pgid, signal.SIGINT)   # ✅ graceful termination
            time.sleep(2)
            if current_process.poll() is None:
                os.killpg(pgid, signal.SIGKILL)   # ❗ force kill
            return "[✅ Termination signal sent (SIGINT → SIGKILL fallback)]\n"
        except Exception as e:
            return f"[❌ Termination failed: {e}]\n"
        finally:
            current_process = None
    else:
        return "[⚠️ No active evalscope process]\n"


# ---------------- Run/Stop controller ----------------
def toggle_run(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override,
    is_running,
    run_mode  # 👈 newly added parameter
):
    global should_stop

    if not inputs:
        msg = "[❌ Error] At least one input source (API, local, benchmark, or custom) must be selected before running.\n"
        yield msg, False, gr.update(value="Run Evaluation")
        return

    if not is_running:
        should_stop = False
        if run_mode == "perf":
            yield from run_perf(
                inputs, native, other, output_choices,
                api_url, api_token,
                api_provider, dataset,
                max_tokens, min_tokens, parallel_reqs,
                max_prompt_len, num_requests,
                model_override
            )
        elif run_mode == "eval":
            yield from run_eval_tool(
                inputs, native, other, output_choices,
                api_url, api_token,
                api_provider, dataset,
                max_tokens, min_tokens, parallel_reqs,
                max_prompt_len, num_requests,
                model_override
            )
    else:
        msg = stop_eval()
        yield msg, False, gr.update(value="Run Evaluation")


# ---------------- Mutual-exclusion logic ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
    order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"]
    group1 = {"API Models", "Local Models"}
    group2 = {"Benchmarks", "Custom Datasets"}

    def keep_only_one(group):
        filtered = [item for item in selected if item in group]
        return filtered[-1:]

    final_sel = set(selected)
    final_sel -= group1
    final_sel |= set(keep_only_one(group1))
    final_sel -= group2
    final_sel |= set(keep_only_one(group2))

    final_list = [itm for itm in order if itm in final_sel]

    input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list)

    show_api_fields = "API Models" in final_sel
    api_field_update = gr.update(visible=show_api_fields)  # ✅ correct

    return input_update, api_field_update


def run_eval_tool(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override
):
    global current_process

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = model_override.strip() or timestamp

    command = [
        "evalscope", "eval",
        "--model", model_name,
        "--datasets", dataset
    ]
    if api_url.strip():
        command += [
            "--eval-type", "service",
            "--api-url", api_url.strip(),
            "--api-key", api_token.strip()
        ]
    if num_requests:
        command += ["--limit", str(int(num_requests))]

    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
    yield full_output, True, gr.update(value="Stop Evaluation")

    try:
        current_process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1, start_new_session=True
        )

        for line in current_process.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")

        current_process.stdout.close()
        current_process.wait()

    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")

    finally:
        current_process = None

    full_output += "[Eval Finished]\n"

    if "Evaluation Report" in output_choices:
        vis_port = 7901
        outputs_root = "./outputs"
        try:
            latest_output = max(
                glob.glob(os.path.join(outputs_root, "*")),
                key=os.path.getmtime
            )
        except ValueError:
            latest_output = outputs_root

        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        threading.Thread(
            target=subprocess.Popen,
            args=(vis_cmd,),
            kwargs={"stdout": subprocess.DEVNULL,
                    "stderr": subprocess.STDOUT},
            daemon=True
        ).start()

        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")


# ---------------- Build the Gradio UI ----------------
with gr.Blocks(title="EvalScope Full-Feature UI") as demo:
    is_running = gr.State(value=False)

    with gr.Group():
        with gr.Row():
            mode_dropdown = gr.Dropdown(
                label="Evaluation type",
                info="eval: capability evaluation; perf: inference performance; app: web visualization",
                choices=["eval", "perf", "app"],
                value="perf"
            )

    # ===== Input sources =====
    with gr.Group():
        with gr.Row():
            input_choices = gr.CheckboxGroup(
                label="Select input sources",
                choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
                interactive=True
            )

    # ===== API address & run parameters (visibility controlled together) =====
    with gr.Column(visible=False) as api_fields:
        api_url_input = gr.Textbox(
            label="API URL",
            placeholder="https://ai.aiszaiai.com/v1/chat/completions"
        )
        api_token_input = gr.Textbox(
            label="API token",
            type="password",
            placeholder="sk-xxx"
        )
        with gr.Accordion("Run parameters (optional)", open=False):
            with gr.Row():
                api_provider_dropdown = gr.Dropdown(
                    label="API Provider (--api)",
                    choices=["openai", "azure", "ollama", "gemini"],
                    value="openai"
                )
                dataset_dropdown = gr.Dropdown(
                    label="Evaluation dataset (--dataset)",
                    choices=["openqa", "flickr8k", "longalpaca", "random_dataset", "line_by_line", "custom", "speed_benchmark"],
                    value="openqa"
                )
                model_override_input = gr.Textbox(
                    label="Custom model name (--model); leave blank to use a timestamp",
                    placeholder="e.g. my-llm-7b"
                )
            with gr.Row():
                max_tokens_slider = gr.Slider(
                    label="Max Tokens (--max-tokens)",
                    minimum=256, maximum=8192, step=256, value=1024
                )
                min_tokens_slider = gr.Slider(
                    label="Min Tokens (--min-tokens)",
                    minimum=0, maximum=4096, step=64, value=1024
                )
            with gr.Row():
                parallel_slider = gr.Slider(
                    label="Concurrent requests (--parallel)",
                    minimum=1, maximum=16, step=1, value=1
                )
                num_req_slider = gr.Slider(
                    label="Number of requests (--number)",
                    minimum=1, maximum=1000, step=1, value=100
                )
                max_prompt_len_slider = gr.Slider(
                    label="Max prompt length (--max-prompt-length)",
                    minimum=2048, maximum=32768, step=512, value=15360
                )

    # ===== Local/external components =====
    with gr.Row():
        with gr.Column():
            native_choices = gr.CheckboxGroup(
                label="Enable local modules",
                choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"]
            )
        with gr.Column():
            other_choices = gr.CheckboxGroup(
                label="Enable external backends",
                choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"]
            )

    # ===== Output formats =====
    output_choices = gr.CheckboxGroup(
        label="Output formats",
        choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
    )

    # ===== Control button & log =====
    run_button = gr.Button("Run Evaluation")
    output_text = gr.TextArea(
        label="Execution log",
        lines=20,
        interactive=False,
        show_copy_button=True
    )

    # ===== Event bindings =====
    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields]  # ✅ only these two outputs
    )

    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices, native_choices, other_choices,
            output_choices,
            api_url_input, api_token_input,
            api_provider_dropdown, dataset_dropdown,
            max_tokens_slider, min_tokens_slider, parallel_slider,
            max_prompt_len_slider, num_req_slider,
            model_override_input,
            is_running,
            mode_dropdown  # ✅ switched to the new variable
        ],
        outputs=[output_text, is_running, run_button],
        show_progress=True
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7900)
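For comparison with the psutil version above, this older stop_eval signals the child's whole process group instead of enumerating its descendants. A standalone sketch of that pattern (POSIX only; it relies on start_new_session=True making the child a process-group leader):

import os
import signal
import subprocess
import time

proc = subprocess.Popen(["sleep", "60"], start_new_session=True)

pgid = os.getpgid(proc.pid)
os.killpg(pgid, signal.SIGINT)        # graceful Ctrl-C to the whole group
time.sleep(2)
if proc.poll() is None:
    os.killpg(pgid, signal.SIGKILL)   # escalate if it is still running
proc.wait()                           # reap the main process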