diff --git a/Dockerfile.ok b/Dockerfile.ok
deleted file mode 100644
index f895894..0000000
--- a/Dockerfile.ok
+++ /dev/null
@@ -1,59 +0,0 @@
-########################
-# 1️⃣ Build stage
-########################
-FROM python:3.10-slim AS builder
-ENV DEBIAN_FRONTEND=noninteractive
-
-# 系统依赖:编译 C/C++ 扩展 & git 拉源码
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-        build-essential \
-        git \
-        curl && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /build
-
-# 先复制 requirements 目录,利用 Docker layer cache
-COPY evalscope.0.17.0/requirements ./evalscope/requirements
-
-# 更新 pip & 预装常用 build tools
-RUN pip install --upgrade pip setuptools wheel
-
-# 把所有依赖装进 /install 目录(✳️ 关键)
-RUN pip install --no-cache-dir --prefix=/install \
-    -r ./evalscope/requirements/framework.txt \
-    -r ./evalscope/requirements/opencompass.txt \
-    -r ./evalscope/requirements/vlmeval.txt \
-    -r ./evalscope/requirements/aigc.txt \
-    -r ./evalscope/requirements/app.txt \
-    -r ./evalscope/requirements/dev.txt \
-    -r ./evalscope/requirements/docs.txt \
-    -r ./evalscope/requirements/perf.txt \
-    -r ./evalscope/requirements/rag.txt
-
-# 安装 evalscope 本体(非 editable,减少后续 COPY)
-COPY evalscope.0.17.0/ ./evalscope
-RUN pip install --no-cache-dir --prefix=/install ./evalscope
-
-# 仅带上入口脚本
-COPY gradio_ui.py .
-
-########################
-# 2️⃣ Runtime stage
-########################
-FROM python:3.10-slim AS runtime
-ENV DEBIAN_FRONTEND=noninteractive
-
-# 把 builder 阶段产物注入到 /usr/local 下
-# /install/bin 里可能有可执行文件;site-packages 在 /install/lib/…
-COPY --from=builder /install /usr/local
-COPY --from=builder /build/gradio_ui.py /app/gradio_ui.py
-
-WORKDIR /app
-EXPOSE 7900 7901
-
-# 可选:彻底关闭 pip 缓存,避免 runtime 再次安装时产生垃圾
-ENV PIP_NO_CACHE_DIR=1
-
-CMD ["python3", "gradio_ui.py"]
diff --git a/gradio_ui.py b/gradio_ui.py
index a53b52e..f7f1e96 100644
--- a/gradio_ui.py
+++ b/gradio_ui.py
@@ -17,6 +17,7 @@ import subprocess
 import gradio as gr
 import psutil
 import signal
+import shlex
 
 # ---------------- 全局进程句柄 ----------------
 current_process = None
@@ -40,7 +41,8 @@ def run_perf(
     api_provider, dataset,
     max_tokens, min_tokens, parallel_reqs,
     max_prompt_len, num_requests,
-    model_override
+    model_override,
+    extra_args
 ):
     global current_process
 
@@ -61,6 +63,10 @@
         "--api-key", api_token.strip(),
     ]
+
+    if extra_args.strip():
+        command += shlex.split(extra_args.strip())
+
     full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
     yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
 
@@ -128,7 +134,8 @@
     api_provider, dataset,
     max_tokens, min_tokens, parallel_reqs,
     max_prompt_len, num_requests,
-    model_override
+    model_override, extra_args
+
 ):
     global current_process
 
@@ -149,6 +156,10 @@
     if num_requests:
         command += ["--limit", str(int(num_requests))]
+
+    if extra_args.strip():
+        command += shlex.split(extra_args.strip())
+
     full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
     yield full_output, True, gr.update(interactive=False), gr.update(visible=True)
 
@@ -254,6 +265,7 @@
     max_tokens, min_tokens, parallel_reqs,
     max_prompt_len, num_requests,
     model_override,
+    extra_args,
     is_running,
     run_mode
 ):
@@ -272,7 +284,8 @@
                 api_provider, dataset,
                 max_tokens, min_tokens, parallel_reqs,
                 max_prompt_len, num_requests,
-                model_override
+                model_override,
+                extra_args
             )
         elif run_mode == "eval":
             yield from run_eval_tool(
                 inputs, native, other, output_choices,
                 api_url, api_token,
                 api_provider, dataset,
                 max_tokens, min_tokens, parallel_reqs,
                 max_prompt_len, num_requests,
-                model_override
+                model_override,
+                extra_args
             )
         elif run_mode == "app":
             info = "[⚠️ 当前为 app 模式,请手动打开 http://localhost:7901 查看报告]\n"
@@ -348,6 +362,7 @@ with gr.Blocks(title="EvalScope 全功能界面") as demo:
                         value=PERF_DATASETS[0]
                     )
                     model_override_input = gr.Textbox(label="自定义模型名 (--model)", placeholder="llm-name")
+                    extra_args_input = gr.Textbox(label="额外 EvalScope 参数", placeholder="例如: --disable-cache --temperature 0.7")
                     with gr.Row():
                         max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024)
                         min_tokens_slider = gr.Slider(label="Min Tokens", minimum=0, maximum=4096, step=64, value=1024)
@@ -413,6 +428,7 @@
             max_tokens_slider, min_tokens_slider, parallel_slider,
             max_prompt_len_slider, num_req_slider,
             model_override_input,
+            extra_args_input,
             is_running,
             mode_dropdown
         ],
diff --git a/gradio_ui_2025_7_18.py b/gradio_ui_2025_7_18.py
deleted file mode 100644
index 65aeb8f..0000000
--- a/gradio_ui_2025_7_18.py
+++ /dev/null
@@ -1,378 +0,0 @@
-import time
-import os
-import glob
-import threading
-import subprocess
-import gradio as gr
-import psutil
-import signal
-
-# ---------------- 全局进程句柄 ----------------
-current_process = None
-should_stop = False
-
-# ---------------- 可选数据集 ----------------
-EVAL_DATASETS = [
-    "arc", "bbh", "ceval", "cmmlu", "competition_math", "gsm8k",
-    "hellaswag", "humaneval", "mmlu", "mmlu_pro", "race",
-    "trivia_qa", "truthful_qa"
-]
-
-PERF_DATASETS = ["openqa", "flickr8k", "longalpaca", "random_dataset", "line_by_line", "custom", "speed_benchmark"]
-
-# ---------------- perf 模式运行 ----------------
-def run_perf(
-    inputs, native, other, output_choices,
-    api_url, api_token,
-    api_provider, dataset,
-    max_tokens, min_tokens, parallel_reqs,
-    max_prompt_len, num_requests,
-    model_override
-):
-    global current_process
-
-    timestamp = time.strftime("%Y%m%d-%H%M%S")
-    model_name = model_override.strip() or timestamp
-
-    command = [
-        "evalscope", "perf",
-        "--url", api_url.strip(),
-        "--api", api_provider,
-        "--model", model_name,
-        "--dataset", dataset,
-        "--max-tokens", str(int(max_tokens)),
-        "--min-tokens", str(int(min_tokens)),
-        "--parallel", str(int(parallel_reqs)),
-        "--max-prompt-length", str(int(max_prompt_len)),
-        "--number", str(int(num_requests)),
-        "--api-key", api_token.strip(),
-    ]
-
-    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
-    yield full_output, True, gr.update(value="Stop Evaluation")
-
-    try:
-        current_process = subprocess.Popen(
-            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-            text=True, bufsize=1, start_new_session=True
-        )
-
-        for line in current_process.stdout:
-            if should_stop:
-                break
-            full_output += line
-            yield full_output, True, gr.update(value="Stop Evaluation")
-
-        current_process.stdout.close()
-        current_process.wait()
-
-    except Exception as e:
-        full_output += f"[Error] {e}\n"
-        yield full_output, False, gr.update(value="Run Evaluation")
-
-    finally:
-        current_process = None
-
-    full_output += "[Eval Finished]\n"
-
-    if "Evaluation Report" in output_choices:
-        vis_port = 7901
-        outputs_root = "./outputs"
-        try:
-            latest_output = max(
-                glob.glob(os.path.join(outputs_root, "*")),
-                key=os.path.getmtime
-            )
-        except ValueError:
-            latest_output = outputs_root
-
-        vis_cmd = [
-            "evalscope", "app",
-            "--outputs", outputs_root,
-            "--server-name", "0.0.0.0",
-            "--server-port", str(vis_port),
-        ]
-        threading.Thread(
-            target=subprocess.Popen,
-            args=(vis_cmd,),
-
kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT}, - daemon=True - ).start() - - full_output += f"[Visualization 👉] http://localhost:{vis_port}\n" - - yield full_output, False, gr.update(value="Run Evaluation") - -# ---------------- eval 模式运行 ---------------- -def run_eval_tool( - inputs, native, other, output_choices, - api_url, api_token, - api_provider, dataset, - max_tokens, min_tokens, parallel_reqs, - max_prompt_len, num_requests, - model_override -): - global current_process - - timestamp = time.strftime("%Y%m%d-%H%M%S") - model_name = model_override.strip() or timestamp - - command = [ - "evalscope", "eval", - "--model", model_name, - "--datasets", dataset - ] - if api_url.strip(): - command += [ - "--eval-type", "service", - "--api-url", api_url.strip(), - "--api-key", api_token.strip() - ] - if num_requests: - command += ["--limit", str(int(num_requests))] - - full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n" - yield full_output, True, gr.update(value="Stop Evaluation") - - try: - current_process = subprocess.Popen( - command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - text=True, bufsize=1, start_new_session=True - ) - - for line in current_process.stdout: - if should_stop: - break - full_output += line - yield full_output, True, gr.update(value="Stop Evaluation") - - current_process.stdout.close() - current_process.wait() - - except Exception as e: - full_output += f"[Error] {e}\n" - yield full_output, False, gr.update(value="Run Evaluation") - - finally: - current_process = None - - full_output += "[Eval Finished]\n" - - if "Evaluation Report" in output_choices: - vis_port = 7901 - outputs_root = "./outputs" - try: - latest_output = max( - glob.glob(os.path.join(outputs_root, "*")), - key=os.path.getmtime - ) - except ValueError: - latest_output = outputs_root - - vis_cmd = [ - "evalscope", "app", - "--outputs", outputs_root, - "--server-name", "0.0.0.0", - "--server-port", str(vis_port), - ] - threading.Thread( - target=subprocess.Popen, - args=(vis_cmd,), - kwargs={"stdout": subprocess.DEVNULL, "stderr": subprocess.STDOUT}, - daemon=True - ).start() - - full_output += f"[Visualization 👉] http://localhost:{vis_port}\n" - - yield full_output, False, gr.update(value="Run Evaluation") - -# ---------------- 停止函数 ---------------- -def stop_eval(): - """ - 彻底终止 current_process 及其全部子孙进程: - 1. 先发 SIGINT(Ctrl‑C)尝试优雅退出 - 2. 3 秒内仍存活的进程升级为 SIGKILL - 3. 
最后 wait() 主进程,防止僵尸 - """ - global current_process, should_stop - should_stop = True - - if not (current_process and current_process.poll() is None): - return "[⚠️ 无活动 evalscope 进程]\n" - - try: - parent = psutil.Process(current_process.pid) - family = parent.children(recursive=True) + [parent] # 整棵进程树 - - # ── 1) 尝试优雅终止 ────────────────────── - for p in family: - p.send_signal(signal.SIGINT) - - # 给 3 秒宽限期 - _, alive = psutil.wait_procs(family, timeout=3) - - # ── 2) 强制 kill 仍存活的 ──────────────── - for p in alive: - p.kill() - psutil.wait_procs(alive, timeout=3) - - # ── 3) 回收僵尸,确保句柄关闭 ──────────── - current_process.wait(timeout=3) - - return "[✅ 已终止进程树 (SIGINT ➜ SIGKILL fallback)]\n" - - except Exception as e: - return f"[❌ 终止失败: {e}]\n" - - finally: - current_process = None - - - -# ---------------- 控制器 ---------------- -def toggle_run( - inputs, native, other, output_choices, - api_url, api_token, - api_provider, dataset, - max_tokens, min_tokens, parallel_reqs, - max_prompt_len, num_requests, - model_override, - is_running, - run_mode -): - global should_stop - - if not inputs: - msg = "[❌ 错误] 必须至少选择一个输入源(API、本地、基准或自定义)才能开始运行。\n" - yield msg, False, gr.update(value="Run Evaluation") - return - - if not is_running: - should_stop = False - if run_mode == "perf": - yield from run_perf( - inputs, native, other, output_choices, - api_url, api_token, - api_provider, dataset, - max_tokens, min_tokens, parallel_reqs, - max_prompt_len, num_requests, - model_override - ) - elif run_mode == "eval": - yield from run_eval_tool( - inputs, native, other, output_choices, - api_url, api_token, - api_provider, dataset, - max_tokens, min_tokens, parallel_reqs, - max_prompt_len, num_requests, - model_override - ) - elif run_mode == "app": - yield "[⚠️ 当前为 app 模式,请手动打开 http://localhost:7901 查看报告]", False, gr.update(value="Run Evaluation") - else: - msg = stop_eval() - yield msg, False, gr.update(value="Run Evaluation") - - -# ---------------- 输入源互斥逻辑 ---------------- -def enforce_input_exclusive_and_toggle_fields(selected): - order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"] - group1 = {"API Models", "Local Models"} - group2 = {"Benchmarks", "Custom Datasets"} - - def keep_only_one(group): - filtered = [item for item in selected if item in group] - return filtered[-1:] - - final_sel = set(selected) - final_sel -= group1 - final_sel |= set(keep_only_one(group1)) - final_sel -= group2 - final_sel |= set(keep_only_one(group2)) - - final_list = [itm for itm in order if itm in final_sel] - input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list) - api_field_update = gr.update(visible="API Models" in final_sel) - return input_update, api_field_update - -# ---------------- UI 构建 ---------------- -with gr.Blocks(title="EvalScope 全功能界面") as demo: - is_running = gr.State(value=False) - - with gr.Group(): - with gr.Row(): - mode_dropdown = gr.Dropdown( - label="评测类型", - choices=["eval", "perf", "app"], - value="perf", - info="eval: 智力评测;perf: 性能评测;app: 可视化" - ) - - with gr.Group(): - with gr.Row(): - input_choices = gr.CheckboxGroup( - label="选择输入源", - choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"], - interactive=True - ) - - with gr.Column(visible=False) as api_fields: - api_url_input = gr.Textbox(label="API 地址", placeholder="https://.../v1/chat/completions") - api_token_input = gr.Textbox(label="Token 密钥", type="password", placeholder="sk-xxx") - with gr.Accordion("运行参数(可选修改)", open=False): - with gr.Row(): - api_provider_dropdown = 
gr.Dropdown(label="API Provider", choices=["openai", "azure", "ollama", "gemini"], value="openai") - dataset_dropdown = gr.Dropdown(label="评测数据集 (--dataset)", choices=PERF_DATASETS, value=PERF_DATASETS[0]) - model_override_input = gr.Textbox(label="自定义模型名 (--model)", placeholder="llm-name") - with gr.Row(): - max_tokens_slider = gr.Slider(label="Max Tokens", minimum=256, maximum=8192, step=256, value=1024) - min_tokens_slider = gr.Slider(label="Min Tokens", minimum=0, maximum=4096, step=64, value=1024) - with gr.Row(): - parallel_slider = gr.Slider(label="并发请求数", minimum=1, maximum=100, step=1, value=1) - num_req_slider = gr.Slider(label="请求条数", minimum=1, maximum=1000, step=1, value=100) - max_prompt_len_slider = gr.Slider(label="最大 Prompt 长度", minimum=2048, maximum=262144, step=512, value=15360) - - with gr.Row(): - with gr.Column(): - native_choices = gr.CheckboxGroup(label="启用本地模块", choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"]) - with gr.Column(): - other_choices = gr.CheckboxGroup(label="启用外部后端", choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"]) - - output_choices = gr.CheckboxGroup(label="输出形式", choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]) - run_button = gr.Button("Run Evaluation") - output_text = gr.TextArea(label="执行结果", lines=20, interactive=False, show_copy_button=True) - - input_choices.change( - fn=enforce_input_exclusive_and_toggle_fields, - inputs=input_choices, - outputs=[input_choices, api_fields] - ) - - mode_dropdown.change( - lambda mode: gr.update( - choices=EVAL_DATASETS if mode == "eval" else PERF_DATASETS, - value=EVAL_DATASETS[0] if mode == "eval" else PERF_DATASETS[0] - ), - inputs=mode_dropdown, - outputs=dataset_dropdown - ) - - run_button.click( - fn=toggle_run, - inputs=[ - input_choices, native_choices, other_choices, - output_choices, - api_url_input, api_token_input, - api_provider_dropdown, dataset_dropdown, - max_tokens_slider, min_tokens_slider, parallel_slider, - max_prompt_len_slider, num_req_slider, - model_override_input, - is_running, - mode_dropdown - ], - outputs=[output_text, is_running, run_button], - show_progress=True - ) - -if __name__ == "__main__": - demo.launch(server_name="0.0.0.0", server_port=7900) diff --git a/gradio_ui_old.py b/gradio_ui_old.py deleted file mode 100644 index b75bf75..0000000 --- a/gradio_ui_old.py +++ /dev/null @@ -1,402 +0,0 @@ -import time -import os -import glob -import threading -import subprocess -import gradio as gr -import psutil -import signal - -# ---------------- 全局进程句柄 ---------------- -current_process = None -should_stop = False - -# ---------------- 核心运行函数 ---------------- -def run_perf( - inputs, native, other, output_choices, - api_url, api_token, - api_provider, dataset, - max_tokens, min_tokens, parallel_reqs, - max_prompt_len, num_requests, - model_override -): - global current_process - - timestamp = time.strftime("%Y%m%d-%H%M%S") - model_name = model_override.strip() or timestamp - - command = [ - "evalscope", "perf", - "--url", api_url.strip(), - "--api", api_provider, - "--model", model_name, - "--dataset", dataset, - "--max-tokens", str(int(max_tokens)), - "--min-tokens", str(int(min_tokens)), - "--parallel", str(int(parallel_reqs)), - "--max-prompt-length", str(int(max_prompt_len)), - "--number", str(int(num_requests)), - "--api-key", api_token.strip(), - ] - - full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n" - yield full_output, True, gr.update(value="Stop Evaluation") - - try: - current_process = subprocess.Popen( - 
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - text=True, bufsize=1, start_new_session=True - ) - - for line in current_process.stdout: - if should_stop: - break - full_output += line - yield full_output, True, gr.update(value="Stop Evaluation") - - current_process.stdout.close() - current_process.wait() - - except Exception as e: - full_output += f"[Error] {e}\n" - yield full_output, False, gr.update(value="Run Evaluation") - - finally: - current_process = None - - full_output += "[Eval Finished]\n" - - if "Evaluation Report" in output_choices: - vis_port = 7901 - outputs_root = "./outputs" - try: - latest_output = max( - glob.glob(os.path.join(outputs_root, "*")), - key=os.path.getmtime - ) - except ValueError: - latest_output = outputs_root - - vis_cmd = [ - "evalscope", "app", - "--outputs", outputs_root, - "--server-name", "0.0.0.0", - "--server-port", str(vis_port), - ] - threading.Thread( - target=subprocess.Popen, - args=(vis_cmd,), - kwargs={"stdout": subprocess.DEVNULL, - "stderr": subprocess.STDOUT}, - daemon=True - ).start() - - full_output += f"[Visualization 👉] http://localhost:{vis_port}\n" - - yield full_output, False, gr.update(value="Run Evaluation") - - -# ---------------- 停止函数 ---------------- -def stop_eval(): - global current_process, should_stop - should_stop = True - - if current_process and current_process.poll() is None: - try: - pgid = os.getpgid(current_process.pid) - os.killpg(pgid, signal.SIGINT) # ✅ 优雅终止 - time.sleep(2) - if current_process.poll() is None: - os.killpg(pgid, signal.SIGKILL) # ❗ 强制终止 - return "[✅ 已发送终止信号 (SIGINT → SIGKILL fallback)]\n" - except Exception as e: - return f"[❌ 终止失败: {e}]\n" - finally: - current_process = None - else: - return "[⚠️ 无活动 evalscope 进程]\n" - - - -# ---------------- Run/Stop 控制器 ---------------- -def toggle_run( - inputs, native, other, output_choices, - api_url, api_token, - api_provider, dataset, - max_tokens, min_tokens, parallel_reqs, - max_prompt_len, num_requests, - model_override, - is_running, - run_mode # 👈 增加这个参数 -): - global should_stop - - if not inputs: - msg = "[❌ 错误] 必须至少选择一个输入源(API、本地、基准或自定义)才能开始运行。\n" - yield msg, False, gr.update(value="Run Evaluation") - return - - if not is_running: - should_stop = False - if run_mode == "perf": - yield from run_perf( - inputs, native, other, output_choices, - api_url, api_token, - api_provider, dataset, - max_tokens, min_tokens, parallel_reqs, - max_prompt_len, num_requests, - model_override - ) - elif run_mode == "eval": - yield from run_eval_tool( - inputs, native, other, output_choices, - api_url, api_token, - api_provider, dataset, - max_tokens, min_tokens, parallel_reqs, - max_prompt_len, num_requests, - model_override - ) - else: - msg = stop_eval() - yield msg, False, gr.update(value="Run Evaluation") - - -# ---------------- 互斥逻辑 ---------------- -def enforce_input_exclusive_and_toggle_fields(selected): - order = ["API Models", "Local Models", "Benchmarks", "Custom Datasets"] - group1 = {"API Models", "Local Models"} - group2 = {"Benchmarks", "Custom Datasets"} - - def keep_only_one(group): - filtered = [item for item in selected if item in group] - return filtered[-1:] - - final_sel = set(selected) - final_sel -= group1 - final_sel |= set(keep_only_one(group1)) - final_sel -= group2 - final_sel |= set(keep_only_one(group2)) - - final_list = [itm for itm in order if itm in final_sel] - - input_update = gr.update() if list(selected) == final_list else gr.update(value=final_list) - - show_api_fields = "API Models" in final_sel - api_field_update = 
gr.update(visible=show_api_fields) # ✅ 正确 - - return input_update, api_field_update - - - -def run_eval_tool( - inputs, native, other, output_choices, - api_url, api_token, - api_provider, dataset, - max_tokens, min_tokens, parallel_reqs, - max_prompt_len, num_requests, - model_override -): - global current_process - - timestamp = time.strftime("%Y%m%d-%H%M%S") - model_name = model_override.strip() or timestamp - - command = [ - "evalscope", "eval", - "--model", model_name, - "--datasets", dataset - ] - if api_url.strip(): - command += [ - "--eval-type", "service", - "--api-url", api_url.strip(), - "--api-key", api_token.strip() - ] - if num_requests: - command += ["--limit", str(int(num_requests))] - - full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n" - yield full_output, True, gr.update(value="Stop Evaluation") - - try: - current_process = subprocess.Popen( - command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - text=True, bufsize=1, start_new_session=True - ) - - for line in current_process.stdout: - if should_stop: - break - full_output += line - yield full_output, True, gr.update(value="Stop Evaluation") - - current_process.stdout.close() - current_process.wait() - - except Exception as e: - full_output += f"[Error] {e}\n" - yield full_output, False, gr.update(value="Run Evaluation") - - finally: - current_process = None - - full_output += "[Eval Finished]\n" - - if "Evaluation Report" in output_choices: - vis_port = 7901 - outputs_root = "./outputs" - try: - latest_output = max( - glob.glob(os.path.join(outputs_root, "*")), - key=os.path.getmtime - ) - except ValueError: - latest_output = outputs_root - - vis_cmd = [ - "evalscope", "app", - "--outputs", outputs_root, - "--server-name", "0.0.0.0", - "--server-port", str(vis_port), - ] - threading.Thread( - target=subprocess.Popen, - args=(vis_cmd,), - kwargs={"stdout": subprocess.DEVNULL, - "stderr": subprocess.STDOUT}, - daemon=True - ).start() - - full_output += f"[Visualization 👉] http://localhost:{vis_port}\n" - - yield full_output, False, gr.update(value="Run Evaluation") - - - - -# ---------------- 构建 Gradio UI ---------------- -with gr.Blocks(title="EvalScope 全功能界面") as demo: - is_running = gr.State(value=False) - - with gr.Group(): - with gr.Row(): - mode_dropdown = gr.Dropdown( - label="评测类型", - info="eval: 智力评测;perf: 推理性能;app: Web 可视化", - choices=["eval", "perf", "app"], - value="perf" - ) - - # ===== 输入源 ===== - with gr.Group(): - with gr.Row(): - input_choices = gr.CheckboxGroup( - label="选择输入源", - choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"], - interactive=True - ) - - # ===== API 地址 & 运行参数(统一控制显示) ===== - with gr.Column(visible=False) as api_fields: - api_url_input = gr.Textbox( - label="API 地址", - placeholder="https://ai.aiszaiai.com/v1/chat/completions" - ) - api_token_input = gr.Textbox( - label="Token 密钥", - type="password", - placeholder="sk-xxx" - ) - with gr.Accordion("运行参数(可选修改)", open=False): - with gr.Row(): - api_provider_dropdown = gr.Dropdown( - label="API Provider (--api)", - choices=["openai", "azure", "ollama", "gemini"], - value="openai" - ) - dataset_dropdown = gr.Dropdown( - label="评测数据集 (--dataset)", - choices=["openqa", "flickr8k", "longalpaca", "random_dataset", "line_by_line", "custom", "speed_benchmark"], - value="openqa" - ) - model_override_input = gr.Textbox( - label="自定义模型名 (--model),留空则使用时间戳", - placeholder="e.g. 
my-llm-7b" - ) - with gr.Row(): - max_tokens_slider = gr.Slider( - label="Max Tokens (--max-tokens)", - minimum=256, maximum=8192, step=256, value=1024 - ) - min_tokens_slider = gr.Slider( - label="Min Tokens (--min-tokens)", - minimum=0, maximum=4096, step=64, value=1024 - ) - with gr.Row(): - parallel_slider = gr.Slider( - label="并发请求数 (--parallel)", - minimum=1, maximum=16, step=1, value=1 - ) - num_req_slider = gr.Slider( - label="请求条数 (--number)", - minimum=1, maximum=1000, step=1, value=100 - ) - max_prompt_len_slider = gr.Slider( - label="最大 Prompt 长度 (--max-prompt-length)", - minimum=2048, maximum=32768, step=512, value=15360 - ) - - # ===== 本地/外部组件 ===== - with gr.Row(): - with gr.Column(): - native_choices = gr.CheckboxGroup( - label="启用本地模块", - choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"] - ) - with gr.Column(): - other_choices = gr.CheckboxGroup( - label="启用外部后端", - choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"] - ) - - # ===== 输出形式 ===== - output_choices = gr.CheckboxGroup( - label="输出形式", - choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"] - ) - - # ===== 控制按钮 & 日志 ===== - run_button = gr.Button("Run Evaluation") - output_text = gr.TextArea( - label="执行结果", - lines=20, - interactive=False, - show_copy_button=True - ) - - # ===== 绑定事件 ===== - input_choices.change( - fn=enforce_input_exclusive_and_toggle_fields, - inputs=input_choices, - outputs=[input_choices, api_fields] # ✅ 只输出这两个 - ) - - run_button.click( - fn=toggle_run, - inputs=[ - input_choices, native_choices, other_choices, - output_choices, - api_url_input, api_token_input, - api_provider_dropdown, dataset_dropdown, - max_tokens_slider, min_tokens_slider, parallel_slider, - max_prompt_len_slider, num_req_slider, - model_override_input, - is_running, - mode_dropdown # ✅ 改为新的变量 - ], - outputs=[output_text, is_running, run_button], - show_progress=True - ) - -if __name__ == "__main__": - demo.launch(server_name="0.0.0.0", server_port=7900)