diff --git a/gradio_ui.ok.py b/gradio_ui.ok.py
new file mode 100644
index 0000000..5b76082
--- /dev/null
+++ b/gradio_ui.ok.py
@@ -0,0 +1,214 @@
+import time
+import os
+import glob
+import threading
+import subprocess
+import gradio as gr
+
+# Global variable: current subprocess
+current_process = None
+
+
+# ⬇️⬇️⬇️ Run EvalScope and (optionally) launch the visualization service ⬇️⬇️⬇️
+def run_eval(inputs, native, other, output_choices, api_url, api_token):
+    """
+    1. Run the benchmark via `evalscope perf …`
+    2. If the user checks "Evaluation Report", start the `evalscope app`
+       web visualization service in the background and append its URL to the log
+    """
+    global current_process
+
+    timestamp = time.strftime("%Y%m%d-%H%M%S")
+    command = [
+        "evalscope", "perf",
+        "--url", api_url.strip(),
+        "--api", "openai",
+        "--model", timestamp,  # use the timestamp as the model name to avoid clashes
+        "--dataset", "openqa",
+        "--max-tokens", "1024",
+        "--min-tokens", "1024",
+        "--parallel", "1",
+        "--max-prompt-length", "15360",
+        "--number", "100",
+        "--api-key", api_token.strip(),
+    ]
+
+    full_output = f"[Eval Started @ {timestamp}]\n"
+    yield full_output, True, gr.update(value="Stop Evaluation")
+
+    try:
+        current_process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+            text=True, bufsize=1
+        )
+
+        # Stream output in real time
+        for line in current_process.stdout:
+            full_output += line
+            yield full_output, True, gr.update(value="Stop Evaluation")
+
+        current_process.stdout.close()
+        current_process.wait()
+
+    except Exception as e:
+        full_output += f"[Error] {e}\n"
+        yield full_output, False, gr.update(value="Run Evaluation")
+
+    finally:
+        current_process = None
+
+    full_output += "[Eval Finished]\n"
+
+    # ========== Visualization report ==========
+    if "Evaluation Report" in output_choices:
+        vis_port = 7861
+        outputs_root = "./outputs"
+        # ⬇️ EvalScope perf writes a timestamped directory under outputs_root;
+        #    keep the latest one as a fallback (the UI only needs the root for now)
+        try:
+            latest_output = max(
+                glob.glob(os.path.join(outputs_root, "*")),
+                key=os.path.getmtime
+            )
+        except ValueError:
+            latest_output = outputs_root  # fallback if outputs does not exist yet
+
+        vis_cmd = [
+            "evalscope", "app",
+            "--outputs", outputs_root,
+            "--server-name", "0.0.0.0",
+            "--server-port", str(vis_port),
+        ]
+
+        # Launch in a background thread so the UI is not blocked
+        threading.Thread(
+            target=subprocess.Popen,
+            args=(vis_cmd,),
+            kwargs={"stdout": subprocess.DEVNULL,
+                    "stderr": subprocess.STDOUT},
+            daemon=True
+        ).start()
+
+        full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"
+
+    yield full_output, False, gr.update(value="Run Evaluation")
+
+
+# ⬇️⬇️⬇️ Stop button logic ⬇️⬇️⬇️
+def stop_eval():
+    global current_process
+    if current_process and current_process.poll() is None:
+        current_process.terminate()
+        current_process = None
+        return "[Stopped by user]\n"
+    return "[No active process]\n"
+
+
+# ⬇️⬇️⬇️ Run/Stop controller (must be a generator) ⬇️⬇️⬇️
+def toggle_run(inputs, native, other, output_choices,
+               api_url, api_token, is_running):
+    if not is_running:
+        # Start the run
+        yield from run_eval(inputs, native, other,
+                            output_choices, api_url, api_token)
+    else:
+        # User clicked Stop
+        msg = stop_eval()
+        yield msg, False, gr.update(value="Run Evaluation")
+
+
+# ⬇️⬇️⬇️ Mutual exclusion: keep only the last choice in each group ⬇️⬇️⬇️
+def enforce_input_exclusive_and_toggle_fields(selected):
+    group1 = {"API Models", "Local Models"}
+    group2 = {"Benchmarks", "Custom Datasets"}
+
+    def keep_only_one(group):
+        filtered = [item for item in selected if item in group]
+        return filtered[-1:]
+
+    final_selection = set(selected)
+    final_selection -= group1
+    final_selection |= set(keep_only_one(group1))
+
+    final_selection -= group2
+    final_selection |= set(keep_only_one(group2))
+
+    show_api_fields = "API Models" in final_selection
+    return (
+        gr.update(value=list(final_selection)),
+        gr.Row.update(visible=show_api_fields)
+    )
+
+
+# ------------- Build the Gradio UI -------------
+with gr.Blocks(title="EvalScope All-in-One UI") as demo:
+    is_running = gr.State(value=False)
+
+    with gr.Group():
+        with gr.Row():
+            input_choices = gr.CheckboxGroup(
+                label="Select input sources",
+                choices=["API Models", "Local Models",
+                         "Benchmarks", "Custom Datasets"],
+                interactive=True
+            )
+
+        with gr.Row(visible=False) as api_fields:
+            api_url_input = gr.Textbox(
+                label="API URL",
+                placeholder="https://api.example.com/v1/chat"
+            )
+            api_token_input = gr.Textbox(
+                label="API token",
+                type="password",
+                placeholder="sk-xxx"
+            )
+
+        with gr.Row():
+            with gr.Column():
+                native_choices = gr.CheckboxGroup(
+                    label="Enable native modules",
+                    choices=["Model Adapter", "Data Adapter",
+                             "Evaluator", "Perf Monitor"]
+                )
+            with gr.Column():
+                other_choices = gr.CheckboxGroup(
+                    label="Enable external backends",
+                    choices=["OpenCompass", "VLMEvalKit",
+                             "RAGAS", "MTEB/CMTEB"]
+                )
+
+        with gr.Row():
+            output_choices = gr.CheckboxGroup(
+                label="Output formats",
+                choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
+            )
+
+    run_button = gr.Button("Run Evaluation")
+    output_text = gr.TextArea(
+        label="Execution log",
+        lines=20,
+        interactive=False,
+        show_copy_button=True
+    )
+
+    # Wire up input mutual exclusion
+    input_choices.change(
+        fn=enforce_input_exclusive_and_toggle_fields,
+        inputs=input_choices,
+        outputs=[input_choices, api_fields]
+    )
+
+    # Wire up Run/Stop
+    run_button.click(
+        fn=toggle_run,
+        inputs=[
+            input_choices, native_choices, other_choices,
+            output_choices, api_url_input, api_token_input, is_running
+        ],
+        outputs=[output_text, is_running, run_button],
+        show_progress=True
+    )
+
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7900)
diff --git a/gradio_ui.py b/gradio_ui.py
index 5b76082..86518db 100644
--- a/gradio_ui.py
+++ b/gradio_ui.py
@@ -5,35 +5,44 @@ import threading
 import subprocess
 import gradio as gr
 
-# Global variable: current subprocess
+# ---------------- Global process handle ----------------
 current_process = None
 
 
-# ⬇️⬇️⬇️ Run EvalScope and (optionally) launch the visualization service ⬇️⬇️⬇️
-def run_eval(inputs, native, other, output_choices, api_url, api_token):
+# ---------------- Core run function ----------------
+def run_eval(
+    inputs, native, other, output_choices,
+    api_url, api_token,
+    api_provider, dataset,
+    max_tokens, min_tokens, parallel_reqs,
+    max_prompt_len, num_requests,
+    model_override
+):
     """
-    1. Run the benchmark via `evalscope perf …`
-    2. If the user checks "Evaluation Report", start the `evalscope app`
-       web visualization service in the background and append its URL to the log
+    1. Dynamically assemble the evalscope perf command
+    2. Stream the log output
+    3. Optionally launch the visualization report
     """
     global current_process
 
     timestamp = time.strftime("%Y%m%d-%H%M%S")
+    model_name = model_override.strip() or timestamp
+
     command = [
         "evalscope", "perf",
         "--url", api_url.strip(),
-        "--api", "openai",
-        "--model", timestamp,  # use the timestamp as the model name to avoid clashes
-        "--dataset", "openqa",
-        "--max-tokens", "1024",
-        "--min-tokens", "1024",
-        "--parallel", "1",
-        "--max-prompt-length", "15360",
-        "--number", "100",
+        "--api", api_provider,
+        "--model", model_name,
+        "--dataset", dataset,
+        "--max-tokens", str(int(max_tokens)),
+        "--min-tokens", str(int(min_tokens)),
+        "--parallel", str(int(parallel_reqs)),
+        "--max-prompt-length", str(int(max_prompt_len)),
+        "--number", str(int(num_requests)),
         "--api-key", api_token.strip(),
     ]
 
-    full_output = f"[Eval Started @ {timestamp}]\n"
+    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(command)}\n"
     yield full_output, True, gr.update(value="Stop Evaluation")
 
     try:
@@ -42,7 +51,6 @@ def run_eval(inputs, native, other, output_choices, api_url, api_token):
             text=True, bufsize=1
         )
 
-        # Stream output in real time
         for line in current_process.stdout:
             full_output += line
             yield full_output, True, gr.update(value="Stop Evaluation")
@@ -59,19 +67,17 @@ def run_eval(inputs, native, other, output_choices, api_url, api_token):
 
     full_output += "[Eval Finished]\n"
 
-    # ========== Visualization report ==========
+    # ---------- Visualization report ----------
     if "Evaluation Report" in output_choices:
         vis_port = 7861
         outputs_root = "./outputs"
-        # ⬇️ EvalScope perf writes a timestamped directory under outputs_root;
-        #    keep the latest one as a fallback (the UI only needs the root for now)
         try:
             latest_output = max(
                 glob.glob(os.path.join(outputs_root, "*")),
                 key=os.path.getmtime
            )
         except ValueError:
-            latest_output = outputs_root  # fallback if outputs does not exist yet
+            latest_output = outputs_root
 
         vis_cmd = [
             "evalscope", "app",
@@ -79,8 +85,6 @@ def run_eval(inputs, native, other, output_choices, api_url, api_token):
             "--server-name", "0.0.0.0",
             "--server-port", str(vis_port),
         ]
-
-        # Launch in a background thread so the UI is not blocked
         threading.Thread(
             target=subprocess.Popen,
             args=(vis_cmd,),
@@ -94,7 +98,7 @@ def run_eval(inputs, native, other, output_choices, api_url, api_token):
     yield full_output, False, gr.update(value="Run Evaluation")
 
 
-# ⬇️⬇️⬇️ Stop button logic ⬇️⬇️⬇️
+# ---------------- Stop function ----------------
 def stop_eval():
     global current_process
     if current_process and current_process.poll() is None:
@@ -104,20 +108,31 @@ def stop_eval():
     return "[No active process]\n"
 
 
-# ⬇️⬇️⬇️ Run/Stop controller (must be a generator) ⬇️⬇️⬇️
-def toggle_run(inputs, native, other, output_choices,
-               api_url, api_token, is_running):
+# ---------------- Run/Stop controller ----------------
+def toggle_run(
+    inputs, native, other, output_choices,
+    api_url, api_token,
+    api_provider, dataset,
+    max_tokens, min_tokens, parallel_reqs,
+    max_prompt_len, num_requests,
+    model_override,
+    is_running
+):
     if not is_running:
-        # Start the run
-        yield from run_eval(inputs, native, other,
-                            output_choices, api_url, api_token)
+        yield from run_eval(
+            inputs, native, other, output_choices,
+            api_url, api_token,
+            api_provider, dataset,
+            max_tokens, min_tokens, parallel_reqs,
+            max_prompt_len, num_requests,
+            model_override
+        )
     else:
-        # User clicked Stop
         msg = stop_eval()
         yield msg, False, gr.update(value="Run Evaluation")
 
 
-# ⬇️⬇️⬇️ Mutual exclusion: keep only the last choice in each group ⬇️⬇️⬇️
+# ---------------- Mutual exclusion logic ----------------
 def enforce_input_exclusive_and_toggle_fields(selected):
     group1 = {"API Models", "Local Models"}
     group2 = {"Benchmarks", "Custom Datasets"}
@@ -140,10 +155,11 @@ def enforce_input_exclusive_and_toggle_fields(selected):
     )
 
 
-# ------------- Build the Gradio UI -------------
+# ---------------- Build the Gradio UI ----------------
with gr.Blocks(title="EvalScope 全功能界面") as demo: is_running = gr.State(value=False) + # ===== 输入源 ===== with gr.Group(): with gr.Row(): input_choices = gr.CheckboxGroup( @@ -153,6 +169,7 @@ with gr.Blocks(title="EvalScope 全功能界面") as demo: interactive=True ) + # ===== API 地址 & Token ===== with gr.Row(visible=False) as api_fields: api_url_input = gr.Textbox( label="API 地址", @@ -164,6 +181,7 @@ with gr.Blocks(title="EvalScope 全功能界面") as demo: placeholder="sk-xxx" ) + # ===== 本地/外部组件 ===== with gr.Row(): with gr.Column(): native_choices = gr.CheckboxGroup( @@ -178,12 +196,53 @@ with gr.Blocks(title="EvalScope 全功能界面") as demo: "RAGAS", "MTEB/CMTEB"] ) - with gr.Row(): - output_choices = gr.CheckboxGroup( - label="输出形式", - choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"] + # ===== 运行参数 ===== + with gr.Accordion("运行参数(可选修改)", open=False): + with gr.Row(): + api_provider_dropdown = gr.Dropdown( + label="API Provider (--api)", + choices=["openai", "azure", "ollama", "gemini"], + value="openai" + ) + dataset_dropdown = gr.Dropdown( + label="评测数据集 (--dataset)", + choices=["openqa", "gsm8k", "mmlu", "truthfulqa"], + value="openqa" + ) + model_override_input = gr.Textbox( + label="自定义模型名 (--model),留空则使用时间戳", + placeholder="e.g. my-llm-7b" + ) + with gr.Row(): + max_tokens_slider = gr.Slider( + label="Max Tokens (--max-tokens)", + minimum=256, maximum=8192, step=256, value=1024 + ) + min_tokens_slider = gr.Slider( + label="Min Tokens (--min-tokens)", + minimum=0, maximum=4096, step=64, value=1024 + ) + with gr.Row(): + parallel_slider = gr.Slider( + label="并发请求数 (--parallel)", + minimum=1, maximum=16, step=1, value=1 + ) + num_req_slider = gr.Slider( + label="请求条数 (--number)", + minimum=1, maximum=1000, step=1, value=100 + ) + max_prompt_len_slider = gr.Slider( + label="最大 Prompt 长度 (--max-prompt-length)", + minimum=2048, maximum=32768, step=512, value=15360 ) + # ===== 输出形式 ===== + output_choices = gr.CheckboxGroup( + label="输出形式", + choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"] + ) + + # ===== 控制按钮 & 日志 ===== run_button = gr.Button("Run Evaluation") output_text = gr.TextArea( label="执行结果", @@ -192,19 +251,24 @@ with gr.Blocks(title="EvalScope 全功能界面") as demo: show_copy_button=True ) - # 绑定输入互斥 + # ===== 绑定事件 ===== input_choices.change( fn=enforce_input_exclusive_and_toggle_fields, inputs=input_choices, outputs=[input_choices, api_fields] ) - # 绑定 Run/Stop run_button.click( fn=toggle_run, inputs=[ input_choices, native_choices, other_choices, - output_choices, api_url_input, api_token_input, is_running + output_choices, + api_url_input, api_token_input, + api_provider_dropdown, dataset_dropdown, + max_tokens_slider, min_tokens_slider, parallel_slider, + max_prompt_len_slider, num_req_slider, + model_override_input, + is_running ], outputs=[output_text, is_running, run_button], show_progress=True