# evalscope_v0.17.0/gradio_ui.py

import time
import os
import glob
import threading
import subprocess
import gradio as gr
import psutil
import signal

# ---------------- Global process handle ----------------
# Popen handle of the evalscope subprocess started by run_eval; None when no run is active.
current_process = None
# Set to True by stop_eval so run_eval's output loop exits; toggle_run resets it before each run.
should_stop = False

# ---------------- Core run function ----------------
def run_eval(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override
):
    """Run ``evalscope perf`` as a subprocess and stream its output to the UI.

    This is a generator meant to be used as a Gradio event handler. Every
    yielded item is a 3-tuple ``(log_text, is_running, run_button_update)``
    where ``log_text`` is the cumulative log, ``is_running`` is the new value
    of the running-state flag and ``run_button_update`` relabels the button.

    Args:
        inputs, native, other: Selections from the UI; accepted for interface
            compatibility but not used when building the command.
        output_choices: Selected output forms; when it contains
            ``"Evaluation Report"`` the ``evalscope app`` viewer is launched
            after the run.
        api_url: Endpoint passed as ``--url``.
        api_token: Secret passed as ``--api-key``; never echoed to the log.
        api_provider: Value for ``--api``.
        dataset: Value for ``--dataset``.
        max_tokens, min_tokens, parallel_reqs, max_prompt_len, num_requests:
            Numeric settings, truncated to ``int`` before being passed on.
        model_override: Value for ``--model``; a timestamp is used when blank.

    Yields:
        ``(str, bool, gradio update)`` tuples as described above.
    """
    global current_process

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_name = (model_override or "").strip() or timestamp
    api_key = (api_token or "").strip()

    command = [
        "evalscope", "perf",
        "--url", (api_url or "").strip(),
        "--api", api_provider,
        "--model", model_name,
        "--dataset", dataset,
        "--max-tokens", str(int(max_tokens)),
        "--min-tokens", str(int(min_tokens)),
        "--parallel", str(int(parallel_reqs)),
        "--max-prompt-length", str(int(max_prompt_len)),
        "--number", str(int(num_requests)),
        "--api-key", api_key,
    ]

    # The command is echoed into a copyable text area, so the secret must be
    # masked in the displayed copy (the real value is still passed to Popen).
    shown_command = list(command)
    if api_key:
        shown_command[-1] = "****"

    full_output = f"[Eval Started @ {timestamp}]\nCmd: {' '.join(shown_command)}\n"
    yield full_output, True, gr.update(value="Stop Evaluation")

    # Work on a local handle: stop_eval may touch the global concurrently, and
    # dereferencing the global here could hit None in the middle of the loop.
    proc = None
    try:
        current_process = proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1, start_new_session=True
        )

        for line in proc.stdout:
            if should_stop:
                break
            full_output += line
            yield full_output, True, gr.update(value="Stop Evaluation")

        proc.stdout.close()
        proc.wait()

    except Exception as e:
        full_output += f"[Error] {e}\n"
        yield full_output, False, gr.update(value="Run Evaluation")

    finally:
        # Only clear the global if it still refers to the process we started.
        if current_process is proc:
            current_process = None

    full_output += "[Eval Finished]\n"

    if "Evaluation Report" in (output_choices or ()):
        vis_port = 7861
        outputs_root = "./outputs"

        vis_cmd = [
            "evalscope", "app",
            "--outputs", outputs_root,
            "--server-name", "0.0.0.0",
            "--server-port", str(vis_port),
        ]
        # Popen does not block, so no helper thread is needed; calling it
        # directly lets a failed launch be reported instead of silently lost.
        # NOTE(review): every run starts another viewer on the same port.
        try:
            subprocess.Popen(
                vis_cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.STDOUT,
            )
        except OSError as e:
            full_output += f"[Visualization failed to start] {e}\n"
        else:
            full_output += f"[Visualization 👉] http://localhost:{vis_port}\n"

    yield full_output, False, gr.update(value="Run Evaluation")


# ---------------- Stop function ----------------
def stop_eval():
    """Ask the running evalscope process group to stop.

    Sets ``should_stop`` so the streaming loop in ``run_eval`` exits, then
    sends SIGINT to the subprocess's process group and escalates to SIGKILL
    if the process has not exited within two seconds.

    The global ``current_process`` is deliberately not cleared here:
    ``run_eval`` is still using the handle and resets it in its own
    ``finally`` block.

    Returns:
        A one-line status message (terminated, failed, or nothing to stop).
    """
    global current_process, should_stop
    should_stop = True

    no_process_msg = "[⚠️ 无活动 evalscope 进程]\n"

    # Snapshot the handle once; run_eval may set the global to None while we
    # are waiting, which must not turn into an AttributeError here.
    proc = current_process
    if proc is None or proc.poll() is not None:
        return no_process_msg

    try:
        pgid = os.getpgid(proc.pid)
        os.killpg(pgid, signal.SIGINT)
        try:
            # Returns as soon as the process exits instead of always
            # sleeping for the full grace period.
            proc.wait(timeout=2)
        except subprocess.TimeoutExpired:
            os.killpg(pgid, signal.SIGKILL)
        return "[✅ 已发送终止信号 (SIGINT → SIGKILL fallback)]\n"
    except ProcessLookupError:
        # The process (group) vanished between the liveness check and the
        # signal: there is nothing left to stop.
        return no_process_msg
    except Exception as e:
        return f"[❌ 终止失败: {e}]\n"


# ---------------- Run/Stop controller ----------------
def toggle_run(
    inputs, native, other, output_choices,
    api_url, api_token,
    api_provider, dataset,
    max_tokens, min_tokens, parallel_reqs,
    max_prompt_len, num_requests,
    model_override,
    is_running
):
    """Dispatch a click on the Run/Stop button.

    When ``is_running`` is truthy the click means "stop": ``stop_eval`` is
    called and its message is yielded once. Otherwise the stop flag is reset
    and everything ``run_eval`` yields is forwarded unchanged.

    Yields:
        ``(log_text, is_running, run_button_update)`` tuples.
    """
    global should_stop

    # Stop request: report the outcome and end the generator.
    if is_running:
        stop_message = stop_eval()
        yield stop_message, False, gr.update(value="Run Evaluation")
        return

    # Fresh run: clear any stop request left over from a previous run.
    should_stop = False
    eval_args = (
        inputs, native, other, output_choices,
        api_url, api_token,
        api_provider, dataset,
        max_tokens, min_tokens, parallel_reqs,
        max_prompt_len, num_requests,
        model_override,
    )
    yield from run_eval(*eval_args)


# ---------------- Button enable/disable logic ----------------
def update_button_enable(inputs, outputs):
    """Return an update making the run button interactive iff anything is selected.

    The button is enabled when either ``inputs`` or ``outputs`` is truthy.
    """
    has_selection = True if (inputs or outputs) else False
    return gr.update(interactive=has_selection)


# ---------------- Mutual-exclusion logic ----------------
def enforce_input_exclusive_and_toggle_fields(selected):
    """Keep at most one choice per exclusive group and toggle the API fields.

    The input sources form two exclusive groups (models and data). Within
    each group only the entry that appears last in ``selected`` survives.

    Returns:
        A pair of Gradio updates: one for the checkbox group (a no-op when
        the selection is already normalized) and one setting the visibility
        of the API fields, shown only while "API Models" is selected.
    """
    canonical_order = ("API Models", "Local Models", "Benchmarks", "Custom Datasets")
    exclusive_groups = (
        {"API Models", "Local Models"},
        {"Benchmarks", "Custom Datasets"},
    )

    survivors = set(selected)
    for group in exclusive_groups:
        picked = [choice for choice in selected if choice in group]
        survivors -= group
        if picked:
            # The last entry of the group in `selected` wins.
            survivors.add(picked[-1])

    normalized = [choice for choice in canonical_order if choice in survivors]

    # Only push a new value when it differs, to avoid a redundant update.
    if list(selected) == normalized:
        selection_update = gr.update()
    else:
        selection_update = gr.update(value=normalized)

    visibility_update = gr.update(visible="API Models" in survivors)
    return selection_update, visibility_update


# ---------------- Build the Gradio UI ----------------
# Layout: input-source selector, API settings (hidden until "API Models" is
# selected), module/backend selectors, output selector, run/stop button and
# the log area. Event wiring follows the component definitions.
with gr.Blocks(title="EvalScope 全功能界面") as demo:
    # Tracks whether an evaluation is in progress; toggle_run reads and updates it.
    is_running = gr.State(value=False)

    with gr.Group():
        with gr.Row():
            input_choices = gr.CheckboxGroup(
                label="选择输入源",
                choices=["API Models", "Local Models", "Benchmarks", "Custom Datasets"],
                interactive=True
            )

    with gr.Column(visible=False) as api_fields:
        api_url_input = gr.Textbox(label="API 地址", placeholder="https://api.example.com/v1/chat")
        api_token_input = gr.Textbox(label="Token 密钥", type="password", placeholder="sk-xxx")
        with gr.Accordion("运行参数（可选修改）", open=False):
            with gr.Row():
                api_provider_dropdown = gr.Dropdown(
                    label="API Provider (--api)",
                    choices=["openai", "azure", "ollama", "gemini"],
                    value="openai"
                )
                dataset_dropdown = gr.Dropdown(
                    label="评测数据集 (--dataset)",
                    choices=["openqa", "gsm8k", "mmlu", "truthfulqa"],
                    value="openqa"
                )
            model_override_input = gr.Textbox(
                label="自定义模型名 (--model)，留空则使用时间戳",
                placeholder="e.g. my-llm-7b"
            )
            # gr.Slider's positional parameters are (minimum, maximum, value);
            # label and step are keyword-only, so everything is passed by
            # keyword to avoid a TypeError at construction time.
            with gr.Row():
                max_tokens_slider = gr.Slider(
                    label="Max Tokens (--max-tokens)",
                    minimum=256, maximum=8192, step=256, value=1024
                )
                min_tokens_slider = gr.Slider(
                    label="Min Tokens (--min-tokens)",
                    minimum=0, maximum=4096, step=64, value=1024
                )
            with gr.Row():
                parallel_slider = gr.Slider(
                    label="并发请求数 (--parallel)",
                    minimum=1, maximum=16, step=1, value=1
                )
                num_req_slider = gr.Slider(
                    label="请求条数 (--number)",
                    minimum=1, maximum=1000, step=1, value=100
                )
            max_prompt_len_slider = gr.Slider(
                label="最大 Prompt 长度 (--max-prompt-length)",
                minimum=2048, maximum=32768, step=512, value=15360
            )

    with gr.Row():
        with gr.Column():
            native_choices = gr.CheckboxGroup(
                label="启用本地模块",
                choices=["Model Adapter", "Data Adapter", "Evaluator", "Perf Monitor"]
            )
        with gr.Column():
            other_choices = gr.CheckboxGroup(
                label="启用外部后端",
                choices=["OpenCompass", "VLMEvalKit", "RAGAS", "MTEB/CMTEB"]
            )

    output_choices = gr.CheckboxGroup(
        label="输出形式",
        choices=["Evaluation Report", "Gradio", "WandB", "Swanlab"]
    )

    # Disabled until an input source or an output form is selected.
    run_button = gr.Button("Run Evaluation", interactive=False)

    output_text = gr.TextArea(
        label="执行结果",
        lines=20,
        interactive=False,
        show_copy_button=True
    )

    # Normalize the input selection and show/hide the API settings.
    input_choices.change(
        fn=enforce_input_exclusive_and_toggle_fields,
        inputs=input_choices,
        outputs=[input_choices, api_fields]
    )

    # Re-evaluate whether the run button may be clicked on any selection change.
    input_choices.change(
        fn=update_button_enable,
        inputs=[input_choices, output_choices],
        outputs=run_button
    )

    output_choices.change(
        fn=update_button_enable,
        inputs=[input_choices, output_choices],
        outputs=run_button
    )

    # The same button starts and stops a run; toggle_run decides based on is_running.
    run_button.click(
        fn=toggle_run,
        inputs=[
            input_choices, native_choices, other_choices,
            output_choices,
            api_url_input, api_token_input,
            api_provider_dropdown, dataset_dropdown,
            max_tokens_slider, min_tokens_slider, parallel_slider,
            max_prompt_len_slider, num_req_slider,
            model_override_input,
            is_running
        ],
        outputs=[output_text, is_running, run_button],
        show_progress=True
    )

# Script entry point: when run directly, serve the UI on 0.0.0.0, port 7900.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7900)