# sglang_v0.5.2/sglang/test/srt/test_gpt_oss_common.py
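"""Shared helpers for GPT-OSS accuracy tests.

This module defines ``BaseTestGptOss``, a base test case that launches an sglang
server for a GPT-OSS checkpoint and checks GPQA scores across reasoning-effort levels.
"""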

import os
from concurrent.futures import ThreadPoolExecutor
from types import SimpleNamespace
from typing import Dict, List, Literal, Optional

from sglang.srt.utils import is_hip, kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
    write_github_step_summary,
)

_base_url = DEFAULT_URL_FOR_TEST
_is_hip = is_hip()


class BaseTestGptOss(CustomTestCase):
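    """Base test case for GPT-OSS GPQA accuracy checks.

    Subclasses call ``run_test`` with a model variant, a quantization mode, and the
    minimum GPQA score expected for each reasoning-effort level.
    """
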
    def run_test(
        self,
        model_variant: Literal["20b", "120b"],
        quantization: Literal["mxfp4", "bf16"],
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: Optional[List[str]] = None,
    ):
        if other_args is None:
            other_args = []

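        # Map (variant, quantization) to the checkpoint that should be served.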
        model = {
            ("20b", "bf16"): "lmsys/gpt-oss-20b-bf16",
            ("120b", "bf16"): "lmsys/gpt-oss-120b-bf16",
            ("20b", "mxfp4"): "openai/gpt-oss-20b",
            ("120b", "mxfp4"): "openai/gpt-oss-120b",
        }[(model_variant, quantization)]

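        # Raise the CUDA graph max batch size for the smaller 20b variant.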
if model_variant == "20b":
other_args += ["--cuda-graph-max-bs", "600"]
        if _is_hip:
            os.environ["SGLANG_USE_AITER"] = "0"

        self._run_test_raw(
            model=model,
            expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
            other_args=other_args,
        )

    def _run_test_raw(
        self,
        model: str,
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: List[str],
    ):
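        """Launch a server for ``model`` and run one GPQA eval per reasoning effort."""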
        process = popen_launch_server(
            model,
            _base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

        try:
            # Run the evals in parallel: total time is dominated by the longest
            # generated sequence rather than by the number of questions.
            with ThreadPoolExecutor(max_workers=4) as executor:
                list(
                    executor.map(
                        lambda d: self._run_one_eval(**d),
                        [
                            dict(
                                model=model,
                                reasoning_effort=reasoning_effort,
                                expected_score=expected_score,
                            )
                            for reasoning_effort, expected_score in expected_score_of_reasoning_effort.items()
                        ],
                    )
                )
        finally:
            kill_process_tree(process.pid)

    def _run_one_eval(self, model, reasoning_effort, expected_score):
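        """Run the GPQA eval once for ``reasoning_effort`` and assert the score floor."""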
        args = SimpleNamespace(
            base_url=_base_url,
            model=model,
            eval_name="gpqa",
            num_examples=198,
            # use enough threads to run all questions in parallel
            num_threads=198,
            # TODO: 4k tokens is still not enough; we would need e.g. 64k, but that is
            # very slow. With too few tokens, many questions go unanswered.
            max_tokens=4096,
            # simple-evals defaults to temperature 0.5, which scores better than 0.0,
            # but we use 0.1 here for reproducibility
            temperature=0.1,
            reasoning_effort=reasoning_effort,
        )
setup = f"model={model} reasoning_effort={reasoning_effort} expected_score={expected_score}"
print(f"Evaluation start: {setup}")
metrics = run_eval(args)
print(f"Evaluation end: {setup} {metrics=}")
self.assertGreaterEqual(metrics["score"], expected_score)
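        # Surface the result in the GitHub Actions step summary when running in CI.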
        if is_in_ci():
            write_github_step_summary(
                f"### test_gpt_oss_common\n"
                f"Setup: {setup}\n"
                f"Score: {metrics['score']:.2f}\n"
            )
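

# A hypothetical concrete subclass, sketched only to illustrate how this base class
# is meant to be used; the class name and score thresholds below are illustrative,
# not values taken from the real test suite:
#
# class TestGptOss20bBf16(BaseTestGptOss):
#     def test_gpqa(self):
#         self.run_test(
#             model_variant="20b",
#             quantization="bf16",
#             expected_score_of_reasoning_effort={"low": 0.4, "medium": 0.5},
#         )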