import os
from concurrent.futures import ThreadPoolExecutor
from types import SimpleNamespace
from typing import Dict, List, Literal, Optional

from sglang.srt.utils import is_hip, kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
    write_github_step_summary,
)

_base_url = DEFAULT_URL_FOR_TEST
_is_hip = is_hip()


class BaseTestGptOss(CustomTestCase):
    def run_test(
        self,
        model_variant: Literal["20b", "120b"],
        quantization: Literal["mxfp4", "bf16"],
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: Optional[List[str]] = None,
    ):
        if other_args is None:
            other_args = []

        model = {
            ("20b", "bf16"): "lmsys/gpt-oss-20b-bf16",
            ("120b", "bf16"): "lmsys/gpt-oss-120b-bf16",
            ("20b", "mxfp4"): "openai/gpt-oss-20b",
            ("120b", "mxfp4"): "openai/gpt-oss-120b",
        }[(model_variant, quantization)]

        if model_variant == "20b":
            other_args += ["--cuda-graph-max-bs", "600"]

        if _is_hip:
            os.environ["SGLANG_USE_AITER"] = "0"

        self._run_test_raw(
            model=model,
            expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
            other_args=other_args,
        )

    def _run_test_raw(
        self,
        model: str,
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: List[str],
    ):
        process = popen_launch_server(
            model,
            _base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

        try:
            # Run the evals in parallel, since total time is mostly bound by the longest
            # generated sequence rather than by the number of questions.
            with ThreadPoolExecutor(max_workers=4) as executor:
                list(
                    executor.map(
                        lambda d: self._run_one_eval(**d),
                        [
                            dict(
                                model=model,
                                reasoning_effort=reasoning_effort,
                                expected_score=expected_score,
                            )
                            for reasoning_effort, expected_score in expected_score_of_reasoning_effort.items()
                        ],
                    )
                )
        finally:
            kill_process_tree(process.pid)

    def _run_one_eval(self, model, reasoning_effort, expected_score):
        args = SimpleNamespace(
            base_url=_base_url,
            model=model,
            eval_name="gpqa",
            num_examples=198,
            # Use enough threads to allow full parallelism across questions.
            num_threads=198,
            # TODO: 4k is still not enough; we would need e.g. 64k tokens, but that is very slow.
            # With too small a budget, a lot of questions go unanswered.
            max_tokens=4096,
            # simple-evals uses a temperature of 0.5 by default, which scores better than 0.0,
            # but we use 0.1 here for reproducibility.
            temperature=0.1,
            reasoning_effort=reasoning_effort,
        )

        setup = f"model={model} reasoning_effort={reasoning_effort} expected_score={expected_score}"
        print(f"Evaluation start: {setup}")
        metrics = run_eval(args)
        print(f"Evaluation end: {setup} {metrics=}")

        self.assertGreaterEqual(metrics["score"], expected_score)

        if is_in_ci():
            write_github_step_summary(
                f"### test_gpt_oss_common\n"
                f"Setup: {setup}\n"
                f"Score: {metrics['score']:.2f}\n"
            )
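

# Example usage (a minimal sketch only): a concrete test class would subclass
# BaseTestGptOss and call run_test with the expected GPQA scores per reasoning
# effort. The class name and score thresholds below are hypothetical and not
# taken from the real test suite.
class TestGptOss20BMxfp4Example(BaseTestGptOss):
    def test_gpqa(self):
        self.run_test(
            model_variant="20b",
            quantization="mxfp4",
            # Hypothetical thresholds; real tests would pin values observed in CI.
            expected_score_of_reasoning_effort={"low": 0.5, "medium": 0.6},
        )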