# Comprehensive test for hybrid parallelism (DP/TP attention, DP/TP Dense FFN, TP/EP Sparse FFN, DP/VP LM head) plus speculative decoding.
# These tests are not run by default but can be launched on demand.

import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST,
    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    popen_launch_server,
)

# Server command-line fragments. Every test class below launches the server
# with `_TP8` followed by some combination of the other fragments, always in
# the order: DP attention, dense-FFN TP size, DP LM head, then the MoE /
# speculative-decoding flags.
_TP8 = ("--trust-remote-code", "--tp", "8")
_DP4 = ("--enable-dp-attention", "--dp", "4")
_DP8 = ("--enable-dp-attention", "--dp", "8")
_DENSE_TP1 = ("--moe-dense-tp-size", "1")
_DP_LM_HEAD = ("--enable-dp-lm-head",)
_DEEPEP = (
    "--enable-deepep-moe",
    "--deepep-mode",
    "normal",
    "--disable-cuda-graph",
)
_EP = ("--enable-ep-moe",)
# NOTE(review): "--speculative-algo" / "--speculative-draft" look like
# abbreviated option names; presumably the server's argument parser resolves
# them by prefix matching -- confirm they stay unambiguous.
_NEXTN = (
    "--speculative-algo",
    "NEXTN",
    "--speculative-draft",
    "lmsys/DeepSeek-V3-0324-NextN",
    "--speculative-num-steps",
    "2",
    "--speculative-eagle-topk",
    "4",
    "--speculative-num-draft-tokens",
    "4",
)


class _HybridParallelismEvalMixin:
    """Shared fixture and eval checks for one server configuration.

    Subclasses set ``model`` and ``other_args``. The mixin launches one server
    per test class with ``popen_launch_server``, runs the ``mmlu`` and
    ``mgsm_en`` evals against it via ``run_eval``, and asserts that each
    reported ``metrics["score"]`` is greater than the matching threshold.

    This class deliberately does not derive from ``unittest.TestCase`` so that
    test loaders do not collect it on its own; concrete classes list it before
    ``CustomTestCase`` in their bases.
    """

    # Model name passed to both the server launcher and the eval harness.
    model = None
    # Extra command-line arguments appended when launching the server.
    other_args = ()
    # Strict lower bounds for the eval scores (checked with assertGreater).
    mmlu_min_score = 0.5
    mgsm_en_min_score = 0.8

    @classmethod
    def setUpClass(cls):
        """Launch the server for this configuration and keep its handle."""
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            # Pass a fresh list so the class-level tuple is never shared.
            other_args=list(cls.other_args),
        )

    @classmethod
    def tearDownClass(cls):
        """Call ``kill_process_tree`` on the pid of the launched server."""
        kill_process_tree(cls.process.pid)

    def _check_eval(self, eval_name, num_examples, num_threads, min_score):
        """Run one eval against the server and assert its score.

        Args:
            eval_name: Name forwarded to ``run_eval`` (e.g. ``"mmlu"``).
            num_examples: Example count forwarded to ``run_eval``; may be
                ``None``.
            num_threads: Thread count forwarded to ``run_eval``.
            min_score: ``metrics["score"]`` must be strictly greater than
                this value.
        """
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name=eval_name,
            num_examples=num_examples,
            num_threads=num_threads,
        )
        metrics = run_eval(args)
        print(f"{metrics=}")
        self.assertGreater(metrics["score"], min_score)

    def test_mmlu(self):
        """MMLU on 64 examples with 32 threads."""
        self._check_eval("mmlu", 64, 32, self.mmlu_min_score)

    def test_mgsm_en(self):
        """MGSM (English) with ``num_examples=None`` and 1024 threads."""
        self._check_eval("mgsm_en", None, 1024, self.mgsm_en_min_score)


# --- MLA model: DP attention / dense-FFN TP size / DP LM head ---------------


class Test0(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8


class Test1(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4


class Test2(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8


class Test3(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DENSE_TP1


class Test4(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DENSE_TP1


class Test5(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DENSE_TP1


class Test6(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DP_LM_HEAD


class Test7(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DP_LM_HEAD


class Test8(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DENSE_TP1 + _DP_LM_HEAD


class Test9(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DENSE_TP1 + _DP_LM_HEAD


# --- DeepEP model: same combinations plus DeepEP MoE flags ------------------


class Test10(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DEEPEP


class Test11(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DEEPEP


class Test12(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DEEPEP


class Test13(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DENSE_TP1 + _DEEPEP


class Test14(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DENSE_TP1 + _DEEPEP


class Test15(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DENSE_TP1 + _DEEPEP


class Test16(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DP_LM_HEAD + _DEEPEP


class Test17(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DP_LM_HEAD + _DEEPEP


class Test18(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DENSE_TP1 + _DP_LM_HEAD + _DEEPEP


class Test19(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DENSE_TP1 + _DP_LM_HEAD + _DEEPEP


# --- MLA model: same combinations plus --enable-ep-moe ----------------------


class Test20(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _EP


class Test21(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _EP


class Test22(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _EP


class Test23(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DENSE_TP1 + _EP


class Test24(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DENSE_TP1 + _EP


class Test25(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DENSE_TP1 + _EP


class Test26(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DP_LM_HEAD + _EP


class Test27(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DP_LM_HEAD + _EP


class Test28(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DENSE_TP1 + _DP_LM_HEAD + _EP


class Test29(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DENSE_TP1 + _DP_LM_HEAD + _EP


# --- DeepEP model: same combinations plus NEXTN speculative decoding --------


class Test30(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _NEXTN


class Test31(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _NEXTN


class Test32(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _NEXTN


class Test33(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DENSE_TP1 + _NEXTN


class Test34(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DENSE_TP1 + _NEXTN


class Test35(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DENSE_TP1 + _NEXTN


class Test36(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP4 + _DP_LM_HEAD + _NEXTN


class Test37(_HybridParallelismEvalMixin, CustomTestCase):
    model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
    other_args = _TP8 + _DP8 + _DP_LM_HEAD + _NEXTN


# NOTE(review): Test38 is left in its expanded form because the rest of its
# body follows below and must keep lining up with this prefix.
class Test38(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--trust-remote-code",
                "--tp",
                "8",
                "--enable-dp-attention",
                "--dp",
                "4",
                "--moe-dense-tp-size",
                "1",
                "--enable-dp-lm-head",
                "--speculative-algo",
                "NEXTN",
                "--speculative-draft",
                "lmsys/DeepSeek-V3-0324-NextN",
                "--speculative-num-steps",
                "2",
                "--speculative-eagle-topk",
                "4",
                "--speculative-num-draft-tokens",
                "4",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
        )
        metrics = run_eval(args)
        print(f"{metrics=}")
self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test39(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test40(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-deepep-moe", "--deepep-mode", "normal", "--disable-cuda-graph", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def 
tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test41(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-deepep-moe", "--deepep-mode", "normal", "--disable-cuda-graph", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test42(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", 
"8", "--enable-deepep-moe", "--deepep-mode", "normal", "--disable-cuda-graph", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test43(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--moe-dense-tp-size", "1", "--enable-deepep-moe", "--deepep-mode", "normal", "--disable-cuda-graph", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test44(CustomTestCase): @classmethod def 
setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-deepep-moe", "--deepep-mode", "normal", "--disable-cuda-graph", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test45(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-deepep-moe", "--deepep-mode", "normal", "--disable-cuda-graph", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) 
print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test46(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "normal", "--disable-cuda-graph", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test47(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "normal", "--disable-cuda-graph", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", 
"--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test48(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "normal", "--disable-cuda-graph", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test49(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST 
cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "normal", "--disable-cuda-graph", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test50(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, 
num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test51(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test52(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) 
print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test53(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--moe-dense-tp-size", "1", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test54(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): 
kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test55(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test56(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-dp-lm-head", "--enable-ep-moe", 
"--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test57(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test58(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST 
cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) class Test59(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.5) def test_mgsm_en(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, 
eval_name="mgsm_en", num_examples=None, num_threads=1024, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.8) if __name__ == "__main__": unittest.main()