"""End-to-end GSM8K accuracy tests for servers using ``--moe-a2a-backend deepep``.

Every test class in this module launches a server for
``DEFAULT_MODEL_NAME_FOR_TEST_MLA`` with a different combination of
parallelism / overlap / speculative-decoding flags, runs the few-shot GSM8K
evaluation against it, and asserts a minimum accuracy.  The speculative
decoding variants additionally assert a minimum average accepted length as
reported by the server's ``/get_server_info`` endpoint.
"""

import unittest
from types import SimpleNamespace

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST_MLA,
    DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    popen_launch_server,
)

# Accuracy on GSM8K must be strictly greater than this for a test to pass.
_MIN_GSM8K_ACCURACY = 0.60

# ``avg_spec_accept_length`` must be strictly greater than this for the
# speculative-decoding tests to pass.
_MIN_AVG_SPEC_ACCEPT_LENGTH = 2.1

# Upper bound (seconds) on the ``/get_server_info`` request so that a hung
# server fails the test instead of blocking the whole suite forever.
_SERVER_INFO_TIMEOUT_S = 60


def _launch_server(cls, other_args):
    """Launch the test server and record it on the test class.

    Sets ``cls.model``, ``cls.base_url`` and ``cls.process`` exactly as each
    test class previously did inline.

    Args:
        cls: The test class being set up.
        other_args: Extra command-line arguments forwarded to
            ``popen_launch_server``.
    """
    cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
    cls.base_url = DEFAULT_URL_FOR_TEST
    cls.process = popen_launch_server(
        cls.model,
        cls.base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_args,
    )


def _stop_server(cls):
    """Kill the process tree of the server started by ``_launch_server``."""
    kill_process_tree(cls.process.pid)


def _run_gsm8k(test_case):
    """Run few-shot GSM8K against the class's server and check accuracy.

    Args:
        test_case: The running test instance; ``base_url`` is read from it
            and its ``assertGreater`` is used for the accuracy check.

    Returns:
        The metrics object returned by ``run_eval_few_shot_gsm8k``.
    """
    args = SimpleNamespace(
        num_shots=5,
        data_path=None,
        num_questions=200,
        max_new_tokens=512,
        parallel=128,
        host="http://127.0.0.1",
        # The port is the last ':'-separated component of the base URL.
        port=int(test_case.base_url.split(":")[-1]),
    )
    metrics = run_eval_few_shot_gsm8k(args)
    print(metrics)

    test_case.assertGreater(metrics["accuracy"], _MIN_GSM8K_ACCURACY)
    return metrics


def _check_spec_accept_length(test_case, metrics, label):
    """Fetch and assert the server's average speculative accept length.

    Args:
        test_case: The running test instance (provides ``base_url``).
        metrics: Metrics returned by ``_run_gsm8k``; only ``"accuracy"`` is
            read, for the summary printout.
        label: Short description of the configuration, used in the printout.

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an HTTP error status.
    """
    server_info = requests.get(
        test_case.base_url + "/get_server_info",
        timeout=_SERVER_INFO_TIMEOUT_S,
    )
    # Fail with the HTTP error itself rather than an opaque JSON/KeyError.
    server_info.raise_for_status()
    avg_spec_accept_length = server_info.json()["internal_states"][0][
        "avg_spec_accept_length"
    ]
    print(
        f"###test_gsm8k ({label}):\n"
        f"accuracy={metrics['accuracy']:.3f}\n"
        f"{avg_spec_accept_length=:.3f}\n"
    )
    test_case.assertGreater(avg_spec_accept_length, _MIN_AVG_SPEC_ACCEPT_LENGTH)


class TestPureDP(CustomTestCase):
    """GSM8K with ``--tp 4``, DP attention enabled and ``--dp 4``."""

    @classmethod
    def setUpClass(cls):
        _launch_server(
            cls,
            [
                "--trust-remote-code",
                "--tp",
                "4",
                "--enable-dp-attention",
                "--dp",
                "4",
                "--moe-a2a-backend",
                "deepep",
                "--cuda-graph-max-bs",
                "128",
                "--max-running-requests",
                "512",
                "--mem-fraction-static",
                "0.5",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        _stop_server(cls)

    def test_gsm8k(self):
        _run_gsm8k(self)


class TestHybridDPTP(CustomTestCase):
    """GSM8K with ``--tp 4``, DP attention enabled and ``--dp 2``."""

    @classmethod
    def setUpClass(cls):
        _launch_server(
            cls,
            [
                "--trust-remote-code",
                "--tp",
                "4",
                "--enable-dp-attention",
                "--dp",
                "2",
                "--moe-a2a-backend",
                "deepep",
                "--cuda-graph-max-bs",
                "128",
                "--max-running-requests",
                "256",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        _stop_server(cls)

    def test_gsm8k(self):
        _run_gsm8k(self)


class TestTP(CustomTestCase):
    """GSM8K with ``--tp 4`` and no DP attention flags."""

    @classmethod
    def setUpClass(cls):
        _launch_server(
            cls,
            [
                "--trust-remote-code",
                "--tp",
                "4",
                "--moe-a2a-backend",
                "deepep",
                "--cuda-graph-max-bs",
                "128",
                "--max-running-requests",
                "128",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        _stop_server(cls)

    def test_gsm8k(self):
        _run_gsm8k(self)


@unittest.skip("covered in test_deepep_large.py")
class TestNoGatherdBuffer(CustomTestCase):
    """GSM8K with DP attention, ``--moe-dense-tp-size 1`` and DP LM head."""

    @classmethod
    def setUpClass(cls):
        _launch_server(
            cls,
            [
                "--trust-remote-code",
                "--tp",
                "4",
                "--enable-dp-attention",
                "--dp",
                "4",
                "--moe-dense-tp-size",
                "1",
                "--enable-dp-lm-head",
                "--moe-a2a-backend",
                "deepep",
                "--cuda-graph-max-bs",
                "32",
                "--max-running-requests",
                "512",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        _stop_server(cls)

    def test_gsm8k(self):
        _run_gsm8k(self)


class TestTBO(CustomTestCase):
    """GSM8K with DP attention and ``--enable-two-batch-overlap``."""

    @classmethod
    def setUpClass(cls):
        _launch_server(
            cls,
            [
                "--trust-remote-code",
                "--tp",
                "4",
                "--enable-dp-attention",
                "--dp",
                "4",
                "--moe-dense-tp-size",
                "1",
                "--moe-a2a-backend",
                "deepep",
                "--enable-two-batch-overlap",
                "--cuda-graph-max-bs",
                "128",
                "--max-running-requests",
                "512",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        _stop_server(cls)

    def test_gsm8k(self):
        _run_gsm8k(self)


@unittest.skip("covered in TestMTPWithTBO")
class TestMTP(CustomTestCase):
    """GSM8K with EAGLE speculative decoding and DP attention (no TBO flag)."""

    @classmethod
    def setUpClass(cls):
        _launch_server(
            cls,
            [
                "--trust-remote-code",
                "--tp",
                "4",
                "--enable-dp-attention",
                "--dp",
                "2",
                "--enable-dp-lm-head",
                "--moe-a2a-backend",
                "deepep",
                "--speculative-algo",
                "EAGLE",
                "--speculative-draft-model-path",
                DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
                "--speculative-num-steps",
                "2",
                "--speculative-eagle-topk",
                "3",
                "--speculative-num-draft-tokens",
                "3",
                "--cuda-graph-max-bs",
                "32",
                "--max-running-requests",
                "64",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        _stop_server(cls)

    def test_gsm8k(self):
        metrics = _run_gsm8k(self)
        # This configuration does not pass --enable-two-batch-overlap, so the
        # label must not mention TBO.
        _check_spec_accept_length(self, metrics, "deepseek-v3 mtp + dp")


class TestMTPWithTBO(CustomTestCase):
    """GSM8K with EAGLE speculative decoding, DP attention and TBO."""

    @classmethod
    def setUpClass(cls):
        _launch_server(
            cls,
            [
                "--tp-size",
                "4",
                "--enable-dp-attention",
                "--dp-size",
                "4",
                "--enable-two-batch-overlap",
                "--moe-a2a-backend",
                "deepep",
                "--trust-remote-code",
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-num-steps",
                "2",
                "--speculative-eagle-topk",
                "3",
                "--speculative-num-draft-tokens",
                "3",
                "--speculative-draft-model-path",
                DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
                "--chunked-prefill-size",
                "256",
                "--cuda-graph-max-bs",
                "32",
                "--max-running-requests",
                "128",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        _stop_server(cls)

    def test_gsm8k(self):
        metrics = _run_gsm8k(self)
        _check_spec_accept_length(self, metrics, "deepseek-v3 mtp + dp + tbo")


if __name__ == "__main__":
    unittest.main()