import json
import multiprocessing as mp
import os
import random
import threading
import time
import unittest
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from types import SimpleNamespace
from typing import List, Optional

import numpy as np
import requests
import torch

import sglang as sgl
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner
from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    popen_launch_server,
    run_logprob_check,
)

torch_dtype = torch.float16
prefill_tolerance = 5e-2
decode_tolerance: float = 5e-2


class TestEAGLEEngine(CustomTestCase):
    BASE_CONFIG = {
        "model_path": DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
        "speculative_draft_model_path": DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
        "speculative_algorithm": "EAGLE",
        "speculative_num_steps": 5,
        "speculative_eagle_topk": 4,
        "speculative_num_draft_tokens": 8,
        "mem_fraction_static": 0.7,
        "cuda_graph_max_bs": 4,
    }
    NUM_CONFIGS = 2

    def setUp(self):
        self.prompt = "Today is a sunny day and I like"
        self.sampling_params = {"temperature": 0, "max_new_tokens": 8}

        ref_engine = sgl.Engine(
            model_path=self.BASE_CONFIG["model_path"], cuda_graph_max_bs=1
        )
        self.ref_output = ref_engine.generate(self.prompt, self.sampling_params)["text"]
        ref_engine.shutdown()

    def test_correctness(self):
        configs = [
            # Basic config
            self.BASE_CONFIG,
            # Chunked prefill
            {**self.BASE_CONFIG, "chunked_prefill_size": 4},
        ]

        for i, config in enumerate(configs[: self.NUM_CONFIGS]):
            with self.subTest(i=i):
                print(f"{config=}")
                engine = sgl.Engine(**config, log_level="info", decode_log_interval=10)
                try:
                    self._test_single_generation(engine)
                    self._test_batch_generation(engine)
                    self._test_eos_token(engine)
                    self._test_acc_length(engine)
                finally:
                    engine.shutdown()
                print("=" * 100)

    def _test_single_generation(self, engine):
        output = engine.generate(self.prompt, self.sampling_params)["text"]
        print(f"{output=}, {self.ref_output=}")
        self.assertEqual(output, self.ref_output)

    def _test_batch_generation(self, engine):
        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]
        params = {"temperature": 0, "max_new_tokens": 50}

        outputs = engine.generate(prompts, params)
        for prompt, output in zip(prompts, outputs):
            print(f"Prompt: {prompt}")
            print(f"Generated: {output['text']}")
            print("-" * 40)

        print(f"{engine.get_server_info()=}")

        avg_spec_accept_length = engine.get_server_info()["avg_spec_accept_length"]
        print(f"{avg_spec_accept_length=}")
        self.assertGreater(avg_spec_accept_length, 1.9)

    def _test_eos_token(self, engine):
        prompt = "[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>\nToday is a sunny day and I like [/INST]"
        params = {
            "temperature": 0.1,
            "max_new_tokens": 1024,
            "skip_special_tokens": False,
        }

        tokenizer = get_tokenizer(DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST)
        output = engine.generate(prompt, params)["text"]
        print(f"{output=}")

        tokens = tokenizer.encode(output, truncation=False)
        self.assertNotIn(tokenizer.eos_token_id, tokens)

    def _test_acc_length(self, engine):
        prompt = [
            "Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:",
        ] * 5  # test batched generation
        sampling_params = {"temperature": 0, "max_new_tokens": 512}
        output = engine.generate(prompt, sampling_params)
        output = output[0]

        if "spec_verify_ct" in output["meta_info"]:
            acc_length = (
                output["meta_info"]["completion_tokens"]
                / output["meta_info"]["spec_verify_ct"]
            )
        else:
            acc_length = 1.0

        speed = (
            output["meta_info"]["completion_tokens"]
            / output["meta_info"]["e2e_latency"]
        )
        print(f"{acc_length=}")
        if engine.server_args.model_path == DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST:
            self.assertGreater(acc_length, 3.6)
        else:
            self.assertGreater(acc_length, 2.6)
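

# The subclasses below reuse TestEAGLEEngine's full test flow and only override
# BASE_CONFIG / NUM_CONFIGS: TestEAGLEEngineTokenMap adds a speculative token map
# (an FR-Spec frequency table), and TestEAGLE3Engine switches to an EAGLE3 draft model.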


class TestEAGLEEngineTokenMap(TestEAGLEEngine):
    BASE_CONFIG = {
        "model_path": "meta-llama/Meta-Llama-3-8B-Instruct",
        "speculative_draft_model_path": "lmsys/sglang-EAGLE-LLaMA3-Instruct-8B",
        "speculative_algorithm": "EAGLE",
        "speculative_num_steps": 5,
        "speculative_eagle_topk": 4,
        "speculative_num_draft_tokens": 8,
        "speculative_token_map": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt",
        "mem_fraction_static": 0.7,
        "cuda_graph_max_bs": 4,
        "dtype": "float16",
    }
    NUM_CONFIGS = 1


class TestEAGLE3Engine(TestEAGLEEngine):
    BASE_CONFIG = {
        "model_path": "meta-llama/Llama-3.1-8B-Instruct",
        "speculative_draft_model_path": "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B",
        "speculative_algorithm": "EAGLE3",
        "speculative_num_steps": 5,
        "speculative_eagle_topk": 16,
        "speculative_num_draft_tokens": 64,
        "mem_fraction_static": 0.7,
        "cuda_graph_max_bs": 4,
        "dtype": "float16",
    }
    NUM_CONFIGS = 1


class TestEAGLEServer(CustomTestCase):
    PROMPTS = [
        "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like[/INST]",
        '[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nWhat are the mental triggers in Jeff Walker\'s Product Launch Formula and "Launch" book?[/INST]',
        "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nSummarize Russell Brunson's Perfect Webinar Script...[/INST]",
        "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nwho are you?[/INST]",
        "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nwhere are you from?[/INST]",
    ]

    @classmethod
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-draft-model-path",
                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
                "--speculative-num-steps",
                5,
                "--speculative-eagle-topk",
                8,
                "--speculative-num-draft-tokens",
                64,
                "--mem-fraction-static",
                0.7,
                "--chunked-prefill-size",
                128,
                "--max-running-requests",
                8,
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def send_request(self):
        time.sleep(random.uniform(0, 2))
        for prompt in self.PROMPTS:
            url = self.base_url + "/generate"
            data = {
                "text": prompt,
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 1024,
                },
            }
            response = requests.post(url, json=data)
            assert response.status_code == 200

    def send_requests_abort(self):
        for prompt in self.PROMPTS:
            try:
                time.sleep(random.uniform(0, 2))
                url = self.base_url + "/generate"
                data = {
                    "model": "base",
                    "text": prompt,
                    "sampling_params": {
                        "temperature": 0,
                        "max_new_tokens": 1024,
                    },
                }
                # Set timeout = 1s to mock a disconnected client.
                requests.post(url, json=data, timeout=1)
            except Exception as e:
                print(e)

    def test_request_abort(self):
        concurrency = 4
        threads = [
            threading.Thread(target=self.send_request) for _ in range(concurrency)
        ] + [
            threading.Thread(target=self.send_requests_abort)
            for _ in range(concurrency)
        ]
        for worker in threads:
            worker.start()
        for worker in threads:
            worker.join()
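
    # The two GSM8K-based checks below reuse sglang.test.few_shot_gsm8k.run_eval:
    # test_max_token_one caps generation at a single new token as a smoke test that
    # the speculative path does not hang, while test_gsm8k checks accuracy and the
    # average speculative acceptance length reported by /get_server_info.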

    def test_max_token_one(self):
        requests.get(self.base_url + "/flush_cache")

        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=1,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )

        # Just run and check it does not hang
        metrics = run_eval(args)
        self.assertGreater(metrics["output_throughput"], 50)

    def test_gsm8k(self):
        requests.get(self.base_url + "/flush_cache")

        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.20)

        server_info = requests.get(self.base_url + "/get_server_info")
        avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
        print(f"{avg_spec_accept_length=}")
        self.assertGreater(avg_spec_accept_length, 3.5)

        # Wait a little bit so that the memory check happens.
        time.sleep(4)

    def test_logprob_start_len(self):
        logprob_start_len = 4
        new_tokens = 4
        prompts = [
            "I have a very good idea on",
            "Today is a sunny day and",
        ]

        response = requests.post(
            self.base_url + "/generate",
            json={
                "text": prompts,
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": new_tokens,
                },
                "return_logprob": True,
                "top_logprobs_num": 5,
                "logprob_start_len": logprob_start_len,
            },
        )
        response_json = response.json()
        print(json.dumps(response_json, indent=2))

        for res in response_json:
            self.assertEqual(
                res["meta_info"]["prompt_tokens"],
                logprob_start_len + len(res["meta_info"]["input_token_logprobs"]),
            )

            self.assertEqual(res["meta_info"]["completion_tokens"], new_tokens)
            self.assertEqual(len(res["meta_info"]["output_token_logprobs"]), new_tokens)

    def test_logprob_match(self):
        """Test that the output logprobs are close to the input logprobs if we run a prefill again."""

        def run_generate(
            prompt, return_logprob=False, max_new_tokens=512, logprob_start_len=-1
        ):
            if isinstance(prompt, str):
                prompt_kwargs = {"text": prompt}
            else:
                prompt_kwargs = {"input_ids": prompt}

            response = requests.post(
                self.base_url + "/generate",
                json={
                    **prompt_kwargs,
                    "sampling_params": {
                        "temperature": 1.0,
                        "max_new_tokens": max_new_tokens,
                        "ignore_eos": True,
                    },
                    "return_logprob": return_logprob,
                    "return_text_in_logprobs": True,
                    "logprob_start_len": logprob_start_len,
                },
            )
            return response.json()

        prompt = "I have a very good idea on how to"

        gen = run_generate(prompt, return_logprob=True, logprob_start_len=0)
        output_logprobs = np.array(
            [x[0] for x in gen["meta_info"]["output_token_logprobs"]]
        )
        num_prompts_tokens = gen["meta_info"]["prompt_tokens"]

        input_tokens = [x[1] for x in gen["meta_info"]["input_token_logprobs"]]
        output_tokens = [x[1] for x in gen["meta_info"]["output_token_logprobs"]]

        new_prompt = input_tokens + output_tokens
        score = run_generate(
            new_prompt, return_logprob=True, logprob_start_len=0, max_new_tokens=0
        )
        output_logprobs_score = np.array(
            [
                x[0]
                for x in score["meta_info"]["input_token_logprobs"][num_prompts_tokens:]
            ]
        )

        print(f"{output_logprobs[-10:]=}")
        print(f"{output_logprobs_score[-10:]=}")

        diff = np.abs(output_logprobs - output_logprobs_score)
        max_diff = np.max(diff)
        self.assertLess(max_diff, 0.25)

    def test_logprob_mixed(self):
        args = []
        temperature = 0
        # input_len, output_len, temperature, logprob_start_len, return_logprob, top_logprobs_num
        # The Llama 2 context length seems to be only 2k, so we can only test small lengths.
        for input_len in [200, 500, 1000, 2000]:
            for output_len in [4, 8]:
                for logprob_start_len in [0, 100, 300, 800, 1998]:
                    for return_logprob in [True, False]:
                        for top_logprobs_num in [0, 5]:
                            if logprob_start_len >= input_len:
                                continue

                            args.append(
                                (
                                    input_len,
                                    output_len,
                                    temperature,
                                    logprob_start_len,
                                    return_logprob,
                                    top_logprobs_num,
                                )
                            )

        random.shuffle(args)

        func = partial(run_logprob_check, self)
        with ThreadPoolExecutor(8) as executor:
            list(executor.map(func, args))
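
    # run_decode is the per-request worker for test_penalty_mixed below: it sends one
    # /generate call with logprobs enabled and merges the given penalty overrides into
    # the sampling parameters.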

    def run_decode(self, sampling_params):
        return_logprob = True
        top_logprobs_num = 5
        return_text = True
        n = 1

        response = requests.post(
            self.base_url + "/generate",
            json={
                "text": "Human: Write a travel blog post to Hawaii.\n\nAssistant:",
                "sampling_params": {
                    "max_new_tokens": 48,
                    "n": n,
                    "temperature": 0.7,
                    **sampling_params,
                },
                "return_logprob": return_logprob,
                "top_logprobs_num": top_logprobs_num,
                "return_text_in_logprobs": return_text,
                "logprob_start_len": 0,
            },
        )
        self.assertEqual(response.status_code, 200)
        print(json.dumps(response.json()))
        print("=" * 100)

    def test_penalty_mixed(self):
        args = [
            {},
            {},
            {},
            {"frequency_penalty": 2},
            {"presence_penalty": 1},
            {"min_new_tokens": 16},
            {"frequency_penalty": 0.2},
            {"presence_penalty": 0.4},
            {"min_new_tokens": 8},
            {"frequency_penalty": 0.4, "presence_penalty": 0.8},
            {"frequency_penalty": 0.4, "min_new_tokens": 12},
            {"presence_penalty": 0.8, "min_new_tokens": 12},
            {"presence_penalty": -0.3, "frequency_penalty": 1.3, "min_new_tokens": 32},
            {"presence_penalty": 0.3, "frequency_penalty": -1.3, "min_new_tokens": 32},
        ]
        # Repeat and shuffle the penalty configurations in place so that they are
        # mixed across the worker threads.
        args = args * 5
        random.shuffle(args)

        with ThreadPoolExecutor(8) as executor:
            list(executor.map(self.run_decode, args))


class TestEAGLERetract(TestEAGLEServer):
    @classmethod
    def setUpClass(cls):
        # Use a small KV cache so that requests get retracted; this helps surface leaks.
        os.environ["SGLANG_CI_SMALL_KV_SIZE"] = "4500"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-draft-model-path",
                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
                "--speculative-num-steps",
                5,
                "--speculative-eagle-topk",
                8,
                "--speculative-num-draft-tokens",
                64,
                "--mem-fraction-static",
                0.7,
                "--chunked-prefill-size",
                128,
                "--max-running-requests",
                64,
            ],
        )


class TestEAGLEServerTriton(TestEAGLEServer):
    @classmethod
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-draft-model-path",
                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
                "--speculative-num-steps",
                5,
                "--speculative-eagle-topk",
                8,
                "--speculative-num-draft-tokens",
                64,
                "--mem-fraction-static",
                0.7,
                "--attention-backend",
                "triton",
                "--max-running-requests",
                8,
            ],
        )


if __name__ == "__main__":
    unittest.main()