import json import requests port = 8000 json_schema = json.dumps( { "type": "object", "properties": { "name": {"type": "string", "pattern": "^[\\w]+$"}, "population": {"type": "integer"}, }, "required": ["name", "population"], } ) # JSON response = requests.post( f"http://localhost:{port}/generate", json={ "text": "Here is the information of the capital of France in the JSON format.\n", "sampling_params": { "temperature": 0, "max_new_tokens": 64, "json_schema": json_schema, }, }, ) print(response.json()) # python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --trust-remote-code --disaggregation-mode prefill --tp 2 --disaggregation-ib-device mlx5_roce0,mlx5_roce1 --speculative-algorithm EAGLE --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 3 --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8 --host 127.0.0.1 --port 8100