# Reasoning Parser

SGLang supports parsing reasoning content out from "normal" content for reasoning models such as [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1).

## Supported Models

Currently, SGLang supports the following reasoning models:
- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags.
- [QwQ](https://huggingface.co/Qwen/QwQ-32B): The reasoning content is wrapped with `<think>` and `</think>` tags.

## Usage

### Launching the Server

Specify the `--reasoning-parser` option.

In [None]:
import requests
from openai import OpenAI
from sglang.test.test_utils import is_in_ci

if is_in_ci():
 from patch import launch_server_cmd
else:
 from sglang.utils import launch_server_cmd

from sglang.utils import wait_for_server, print_highlight, terminate_process


# Start an SGLang server for the distilled R1 model with the DeepSeek-R1
# reasoning parser enabled. The helper returns the server process handle
# and the port; both are reused by the cells that follow.
launch_command = "python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1"
server_process, port = launch_server_cmd(launch_command)

# Wait on the server's base URL before any request is sent.
server_url = f"http://localhost:{port}"
wait_for_server(server_url)

Note that `--reasoning-parser` defines the parser used to interpret responses. Currently supported parsers include:

- deepseek-r1: DeepSeek R1 series and QwQ (e.g. deepseek-ai/DeepSeek-R1, Qwen/QwQ-32B).

### OpenAI Compatible API

Using the OpenAI compatible API, the contract follows the [DeepSeek API design](https://api-docs.deepseek.com/guides/reasoning_model) established with the release of DeepSeek-R1:

- `reasoning_content`: The content of the CoT.
- `content`: The content of the final answer.

In [None]:
# Initialize an OpenAI-compatible client pointed at the local SGLang server.
# 0.0.0.0 is a bind address rather than a portable destination, so connect
# through localhost, matching the other requests in this guide.
client = OpenAI(api_key="None", base_url=f"http://localhost:{port}/v1")
# Use the first model id reported by the server.
model_name = client.models.list().data[0].id

# Single-turn conversation shared by the request examples.
messages = [
    {
        "role": "user",
        "content": "What is 1+3?",
    }
]

#### Non-Streaming Request

In [None]:
# Request a complete (non-streamed) reply and ask the server to split the
# reasoning from the final answer.
request_options = {
    "model": model_name,
    "messages": messages,
    "temperature": 0.6,
    "top_p": 0.95,
    "stream": False,  # Non-streaming
    "extra_body": {"separate_reasoning": True},
}
response_non_stream = client.chat.completions.create(**request_options)

# Both parts live on the first choice's message.
reply = response_non_stream.choices[0].message

print_highlight("==== Reasoning ====")
print_highlight(reply.reasoning_content)

print_highlight("==== Text ====")
print_highlight(reply.content)

#### Streaming Request

In [None]:
# Stream the reply; reasoning and answer text arrive in separate delta fields.
response_stream = client.chat.completions.create(
    model=model_name,
    messages=messages,
    temperature=0.6,
    top_p=0.95,
    stream=True,  # Streaming
    extra_body={"separate_reasoning": True},
)

reasoning_content = ""
content = ""
for chunk in response_stream:
    # Look the delta up once per chunk; either field may be empty or missing.
    delta = chunk.choices[0].delta
    if delta.content:
        content += delta.content
    if delta.reasoning_content:
        reasoning_content += delta.reasoning_content

print_highlight("==== Reasoning ====")
print_highlight(reasoning_content)

print_highlight("==== Text ====")
print_highlight(content)

Optionally, you can buffer the reasoning content to the last reasoning chunk (or the first chunk after the reasoning content).

In [None]:
# Stream the answer, but have the server buffer the reasoning
# ("stream_reasoning": False) instead of emitting it incrementally.
response_stream = client.chat.completions.create(
    model=model_name,
    messages=messages,
    temperature=0.6,
    top_p=0.95,
    stream=True,  # Streaming
    extra_body={"separate_reasoning": True, "stream_reasoning": False},
)

reasoning_content = ""
content = ""
for chunk in response_stream:
    # Look the delta up once per chunk; either field may be empty or missing.
    delta = chunk.choices[0].delta
    if delta.content:
        content += delta.content
    if delta.reasoning_content:
        # The buffered reasoning is delivered in a single chunk, so it is
        # assigned rather than appended.
        reasoning_content = delta.reasoning_content

print_highlight("==== Reasoning ====")
print_highlight(reasoning_content)

print_highlight("==== Text ====")
print_highlight(content)

Reasoning separation is enabled by default when `--reasoning-parser` is specified.
**To disable it, set the `separate_reasoning` option to `False` in the request.**

In [None]:
# Turn reasoning separation off for this request only.
raw_output_options = {"separate_reasoning": False}

response_non_stream = client.chat.completions.create(
    model=model_name,
    messages=messages,
    temperature=0.6,
    top_p=0.95,
    stream=False,  # Non-streaming
    extra_body=raw_output_options,
)

# Only `content` is printed here.
raw_message = response_non_stream.choices[0].message
print_highlight("==== Original Output ====")
print_highlight(raw_message.content)

### SGLang Native API 

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
# Render the chat messages into a plain prompt string (tokenize=False) that
# ends with the generation prompt. Named `prompt_text` so the builtin
# `input` is not shadowed.
prompt_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Step 1: generate raw text through the native /generate endpoint.
gen_url = f"http://localhost:{port}/generate"
gen_data = {
    "text": prompt_text,
    "sampling_params": {
        # NOTE(review): presumably kept off so the reasoning tags stay in the
        # raw output for the parser -- confirm.
        "skip_special_tokens": False,
        "max_new_tokens": 1024,
        "temperature": 0.6,
        "top_p": 0.95,
    },
}
gen_http_response = requests.post(gen_url, json=gen_data)
# Surface HTTP errors directly instead of failing later on a missing key.
gen_http_response.raise_for_status()
gen_response = gen_http_response.json()["text"]

print_highlight("==== Original Output ====")
print_highlight(gen_response)

# Step 2: have the server split the raw text into reasoning and answer.
parse_url = f"http://localhost:{port}/separate_reasoning"
separate_reasoning_data = {
    "text": gen_response,
    "reasoning_parser": "deepseek-r1",
}
separate_reasoning_http_response = requests.post(
    parse_url, json=separate_reasoning_data
)
separate_reasoning_http_response.raise_for_status()
separate_reasoning_response_json = separate_reasoning_http_response.json()
print_highlight("==== Reasoning ====")
print_highlight(separate_reasoning_response_json["reasoning_text"])
print_highlight("==== Text ====")
print_highlight(separate_reasoning_response_json["text"])

In [None]:
# Terminate the server process launched at the start of this guide.
terminate_process(server_process)

### Offline Engine API

In [None]:
import sglang as sgl
from sglang.srt.reasoning_parser import ReasoningParser
from sglang.utils import print_highlight

# Create an in-process engine for the same model; no HTTP server is used here.
llm = sgl.Engine(model_path="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
# Render the chat messages into a plain prompt string. Named `prompt_text`
# so the builtin `input` is not shadowed.
prompt_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
sampling_params = {
    "max_new_tokens": 1024,
    # NOTE(review): presumably kept off so the reasoning tags stay in the
    # raw output for the parser -- confirm.
    "skip_special_tokens": False,
    "temperature": 0.6,
    "top_p": 0.95,
}
result = llm.generate(prompt=prompt_text, sampling_params=sampling_params)

generated_text = result["text"]  # Assume there is only one prompt

print_highlight("==== Original Output ====")
print_highlight(generated_text)

# Split the raw output locally with the same parser the server uses.
parser = ReasoningParser("deepseek-r1")
reasoning_text, text = parser.parse_non_stream(generated_text)
print_highlight("==== Reasoning ====")
print_highlight(reasoning_text)
print_highlight("==== Text ====")
print_highlight(text)

In [None]:
# Shut down the offline engine created above.
llm.shutdown()

## Supporting New Reasoning Model Schemas

For future reasoning models, you can implement the reasoning parser as a subclass of `BaseReasoningFormatDetector` in `python/sglang/srt/reasoning_parser.py` and specify the reasoning parser for new reasoning model schemas accordingly.

```python
class DeepSeekR1Detector(BaseReasoningFormatDetector):
    """
    Detector for DeepSeek-R1 model.
    Assumes reasoning format:
      (<think>)*(.*)</think>
    Returns all the text before the </think> tag as `reasoning_text`
    and the rest of the text as `normal_text`.

    Args:
        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
            If True, streams reasoning content as it arrives.
    """

    def __init__(self, stream_reasoning: bool = False):
        # DeepSeek-R1 is assumed to be reasoning until `</think>` token, so the
        # start/end tags are passed explicitly.
        # NOTE(review): the third positional argument (True) presumably tells
        # the base detector to treat output as reasoning from the first token
        # -- confirm against BaseReasoningFormatDetector.
        super().__init__("<think>", "</think>", True, stream_reasoning=stream_reasoning)
        # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599


from typing import Dict, Optional, Tuple, Type


class ReasoningParser:
    """
    Parser that handles both streaming and non-streaming scenarios for extracting
    reasoning content from model outputs.

    Args:
        model_type (str): Type of model to parse reasoning from
        stream_reasoning (bool): If False, accumulates reasoning content until complete.
            If True, streams reasoning content as it arrives.

    Raises:
        ValueError: If `model_type` is empty or has no registered detector.
    """

    # Maps a lower-case parser name to the detector *class* (not an instance);
    # the class is instantiated per parser in __init__.
    DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
        "deepseek-r1": DeepSeekR1Detector
    }

    def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
        if not model_type:
            raise ValueError("Model type must be specified")

        # Lookup is case-insensitive: keys in DetectorMap are lower-case.
        detector_class = self.DetectorMap.get(model_type.lower())
        if detector_class is None:
            raise ValueError(f"Unsupported model type: {model_type}")

        self.detector = detector_class(stream_reasoning=stream_reasoning)

    def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
        """Non-streaming call: one-time parsing.

        Returns:
            A `(reasoning_text, normal_text)` tuple taken from the detector's
            parse result.
        """
        ret = self.detector.detect_and_parse(full_text)
        return ret.reasoning_text, ret.normal_text

    def parse_stream_chunk(self, chunk_text: str) -> Tuple[str, str]:
        """Streaming call: incremental parsing.

        Returns:
            A `(reasoning_text, normal_text)` tuple for this chunk, taken from
            the detector's incremental parse result.
        """
        ret = self.detector.parse_streaming_increment(chunk_text)
        return ret.reasoning_text, ret.normal_text
```