Skip to content

feat: DeepSeek-V3.2 Streaming tool call output#15278

Merged
Fridge003 merged 7 commits into sgl-project:main from
JustinTong0323:stream-dpskv32-tool-call
Dec 18, 2025
Merged

feat: DeepSeek-V3.2 Streaming tool call output#15278
Fridge003 merged 7 commits into sgl-project:main from
JustinTong0323:stream-dpskv32-tool-call

Conversation

@JustinTong0323
Copy link
Copy Markdown
Collaborator

Motivation

Fixes #14711

Modifications

Accuracy Tests

(sglang) ➜  dpskv32 curl --location 'http://localhost:8000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
  "model": "DeepSeek-V3.2",
  "messages": [
    {
      "role": "user",
      "content": "Calculate the derivative of the function 3x^2 + 2x - 1."
    }
  ],
  "tools": [
    {
      "type": "function",
      "function": {
        "name": "calculate_derivative",
        "description": "Calculate the derivative of a polynomial function. Note that the provided function is in Python 3 syntax.",
        "parameters": {
          "type": "object",
          "properties": {
            "function": {
              "type": "string",
              "description": "The polynomial function."
            },
            "x_value": {
              "type": "number",
              "description": "The x-value at which the derivative is calculated. Optional, default to 0.00. <....
data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":"assistant","content":"","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":"I","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":"'ll calculate the derivative","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":" of the function ","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":"3x^2","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":" + 2x","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":" - 1 for","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":" you.\n\n","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":null,"reasoning_content":null,"tool_calls":[{"id":"call_57db6d12c73344bc912e2677","index":0,"type":"function","function":{"name":"calculate_derivative","arguments":""}}]},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":null,"reasoning_content":null,"tool_calls":[{"id":null,"index":0,"type":"function","function":{"name":null,"arguments":"{"}}]},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":null,"reasoning_content":null,"tool_calls":[{"id":null,"index":0,"type":"function","function":{"name":null,"arguments":"\"function\": \"3*x"}}]},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":null,"reasoning_content":null,"tool_calls":[{"id":null,"index":0,"type":"function","function":{"name":null,"arguments":"**2 +"}}]},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":null,"reasoning_content":null,"tool_calls":[{"id":null,"index":0,"type":"function","function":{"name":null,"arguments":" 2*x -"}}]},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":null,"reasoning_content":null,"tool_calls":[{"id":null,"index":0,"type":"function","function":{"name":null,"arguments":" 1"}}]},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":null,"reasoning_content":null,"tool_calls":[{"id":null,"index":0,"type":"function","function":{"name":null,"arguments":"\"}"}}]},"logprobs":null,"finish_reason":null,"matched_stop":null}],"usage":null}

data: {"id":"80c29dd86a854296861e793b04d093cc","object":"chat.completion.chunk","created":1765914871,"model":"DeepSeek-V3.2","choices":[{"index":0,"delta":{"role":null,"content":null,"reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":"tool_calls","matched_stop":1}],"usage":null}

data: [DONE]

Benchmarking and Profiling

Checklist

Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
@github-actions github-actions Bot added documentation Improvements or additions to documentation deepseek diffusion SGLang Diffusion labels Dec 16, 2025
@gemini-code-assist
Copy link
Copy Markdown
Contributor

Summary of Changes

Hello @JustinTong0323, I'm Gemini Code Assist¹! I'm currently reviewing this pull request and will post my feedback shortly. In the meantime, here's a summary to help you and other reviewers quickly get up to speed!

This pull request introduces streaming capabilities for DeepSeek-V3.2 tool calls, enabling the model to output tool names and arguments incrementally as they are generated. This enhancement significantly improves the user experience for applications requiring real-time interaction with tool-using models by providing immediate feedback on the tool call progress, rather than waiting for a complete tool call to be formed.

Highlights

  • DeepSeek-V3.2 Tool Call Streaming: Implemented incremental streaming for DeepSeek-V3.2 tool calls, allowing partial tool arguments to be emitted as they are generated, improving real-time feedback.
  • Partial XML Parsing Support: Enhanced the XML parameter parsing logic to support partial parameter tags, which is crucial for handling incomplete argument structures during streaming.
  • Incremental Argument Calculation: Introduced a mechanism within the streaming parser to calculate and send only the new, stable portions of tool call arguments, optimizing data transfer for streaming.
  • Docstring Update: Updated the parse_streaming_increment docstring to explicitly reflect the new argument streaming capability.
Using Gemini Code Assist

The full guide for Gemini Code Assist can be found on our documentation page; here are some quick tips.

Invoking Gemini

You can request assistance from Gemini at any point by creating a comment using either /gemini <command> or @gemini-code-assist <command>. Below is a summary of the supported commands on the current page.

Feature Command Description
Code Review /gemini review Performs a code review for the current pull request in its current state.
Pull Request Summary /gemini summary Provides a summary of the current pull request in its current state.
Comment @gemini-code-assist Responds in comments when explicitly tagged, both in pull request comments and review comments.
Help /gemini help Displays a list of available commands.

Customization

To customize the Gemini Code Assist for GitHub experience, repository maintainers can create a configuration file and/or provide a custom code review style guide (such as PEP-8 for Python) by creating and adding files to a .gemini/ folder in the base of the repository. Detailed instructions can be found here.

Limitations & Feedback

Gemini Code Assist may make mistakes. Please leave feedback on any instances where its feedback is incorrect or counterproductive. You can react with 👍 and 👎 on @gemini-code-assist comments. If you're interested in giving your feedback about your experience with Gemini Code Assist for GitHub and other Google products, sign up here.

You can also get AI-powered code generation, chat, as well as code reviews directly in the IDE at no cost with the Gemini Code Assist IDE Extension.

Footnotes

  1. Review the Privacy Notices, Generative AI Prohibited Use Policy, Terms of Service, and learn how to configure Gemini Code Assist in GitHub here. Gemini can make mistakes, so double check it and use code with caution.

Copy link
Copy Markdown
Contributor

@gemini-code-assist gemini-code-assist Bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code Review

This pull request introduces streaming support for tool call arguments in the DeepSeek-V3.2 model, which is a great enhancement for user experience. The core logic in deepseekv32_detector.py has been significantly refactored to handle partial parsing and incremental streaming of arguments. The implementation appears solid and correctly handles the complexities of streaming structured data. I have one minor suggestion to simplify a piece of the new logic.

Comment on lines +147 to +158
# For partial values, we just take what we have so far
# We don't try JSON parsing for partial values unless they look complete,
# but simplistic approach is to just treat as string/partial value
if param_type == "true":
# For strings, the value is just the content so far
# We might need to be careful if the value itself contains partial closing tag
# But greedy match .* at end should capture everything
parameters[param_name] = param_value
else:
# For non-strings (JSON), partial parsing is tricky without a dedicated parser
# But we can try to return the raw string or try partial json
parameters[param_name] = param_value
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The if param_type == "true": and else: blocks contain identical code: parameters[param_name] = param_value. This conditional is redundant. You can simplify this section by removing the if/else and using a single assignment, which makes the code cleaner and easier to maintain.

                # For partial values, we just take what we have so far.
                # For both string and JSON-like types, we'll take the raw partial value
                # since proper partial JSON parsing is complex.
                parameters[param_name] = param_value

momaek and others added 2 commits December 16, 2025 20:26
When the buffer contains accumulated content from previous chunks
(e.g., when a chunk ends with "<"), the code was returning `new_text`
instead of `current_text` when determining the content is not DSML.
This caused previously buffered content to be discarded.

For example, when streaming text containing `<user_maybe_say>`:
- Chunk 1: "...<" (buffered, returns empty)
- Chunk 2: "user_maybe_say>..." (returns only new_text, discards "<")

This resulted in `<user_maybe_say>` being output as `user_maybe_say>`
with the leading "<" lost.

The fix changes to return `current_text` (buffer content) instead of
`new_text` (current chunk only), ensuring no content is lost when
the buffer is cleared.
@JustinTong0323
Copy link
Copy Markdown
Collaborator Author

/tag-and-rerun-ci

Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
JustinTong0323 and others added 2 commits December 16, 2025 21:11
Co-authored-by: Muqi Li <muqi1029@gmail.com>
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
@Muqi1029
Copy link
Copy Markdown
Contributor

Muqi1029 commented Dec 17, 2025

The reason I didn't implement full streaming is that I would have to handle the annoying trailing part of the parameter end token. I have tested this PR, and the problem hasn't been solved.

@JustinTong0323 You can reproduce the bad case by setting interval to 1 in the test case; you will see:
image

Reproducing Script:
Step 1: set interval as 1
image

Step 2:

cd test/registered/function_call
python -m unittest test_function_call_parser.TestDeepSeekV32Detector.test_streaming_xml_format

BTW, I have made the test stricter, maybe you can cherry-pick that commit in the PR.

@fernandaspets
Copy link
Copy Markdown

How did you get toolcalling to work? I can't get it to work with dsv32 awq. hmm..

@JustinTong0323
Copy link
Copy Markdown
Collaborator Author

How did you get toolcalling to work? I can't get it to work with dsv32 awq. hmm..

Could you try again in latest main?

@Fridge003 Fridge003 merged commit 4168353 into sgl-project:main Dec 18, 2025
173 of 184 checks passed
@Fridge003
Copy link
Copy Markdown
Collaborator

@Muqi1029 Can you try the latest main? #15155 seems to fix the issue with the |DSML| token

@Muqi1029
Copy link
Copy Markdown
Contributor

Muqi1029 commented Dec 18, 2025

@Muqi1029 Can you try the latest main? #15155 seems to fix the issue with the |DSML| token

Hi, @Fridge003, thanks for the reminder!

But the question is not the same. And there are still some bugs in this PR. I think you SHOULD NOT merge this into main instantly, which is very dangerous.

I have pointed to the bugs in the review. And you can also use stricter test cases to run the CI.

@JustinTong0323
Copy link
Copy Markdown
Collaborator Author

@Muqi1029 Can you try the latest main? #15155 seems to fix the issue with the |DSML| token

Hi, @Fridge003, thanks for the reminder!

But the question is not the same. And there are still some bugs in this PR. I think you SHOULD NOT merge this into main instantly, which is very dangerous.

I have pointed to the bugs in the review. And you can also use stricter test cases to run the CI.

The failure case hasn't occurred in my test cases, so we merged this to support the feature and will aim to resolve the issue you referenced before the release. Thanks for checking~

Liwansi added a commit to iforgetmyname/sglang that referenced this pull request Dec 19, 2025
…n3_pp

* 'main' of https://github.com/sgl-project/sglang: (74 commits)
  [bug fix][pp] fix inconsistent latency between tp (sgl-project#15379)
  Fix warp illegal instruction in kimi k2 thinking PCG (sgl-project#15306)
  Fix gpt-oss yarn with `truncate` argument (sgl-project#14270)
  Monkey patch deepseek-ocr's `v_head_dim` (sgl-project#15384)
  [model-gateway] Replace PolicyRegistry RwLock with DashMap for lock-free policy lookups (sgl-project#15361)
  [PP] Fix dynamic chunking strategy for PP (sgl-project#15372)
  Fix issue: ENABLE_BELOW_SM90 cannot be enabled on aarch64 CPU (sgl-project#12967)
  Split test_piecewise_cuda_graph.py to optimize CI resource usage (sgl-project#15290)
  unified management of environment variables for vlm cuda ipc transport  (sgl-project#14501)
  Mistral Large 3 NVFP4 TRTLLM MoE support (sgl-project#15049)
  fix: adjust time for test_epd_disaggregation.py (sgl-project#15354)
  Add doc for qwen3 next (sgl-project#15337)
  feat: DeepSeek-V3.2 Streaming tool call output (sgl-project#15278)
  Feature/trtllm mha workspace size configurable sgl-project#15089 (sgl-project#15131)
  [VLM] Support cos sin cache for Qwen3-VL & GLM-4.1V (sgl-project#15205)
  [Deepseek V3.2] Support Overlap Spec + NSA (sgl-project#15307)
  Add request-level timestamp for when prefill finishes (sgl-project#14860)
  [CI] Migrate LoRA tests to test/registered/lora/ (sgl-project#15176)
  Reserve more memory for DeepSeekOCR model and adjust server start timeout for DeepGEMM to reduce flakiness (sgl-project#15277)
  Fix condition check for require_gathered_buffer (sgl-project#15328)
  ...
Prozac614 pushed a commit to Prozac614/sglang that referenced this pull request Dec 23, 2025
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: momaek <momaek17@gmail.com>
Co-authored-by: Muqi Li <muqi1029@gmail.com>
jiaming1130 pushed a commit to zhuyijie88/sglang that referenced this pull request Dec 25, 2025
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: momaek <momaek17@gmail.com>
Co-authored-by: Muqi Li <muqi1029@gmail.com>
YChange01 pushed a commit to YChange01/sglang that referenced this pull request Jan 13, 2026
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: momaek <momaek17@gmail.com>
Co-authored-by: Muqi Li <muqi1029@gmail.com>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

deepseek diffusion SGLang Diffusion documentation Improvements or additions to documentation run-ci

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[Bug] deepseek v3.2 function call cannot output in a streaming manner

5 participants