feat: DeepSeek-V3.2 Streaming tool call output#15278
Fridge003 merged 7 commits into sgl-project:main from
Conversation
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Summary of Changes

Hello @JustinTong0323, I'm Gemini Code Assist! I'm currently reviewing this pull request and will post my feedback shortly. In the meantime, here's a summary to help you and other reviewers quickly get up to speed. This pull request introduces streaming capabilities for DeepSeek-V3.2 tool calls, enabling the model to output tool names and arguments incrementally as they are generated. This enhancement significantly improves the user experience for applications requiring real-time interaction with tool-using models by providing immediate feedback on tool call progress, rather than waiting for a complete tool call to be formed.

Highlights
Code Review
This pull request introduces streaming support for tool call arguments in the DeepSeek-V3.2 model, which is a great enhancement for user experience. The core logic in deepseekv32_detector.py has been significantly refactored to handle partial parsing and incremental streaming of arguments. The implementation appears solid and correctly handles the complexities of streaming structured data. I have one minor suggestion to simplify a piece of the new logic.
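To illustrate the kind of incremental streaming such a refactor enables, here is a minimal, self-contained sketch (not the actual `deepseekv32_detector.py` code; the class and method names are hypothetical) of emitting only the not-yet-sent portion of a partially parsed argument string on each chunk:

```python
# Illustrative sketch: a streaming emitter that remembers what it has already
# sent downstream and emits only the delta on each new partial parse.

class StreamingArgEmitter:
    def __init__(self):
        self.sent = ""  # arguments text already emitted downstream

    def emit_delta(self, parsed_args_so_far: str) -> str:
        """Return the not-yet-emitted suffix of the partially parsed arguments."""
        if not parsed_args_so_far.startswith(self.sent):
            # The partial parse was revised (e.g. a quote closed differently);
            # a real detector would need a resync strategy here.
            self.sent = ""
        delta = parsed_args_so_far[len(self.sent):]
        self.sent = parsed_args_so_far
        return delta

emitter = StreamingArgEmitter()
print(emitter.emit_delta('{"city": "Par'))    # {"city": "Par
print(emitter.emit_delta('{"city": "Paris"}'))  # is"}
```

Concatenating the emitted deltas reproduces the full argument string, which is the invariant a streaming detector has to preserve.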
    # For partial values, we just take what we have so far
    # We don't try JSON parsing for partial values unless they look complete,
    # but simplistic approach is to just treat as string/partial value
    if param_type == "true":
        # For strings, the value is just the content so far
        # We might need to be careful if the value itself contains partial closing tag
        # But greedy match .* at end should capture everything
        parameters[param_name] = param_value
    else:
        # For non-strings (JSON), partial parsing is tricky without a dedicated parser
        # But we can try to return the raw string or try partial json
        parameters[param_name] = param_value
The if param_type == "true": and else: blocks contain identical code: parameters[param_name] = param_value. This conditional is redundant. You can simplify this section by removing the if/else and using a single assignment, which makes the code cleaner and easier to maintain.
    # For partial values, we just take what we have so far.
    # For both string and JSON-like types, we'll take the raw partial value
    # since proper partial JSON parsing is complex.
    parameters[param_name] = param_value

When the buffer contains accumulated content from previous chunks (e.g., when a chunk ends with "<"), the code was returning `new_text` instead of `current_text` when determining that the content is not DSML. This caused previously buffered content to be discarded. For example, when streaming text containing `<user_maybe_say>`:

- Chunk 1: "...<" (buffered, returns empty)
- Chunk 2: "user_maybe_say>..." (returns only new_text, discards "<")

This resulted in `<user_maybe_say>` being output as `user_maybe_say>`, with the leading "<" lost. The fix returns `current_text` (the buffer content) instead of `new_text` (the current chunk only), ensuring no content is lost when the buffer is cleared.
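The buffering behavior described above can be sketched as follows. This is an illustrative stand-in, not the real sglang detector: `TOKEN_START`, `Detector`, and `parse_chunk` are hypothetical names, and the actual tool-call parsing branch is omitted.

```python
TOKEN_START = "<|dsml|"  # illustrative DSML marker, not the real one

class Detector:
    def __init__(self):
        self.buffer = ""

    def parse_chunk(self, new_text: str) -> str:
        """Return the normal-text output for this chunk (tool-call handling omitted)."""
        current_text = self.buffer + new_text
        if current_text.endswith("<"):
            # "<" may begin a DSML tag; buffer everything and emit nothing yet.
            self.buffer = current_text
            return ""
        if not current_text.startswith(TOKEN_START):
            # Plain text after all: clear the buffer and flush ALL of it.
            # Returning only `new_text` here was the bug -- the buffered "<"
            # from the previous chunk would be silently dropped.
            self.buffer = ""
            return current_text
        # (a real detector would parse the tool call here)
        self.buffer = current_text
        return ""

d = Detector()
out = d.parse_chunk("hello <") + d.parse_chunk("user_maybe_say> world")
print(out)  # hello <user_maybe_say> world
```

With the buggy `return new_text`, the same two chunks would produce `hello user_maybe_say> world`, matching the failure mode described in the commit message.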
/tag-and-rerun-ci
Co-authored-by: Muqi Li <muqi1029@gmail.com> Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
The reason I don't implement the full streaming is that I must handle the annoying trailing part of the parameter end token. I have tested this PR, and the problem hasn't been solved. @JustinTong0323 You can reproduce the bad case by setting the interval in the test case to 1, and you will see the failure.

Reproducing script (step 2):

    cd test/registered/function_call
    python -m unittest test_function_call_parser.TestDeepSeekV32Detector.test_streaming_xml_format

BTW, I have made the test stricter; maybe you can cherry-pick that commit into this PR.
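The "interval = 1" reproduction amounts to a round-trip harness like the following sketch. The parser here is a trivial stand-in that only mimics the trailing-"<" buffering (the real `DeepSeekV32Detector` is not reproduced); the point is the one-character-per-chunk assertion that concatenated outputs must reproduce the input exactly.

```python
def make_stub_parser():
    """Stand-in for the real detector: buffers a trailing '<' like the PR's code."""
    state = {"buf": ""}

    def parse(chunk: str) -> str:
        text = state["buf"] + chunk
        if text.endswith("<"):
            state["buf"] = text  # might be the start of a tag: hold it back
            return ""
        state["buf"] = ""
        return text

    return parse

def streamed_roundtrip(full: str, interval: int = 1) -> str:
    """Feed `full` to the parser `interval` characters at a time."""
    parse = make_stub_parser()
    chunks = [full[i:i + interval] for i in range(0, len(full), interval)]
    return "".join(parse(c) for c in chunks)

# Strictest setting: one character per chunk, as suggested in the comment above.
full = "before <user_maybe_say> after"
assert streamed_roundtrip(full, interval=1) == full
print("round-trip OK")
```

A stricter test in this style catches buffering bugs that larger chunk sizes can mask, since small intervals maximize the number of chunk boundaries falling inside tags.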
How did you get tool calling to work? I can't get it to work with DSV3.2 AWQ. hmm..
Could you try again on the latest main?
Hi @Fridge003, thanks for the reminder! But the question is not the same, and there are still some bugs in this PR. I think you SHOULD NOT merge this into main instantly, which is very risky. I have pointed out the bugs in the review, and you can also run the CI with stricter test cases.
The failure case hasn't occurred in my test cases, so we merged it to support this feature and will aim to solve the issue you referenced before the release. Thanks for your check~
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com> Co-authored-by: momaek <momaek17@gmail.com> Co-authored-by: Muqi Li <muqi1029@gmail.com>


Motivation
Fixes #14711
Modifications
Accuracy Tests
Benchmarking and Profiling
Checklist