Describe the bug

Using speculative decoding (EAGLE3) results in empty outputs for structured output requests: message.content comes back as None for both JSON-schema and EBNF constrained generation, even though completion tokens are consumed (finish_reason='length' with completion_tokens equal to max_tokens in the responses below).

Reproduction

$ docker run -it --name "spec_decoding_structured_output" --shm-size 256g --gpus all -v "/tmp:/tmp" --ipc=host --network=host --privileged --entrypoint=bash lmsysorg/sglang:latest
$ docker exec -it spec_decoding_structured_output bash
$ python3 -m sglang.launch_server --port=7080 --model=/tmp/openai/gpt-oss-120b --trust-remote-code --tp=8 --max-queued-requests=256 --tool-call-parser=gpt-oss --reasoning-parser=gpt-oss --speculative-algorithm=EAGLE3 --speculative-draft-model-path=/tmp/lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps=5 --speculative-eagle-topk=4 --speculative-num-draft-tokens=8 --enable-metrics --log-requests-level=1 --enable-request-time-stats-logging
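Before running the client script, a quick readiness check rules out startup races; a minimal sketch using SGLang's wait_for_server helper:

$ python3 -c "from sglang.utils import wait_for_server; wait_for_server('http://127.0.0.1:7080')"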
$ cat json_schema.py
import json

import openai
from pydantic import BaseModel, Field
from sglang.utils import print_highlight

port = 7080
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")


# Define the schema using Pydantic
class CapitalInfo(BaseModel):
    name: str = Field(..., pattern=r"^\w+$", description="Name of the capital city")
    population: int = Field(..., description="Population of the capital city")
response = client.chat.completions.create(
    model="openai/gpt-oss-120b",
    messages=[
        {
            "role": "user",
            "content": "Please generate the information of the capital of France in the JSON format.",
        },
    ],
    temperature=0,
    max_tokens=128,
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "foo",
            # convert the pydantic model to json schema
            "schema": CapitalInfo.model_json_schema(),
        },
    },
)
print(f"===Pydantic json response===\n{response}")
print("\n\n")

# validate the JSON response by the pydantic model
# response_content = response.choices[0].message.content
# capital_info = CapitalInfo.model_validate_json(response_content)
# print_highlight(f"Validated response: {capital_info.model_dump_json()}")
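# Under speculative decoding, message.content above comes back as None even
# though completion_tokens are consumed, which is why the validation stays
# commented out; a minimal sanity check using only names from this script:
if response.choices[0].message.content is None:
    print("BUG: empty content despite completion_tokens being consumed")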
json_schema = json.dumps(
    {
        "type": "object",
        "properties": {
            "name": {"type": "string", "pattern": "^[\\w]+$"},
            "population": {"type": "integer"},
        },
        "required": ["name", "population"],
    }
)
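# Note: this hand-written schema should match CapitalInfo.model_json_schema()
# above (modulo titles/descriptions), so both requests exercise the same
# JSON-schema constrained decoding path.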
response = client.chat.completions.create(
    model="openai/gpt-oss-120b",
    messages=[
        {
            "role": "user",
            "content": "Give me the information of the capital of France in the JSON format.",
        },
    ],
    temperature=0,
    max_tokens=128,
    response_format={
        "type": "json_schema",
        "json_schema": {"name": "foo", "schema": json.loads(json_schema)},
    },
)
print(f"===JSON Schema Directly===:\n{response}")
print("\n\n")
ebnf_grammar = """
root ::= city | description
city ::= "London" | "Paris" | "Berlin" | "Rome"
description ::= city " is " status
status ::= "the capital of " country
country ::= "England" | "France" | "Germany" | "Italy"
"""
response = client.chat.completions.create(
    model="openai/gpt-oss-120b",
    messages=[
        {"role": "system", "content": "You are a helpful geography bot."},
        {
            "role": "user",
            "content": "Give me the information of the capital of France.",
        },
    ],
    temperature=0,
    max_tokens=32,
    extra_body={"ebnf": ebnf_grammar},
)
print(f"===EBNF===\n{response}")
$ python json_schema.py
===Pydantic json response===
ChatCompletion(id='a6b6981bb93b44febb99c098eaeb0383', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None, reasoning_content=None), matched_stop=None)], created=1762913414, model='openai/gpt-oss-120b', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=128, prompt_tokens=81, total_tokens=209, completion_tokens_details=None, prompt_tokens_details=None, reasoning_tokens=0), metadata={'weight_version': 'default'})
===JSON Schema Directly===:
ChatCompletion(id='fb9792fc660a4b5180d7cca9d6ca1353', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None, reasoning_content=None), matched_stop=None)], created=1762913415, model='openai/gpt-oss-120b', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=128, prompt_tokens=81, total_tokens=209, completion_tokens_details=None, prompt_tokens_details=None, reasoning_tokens=0), metadata={'weight_version': 'default'})
===EBNF===
ChatCompletion(id='f2199a2eaf684706bbe6f3ad4bdb17bd', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None, reasoning_content=None), matched_stop=None)], created=1762913415, model='openai/gpt-oss-120b', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=32, prompt_tokens=91, total_tokens=123, completion_tokens_details=None, prompt_tokens_details=None, reasoning_tokens=0), metadata={'weight_version': 'default'})
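Per the bug description, the empty content does not reproduce without speculative decoding; as a control, relaunch with the same flags minus the --speculative-* options and rerun json_schema.py:

$ python3 -m sglang.launch_server --port=7080 --model=/tmp/openai/gpt-oss-120b --trust-remote-code --tp=8 --max-queued-requests=256 --tool-call-parser=gpt-oss --reasoning-parser=gpt-oss --enable-metrics --log-requests-level=1 --enable-request-time-stats-logging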
Environment
/sgl-workspace/sglang# python3 -m sglang.check_env
Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
CUDA available: True
GPU 0,1,2,3,4,5,6,7: NVIDIA H100 80GB HBM3
GPU 0,1,2,3,4,5,6,7 Compute Capability: 9.0
CUDA_HOME: /usr/local/cuda
NVCC: Cuda compilation tools, release 12.9, V12.9.86
CUDA Driver Version: 550.90.07
PyTorch: 2.8.0+cu129
sglang: 0.5.5.post1
sgl_kernel: 0.3.17
flashinfer_python: 0.5.0
flashinfer_cubin: 0.5.0
flashinfer_jit_cache: Module Not Found
triton: 3.4.0
transformers: 4.57.1
torchao: 0.9.0
numpy: 2.3.4
aiohttp: 3.13.2
fastapi: 0.121.1
hf_transfer: 0.1.9
huggingface_hub: 0.36.0
interegular: 0.3.3
modelscope: 1.31.0
orjson: 3.11.4
outlines: 0.1.11
packaging: 25.0
psutil: 7.1.3
pydantic: 2.12.4
python-multipart: 0.0.20
pyzmq: 27.1.0
uvicorn: 0.38.0
uvloop: 0.21.0
vllm: Module Not Found
xgrammar: 0.1.25
openai: 2.6.1
tiktoken: 0.12.0
anthropic: 0.72.0
litellm: Module Not Found
decord2: 2.0.0
NVIDIA Topology:
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity GPU NUMA ID
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 0-51,104-155 0 N/A
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 0-51,104-155 0 N/A
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 0-51,104-155 0 N/A
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 0-51,104-155 0 N/A
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 52-103,156-207 1 N/A
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 52-103,156-207 1 N/A
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 52-103,156-207 1 N/A
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X 52-103,156-207 1 N/A
Legend:
X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
NV# = Connection traversing a bonded set of # NVLinks
Hypervisor vendor: KVM
ulimit soft: 1048576