python -m sglang.launch_server --model /data/models/Qwen3-30B-A3B-MXFP8 --tokenizer-path Qwen/Qwen3-30B-A3B --moe-runner-backend deep_gemm --json-model-override-args "{\"kv_cache_scheme\": null}"
Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00<?, ?it/s]
@@@ weight_name: model.layers.29.mlp.experts.w2_weight
@@@ is_fp4_variant: False
@@@ self.quant_method.__class__.__name__: ModelOptFp8MoEMethod
@@@ per_tensor_conditions: False
@@@ weight_name: model.layers.29.mlp.experts.w2_weight_scale
@@@ is_fp4_variant: False
@@@ self.quant_method.__class__.__name__: ModelOptFp8MoEMethod
@@@ per_tensor_conditions: True
[2026-01-15 01:24:12] Scheduler hit an exception: Traceback (most recent call last):
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 2820, in run_scheduler_process
scheduler = Scheduler(
^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 340, in __init__
self.init_model_worker()
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 521, in init_model_worker
self.init_tp_model_worker()
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 483, in init_tp_model_worker
self.tp_worker = TpModelWorker(
^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 246, in __init__
self._init_model_runner()
File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 329, in _init_model_runner
self._model_runner = ModelRunner(
^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 383, in __init__
self.initialize(min_per_gpu_memory)
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 460, in initialize
self.load_model()
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 889, in load_model
self.model = self.loader.load_model(
^^^^^^^^^^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 2587, in load_model
return super().load_model(
^^^^^^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 645, in load_model
self.load_weights_and_postprocess(
File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 653, in load_weights_and_postprocess
model.load_weights(weights)
File "/sgl-workspace/sglang/python/sglang/srt/models/qwen3_moe.py", line 1094, in load_weights
weight_loader(
File "/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 587, in weight_loader
self._weight_loader_physical(
File "/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 617, in _weight_loader_physical
self._weight_loader_impl(
File "/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 738, in _weight_loader_impl
self._load_per_tensor_weight_scale(
File "/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 308, in _load_per_tensor_weight_scale
param_data[expert_id] = loaded_weight
~~~~~~~~~~^^^^^^^^^^^
RuntimeError: expand(torch.ByteTensor{[2048, 24]}, size=[]): the number of sizes provided (0) must be greater or equal to the number of dimensions in the tensor (2)
[2026-01-15 01:24:12] Received sigquit from a child process. It usually means the child failed.
Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00<?, ?it/s]
Killed
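The shape mismatch can be reproduced outside SGLang with a few lines of PyTorch. This is a minimal sketch, assuming `param_data` is a per-expert vector of scalar scales (consistent with the `size=[]` in the error); `num_experts` is illustrative, and the scale shape `[2048, 24]` is taken from the log above:

import torch

# Per-tensor scale parameter: one scalar slot per expert, so
# param_data[expert_id] is a 0-d view (hence size=[] in the error).
num_experts = 128  # illustrative, not taken from the log
param_data = torch.empty(num_experts)

# What the MXFP8 checkpoint actually ships for w2_weight_scale:
# a 2-D uint8 (E8M0) block-scale tensor, one scale per 32-element block.
loaded_weight = torch.empty(2048, 24, dtype=torch.uint8)

expert_id = 0
# Raises: RuntimeError: expand(torch.ByteTensor{[2048, 24]}, size=[]):
# the number of sizes provided (0) must be greater or equal to the
# number of dimensions in the tensor (2)
param_data[expert_id] = loaded_weight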
Motivation
ModelOpt recently added MXFP8 PTQ support (NVIDIA/Model-Optimizer#736). The current MoE weight-loading code cannot handle these checkpoints: it treats the MXFP8 block-wise weight scales as per-tensor scales, which triggers the shape mismatch above:
sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py, lines 722 to 748 at 424a380
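A fix needs the loader to recognize that these scales are block-wise before taking the `_load_per_tensor_weight_scale` path. A rough sketch of the shape-based check (the helper name and dispatch below are assumptions for illustration, not the actual SGLang API):

import torch

def is_per_tensor_scale(loaded_weight: torch.Tensor) -> bool:
    # A genuine per-tensor scale is a scalar: 0-d or single-element.
    # MXFP8 checkpoints instead ship 2-D E8M0 block scales (e.g. the
    # [2048, 24] tensor above), which must be routed to a block-wise
    # loading path rather than per-tensor assignment.
    return loaded_weight.dim() == 0 or loaded_weight.numel() == 1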