
[Feature] Add MXFP8 quantization support #17093

@zianglih

Checklist

Motivation

ModelOpt recently added MXFP8 PTQ support (NVIDIA/Model-Optimizer#736). The current weight-loading code mishandles MXFP8 MoE checkpoints, treating the blockwise weight scales as per-tensor:

if "ModelOpt" in self.quant_method.__class__.__name__:
# Determine per-tensor weight scale patterns based on variant
is_fp4_variant = isinstance(self.quant_method, ModelOptNvFp4FusedMoEMethod)
# FP4 uses "weight_scale_2" for per-tensor, FP8 uses "weight_scale" for per-tensor
per_tensor_conditions = (
"weight_scale_2" in weight_name
if is_fp4_variant
else "weight_scale" in weight_name
) or "input_scale" in weight_name
if per_tensor_conditions:
self._load_per_tensor_weight_scale(
shard_id=shard_id,
param=param,
loaded_weight=loaded_weight,
expert_id=expert_id,
)
elif "weight" in weight_name:
self._load_model_weight_or_group_weight_scale(
shard_id=shard_id,
shard_dim=shard_dim,
loaded_weight=loaded_weight,
expert_data=expert_data,
tp_rank=tp_rank,
)
return
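
One possible direction for a fix (a sketch under assumptions, not a tested patch): MXFP8 block scales arrive as 2-D uint8 tensors, while genuine per-tensor scales carry a single element, so the per-tensor branch could check the loaded tensor's shape before routing to _load_per_tensor_weight_scale:

# Sketch: route "weight_scale" to the per-tensor loader only when it really
# is a single-element tensor; 2-D MXFP8 block scales then fall through to
# the `elif "weight" in weight_name` branch and take the grouped path.
per_tensor_conditions = (
    "weight_scale_2" in weight_name
    if is_fp4_variant
    else "weight_scale" in weight_name and loaded_weight.numel() == 1
) or "input_scale" in weight_name

Whether _load_model_weight_or_group_weight_scale then slices the [2048, 24] block-scale layout correctly is a separate question; this only keeps the scale from being misclassified as per-tensor.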

python -m sglang.launch_server --model /data/models/Qwen3-30B-A3B-MXFP8 --tokenizer-path Qwen/Qwen3-30B-A3B --moe-runner-backend deep_gemm --json-model-override-args "{\"kv_cache_scheme\": null}"
Loading safetensors checkpoint shards:   0% Completed | 0/7 [00:00<?, ?it/s]
@@@ weight_name: model.layers.29.mlp.experts.w2_weight
@@@ is_fp4_variant: False
@@@ self.quant_method.__class__.__name__: ModelOptFp8MoEMethod
@@@ per_tensor_conditions: False
@@@ weight_name: model.layers.29.mlp.experts.w2_weight_scale
@@@ is_fp4_variant: False
@@@ self.quant_method.__class__.__name__: ModelOptFp8MoEMethod
@@@ per_tensor_conditions: True
[2026-01-15 01:24:12] Scheduler hit an exception: Traceback (most recent call last):
  File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 2820, in run_scheduler_process
    scheduler = Scheduler(
                ^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 340, in __init__
    self.init_model_worker()
  File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 521, in init_model_worker
    self.init_tp_model_worker()
  File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 483, in init_tp_model_worker
    self.tp_worker = TpModelWorker(
                     ^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 246, in __init__
    self._init_model_runner()
  File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 329, in _init_model_runner
    self._model_runner = ModelRunner(
                         ^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 383, in __init__
    self.initialize(min_per_gpu_memory)
  File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 460, in initialize
    self.load_model()
  File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 889, in load_model
    self.model = self.loader.load_model(
                 ^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 2587, in load_model
    return super().load_model(
           ^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 645, in load_model
    self.load_weights_and_postprocess(
  File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 653, in load_weights_and_postprocess
    model.load_weights(weights)
  File "/sgl-workspace/sglang/python/sglang/srt/models/qwen3_moe.py", line 1094, in load_weights
    weight_loader(
  File "/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 587, in weight_loader
    self._weight_loader_physical(
  File "/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 617, in _weight_loader_physical
    self._weight_loader_impl(
  File "/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 738, in _weight_loader_impl
    self._load_per_tensor_weight_scale(
  File "/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 308, in _load_per_tensor_weight_scale
    param_data[expert_id] = loaded_weight
    ~~~~~~~~~~^^^^^^^^^^^
RuntimeError: expand(torch.ByteTensor{[2048, 24]}, size=[]): the number of sizes provided (0) must be greater or equal to the number of dimensions in the tensor (2)

[2026-01-15 01:24:12] Received sigquit from a child process. It usually means the child failed.
Loading safetensors checkpoint shards:   0% Completed | 0/7 [00:00<?, ?it/s]

Killed
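
For context, the crash itself is the generic PyTorch error raised when a multi-dimensional tensor is assigned into a 0-D slot. A minimal standalone reproduction with the shapes from the traceback (w2 in Qwen3-30B-A3B is [2048, 768]; assuming the standard MX block size of 32, its E8M0 block-scale tensor is uint8 with shape [2048, 768/32] = [2048, 24]):

import torch

num_experts = 128  # Qwen3-30B-A3B expert count
# Per-tensor scale parameter: one scalar slot per expert.
param_data = torch.empty(num_experts)
# MXFP8 blockwise scale as stored in the checkpoint: uint8 E8M0 values,
# one per 32-element block of the [2048, 768] w2 weight.
loaded_weight = torch.empty(2048, 24, dtype=torch.uint8)
# Same failure as in _load_per_tensor_weight_scale above:
# RuntimeError: expand(torch.ByteTensor{[2048, 24]}, size=[]): the number of
# sizes provided (0) must be greater or equal to the number of dimensions (2)
param_data[0] = loaded_weight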

Related resources

No response
