Commit 2cc52d8

feat: Support MXFP4 quantized dense models on AMD CDNA2/CDNA3 GPUs (#19143)
1 parent f639425 commit 2cc52d8

8 files changed: 648 additions & 282 deletions
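A sketch of how the new method might be selected once this commit is installed. The offline `Engine` entry point, the `quantization` keyword, and the model path are assumptions for illustration; only the method name `petit_mxfp4` comes from this commit.

```python
# Hypothetical usage sketch, not part of this commit.
# Assumes sglang's offline Engine forwards `quantization` to ServerArgs;
# the model path below is a placeholder, not a checkpoint validated here.
import sglang as sgl

llm = sgl.Engine(
    model_path="path/to/mxfp4-quantized-model",  # placeholder
    quantization="petit_mxfp4",  # method name registered by this commit
)
print(llm.generate("Hello, world!", {"max_new_tokens": 16}))
```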


python/pyproject_other.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -93,7 +93,7 @@ tracing = [
 srt_hip = [
     "sglang[runtime_common]",
     "torch",
-    "petit_kernel==0.0.2",
+    "petit_kernel==0.0.3",
     "wave-lang==3.8.2",
 ]
```
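A minimal sanity check that an environment picked up the bumped kernel pin (a sketch; the version strings are taken from the diff above):

```python
# Verify the installed petit_kernel matches the new pin in pyproject_other.toml.
from importlib.metadata import version

assert version("petit_kernel") == "0.0.3"  # was 0.0.2 before this commit
```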

python/sglang/srt/configs/model_config.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -946,6 +946,7 @@ def _verify_quantization(self) -> None:
             "fbgemm_fp8",
             "w8a8_fp8",
             "petit_nvfp4",
+            "petit_mxfp4",
             "quark",
             "mxfp4",
             "auto-round",
@@ -970,6 +971,7 @@ def _verify_quantization(self) -> None:
             "qoq",
             "w4afp8",
             "petit_nvfp4",
+            "petit_mxfp4",
             "quark",
             "modelslim",
         ]
@@ -978,6 +980,7 @@ def _verify_quantization(self) -> None:
             "modelopt_fp4": ["modelopt"],
             "modelopt_mixed": ["modelopt"],
             "petit_nvfp4": ["modelopt"],
+            "petit_mxfp4": ["mxfp4", "quark"],
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
             "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
         }
```
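The third hunk extends an override-compatibility map: a user-requested method may take over a checkpoint whose serialized quantization format is one of the listed base formats. A minimal self-contained sketch of that check (the dict and helper names here are illustrative, not the repo's exact code):

```python
# Sketch of the override-compatibility check the map above feeds into:
# petit_mxfp4 may replace checkpoints serialized as mxfp4 or quark.
OVERRIDES = {
    "petit_nvfp4": ["modelopt"],
    "petit_mxfp4": ["mxfp4", "quark"],
}

def can_override(user_method: str, checkpoint_method: str) -> bool:
    return checkpoint_method in OVERRIDES.get(user_method, [])

assert can_override("petit_mxfp4", "quark")
assert can_override("petit_mxfp4", "mxfp4")
assert not can_override("petit_mxfp4", "modelopt")
```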

python/sglang/srt/layers/quantization/__init__.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -36,7 +36,8 @@ def override_quantization_method(self, *args, **kwargs):
 from sglang.srt.layers.quantization.modelslim.modelslim import ModelSlimConfig
 from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
 from sglang.srt.layers.quantization.mxfp4 import Mxfp4Config
-from sglang.srt.layers.quantization.petit import PetitNvFp4Config
+from sglang.srt.layers.quantization.petit_mxfp4 import PetitMxfp4Config
+from sglang.srt.layers.quantization.petit_nvfp4 import PetitNvFp4Config
 from sglang.srt.layers.quantization.qoq import QoQConfig
 from sglang.srt.layers.quantization.quark.quark import QuarkConfig
 from sglang.srt.layers.quantization.quark_int4fp8_moe import QuarkInt4Fp8Config
@@ -72,6 +73,7 @@ def override_quantization_method(self, *args, **kwargs):
     "qoq": QoQConfig,
     "w4afp8": W4AFp8Config,
     "petit_nvfp4": PetitNvFp4Config,
+    "petit_mxfp4": PetitMxfp4Config,
     "fbgemm_fp8": FBGEMMFp8Config,
     "quark": QuarkConfig,
     "auto-round": AutoRoundConfig,
```
python/sglang/srt/layers/quantization/petit.py

Lines changed: 7 additions & 249 deletions

```diff
@@ -1,253 +1,11 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
+"""Backward-compatible import shim.

+Use `petit_nvfp4.py` for implementation. Keep this module for existing imports.
+"""

-import logging
-from typing import Any, Dict, List, Optional
-
-import regex as re
-import torch
-from torch.nn.parameter import Parameter
-
-from sglang.srt.layers.linear import LinearBase
-from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
-from sglang.srt.layers.quantization.base_config import (
-    LinearMethodBase,
-    QuantizationConfig,
-    QuantizeMethodBase,
-)
-from sglang.srt.layers.quantization.petit_utils import (
-    apply_petit_nvfp4_linear,
-    prepare_nvfp4_layer_for_petit,
-    verify_petit_nvfp4_supported,
+from sglang.srt.layers.quantization.petit_nvfp4 import (  # noqa: F401
+    PetitNvFp4Config,
+    PetitNvFp4LinearMethod,
 )
-from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
-from sglang.srt.layers.quantization.utils import is_layer_skipped
-from sglang.srt.utils import is_hip
-
-_is_hip = is_hip()
-
-# Initialize logger for the module
-logger = logging.getLogger(__name__)
-
-
-# Configuration class to support the NVFP4 quantized model generated by the ModelOpt quantization tool
-class PetitNvFp4Config(QuantizationConfig):
-    """Config class for Petit FP4."""
-
-    def __init__(
-        self,
-        is_checkpoint_nvfp4_serialized: bool = False,
-        kv_cache_quant_algo: str = None,
-        group_size: int = None,
-        exclude_modules: List[str] = None,
-    ) -> None:
-        self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
-        if is_checkpoint_nvfp4_serialized:
-            logger.warning(
-                "Detected nvfp4 checkpoint. Please note that the "
-                "format is experimental and subject to change."
-            )
-        self.group_size = group_size
-        self.kv_cache_quant_algo = kv_cache_quant_algo
-        self.exclude_modules = exclude_modules
-
-    @classmethod
-    def get_name(cls) -> str:
-        return "petit_nvfp4"
-
-    @classmethod
-    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
-        return [torch.bfloat16, torch.half]
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        # Petit supports the gfx90a and gfx942 GPUs
-        return 90
-
-    @classmethod
-    def get_config_filenames(cls) -> List[str]:
-        return ["hf_quant_config.json"]
-
-    @classmethod
-    def from_config(cls, config: Dict[str, Any]) -> "PetitNvFp4Config":
-        quant_config = cls.get_from_keys(config, ["quantization"])
-        quant_method = quant_config["quant_algo"]
-        group_size = quant_config.get("group_size", None)
-        verify_petit_nvfp4_supported(quant_method, group_size)
-
-        is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
-        kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
-        if not kv_cache_quant_algo:
-            kv_cache_quant_algo = "auto"
-        exclude_modules = quant_config.get("exclude_modules", None)
-        if not (group_size and kv_cache_quant_algo and (exclude_modules is not None)):
-            logger.warning(
-                f"group_size: {group_size},"
-                f"kv_cache_quant_algo: {kv_cache_quant_algo},"
-                f"exclude_modules: {exclude_modules}"
-            )
-            raise ValueError(
-                "NVFP4 quantization requires group size and "
-                "kv_cache_quant_algo specified in "
-                "hf_quant_config.json"
-            )
-        return cls(
-            is_checkpoint_nvfp4_serialized,
-            kv_cache_quant_algo,
-            group_size,
-            exclude_modules,
-        )
-
-    @classmethod
-    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
-        can_convert = cls.is_petit_nvfp4_compatible(hf_quant_cfg)
-        if can_convert:
-            return cls.get_name()
-        return None
-
-    @classmethod
-    def is_petit_nvfp4_compatible(cls, quant_config: Dict[str, Any]) -> bool:
-        quant_method = quant_config.get("quant_method", "").lower()
-        return _is_hip and quant_method == "modelopt"
-
-    def is_layer_excluded(self, prefix: str, exclude_modules: list):
-        for pattern in exclude_modules:
-            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
-            if re.fullmatch(regex_str, prefix):
-                return True
-        return False
-
-    def get_quant_method(
-        self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
-        if isinstance(layer, LinearBase):
-            if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded(
-                prefix, self.exclude_modules
-            ):
-                return UnquantizedLinearMethod()
-            return PetitNvFp4LinearMethod(self)
-        return None
-
-    def get_scaled_act_names(self) -> List[str]:
-        return []
-
-
-class PetitNvFp4LinearMethod(LinearMethodBase):
-    """Linear method for NVFP4.
-    Supports loading NVFP4 checkpoints with the following structure:
-
-    |Tensor Name | datatype | shape |
-    |----------------------------------------------------|
-    |input_scale | torch.float32 | scalar |
-    |weight | NVFP4(SE2M1) | [1, X, y/2] |
-    |weight_scale | FP8-E4M3 | [X, Y] |
-    |weight_scale_2 | torch.float32 | scalar |
-
-    The weights are quantized per block of 16 elements.
-    Args: quant_config: The ModelOpt quantization config.
-    """
-
-    def __init__(self, quant_config: PetitNvFp4Config):
-        self.quant_config = quant_config
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: List[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        del input_size, output_size
-        if not self.quant_config.is_checkpoint_nvfp4_serialized:
-            raise ValueError(
-                "NVFP4 quantization was selected, "
-                " dynamic quantization is not supported."
-            )
-
-        output_size_per_partition = sum(output_partition_sizes)
-        weight_loader = extra_weight_attrs.get("weight_loader")
-
-        layer.logical_widths = output_partition_sizes
-
-        layer.input_size_per_partition = input_size_per_partition
-        layer.output_size_per_partition = output_size_per_partition
-        if input_size_per_partition % 16 != 0:
-            raise ValueError(
-                "Unsupported model when in features size is " "not multiple of 16"
-            )
-
-        weight_dtype = (
-            torch.float8_e4m3fn
-            if self.quant_config.is_checkpoint_nvfp4_serialized
-            else params_dtype
-        )
-
-        weight = ModelWeightParameter(
-            data=torch.empty(
-                # 2 fp4 data is packed in one uint8 in the input dimension
-                output_size_per_partition,
-                input_size_per_partition // 2,
-                dtype=torch.uint8,
-            ),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight", weight)
-
-        input_scale = PerTensorScaleParameter(
-            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
-            weight_loader=weight_loader,
-        )
-
-        layer.register_parameter("input_scale", input_scale)
-
-        weight_scale_2 = PerTensorScaleParameter(
-            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight_scale_2", weight_scale_2)
-
-        weight_scale = ModelWeightParameter(
-            data=torch.empty(
-                output_size_per_partition,
-                input_size_per_partition // self.quant_config.group_size,
-                dtype=weight_dtype,
-            ),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-
-        layer.register_parameter("weight_scale", weight_scale)
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        input_scale_2 = layer.input_scale.max().to(torch.float32)
-        weight_scale_2 = layer.weight_scale_2.max().to(torch.float32)
-        layer.input_scale = Parameter(input_scale_2, requires_grad=False)
-        layer.weight_scale_2 = Parameter(weight_scale_2, requires_grad=False)
-        layer.alpha = Parameter(
-            layer.input_scale * layer.weight_scale_2, requires_grad=False
-        )
-
-        prepare_nvfp4_layer_for_petit(layer)
-        del layer.input_scale

-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        return apply_petit_nvfp4_linear(
-            input=x,
-            weight=layer.weight,
-            weight_scale=layer.weight_scale,
-            weight_scale_2=layer.weight_scale_2,
-            size_n=layer.output_size_per_partition,
-            size_k=layer.input_size_per_partition,
-            bias=bias,
-        )
+__all__ = ["PetitNvFp4Config", "PetitNvFp4LinearMethod"]
```
