Commit 4c4df17

slayton58 authored and pytorchmergebot committed
Add optional out argument to F.scaled_mm (#174395)

Summary:

* Add `out=` argument to `F.scaled_mm` and a basic test.
* Properly guard MXFP4 tests where the build has CUDA but not MSLK; gracefully refuse to run instead of hard-failing.

Test Plan:

```
pytest -v -k "test_float8_out_argument" test/test_scaled_matmul_cuda.py
```

Signed-off-by: Simon Layton <simonlayton@meta.com>

Pull Request resolved: #174395
Approved by: https://github.com/danielvegamyhre
1 parent d0ea7fa · commit 4c4df17

4 files changed

Lines changed: 43 additions & 3 deletions

test/test_scaled_matmul_cuda.py

Lines changed: 28 additions & 3 deletions
```diff
@@ -26,6 +26,7 @@
     PLATFORM_SUPPORTS_FP8,
     PLATFORM_SUPPORTS_FP8_GROUPED_GEMM,
     PLATFORM_SUPPORTS_MX_GEMM,
+    PLATFORM_SUPPORTS_MXFP4_GEMM,
     PLATFORM_SUPPORTS_MXFP8_GROUPED_GEMM,
     SM100OrLater,
     SM120OrLater,
@@ -218,6 +219,7 @@ def scaled_mm_wrap(
     use_fast_accum=False,
     bias=None,
     wrap_v2=wrap,
+    out=None,
 ):
     if not wrap_v2:
         return torch._scaled_mm(
@@ -249,6 +251,7 @@ def scaled_mm_wrap(
             bias=bias,
             output_dtype=out_dtype,
             use_fast_accum=use_fast_accum,
+            out=out,
         )
         return out

@@ -706,6 +709,23 @@ def test_float8_scale(self, device) -> None:
         out_fp8_s = scaled_mm_wrap(x, y, scale_a=scale_a, scale_b=scale_b)
         self.assertEqual(out_fp8, out_fp8_s)

+    def test_float8_out_argument(self, device) -> None:
+        if not _device_supports_scaled_mm_fp8(device):
+            raise unittest.SkipTest(f8_msg)
+        size = (16, 16)
+        x = torch.full(size, .5, device=device, dtype=e4m3_type)
+        # hipblaslt does not yet support mixed e4m3_type input
+        y_type = e4m3_type if torch.version.hip else e5m2_type
+        y = torch.full(size, .5, device=device, dtype=y_type).t()
+
+        out = torch.empty(size, device=device, dtype=torch.bfloat16)
+
+        scale_one = torch.tensor(1.0, device=device)
+        out_fp8 = scaled_mm_wrap(x, y, scale_a=scale_one, scale_b=scale_one, out=out)
+
+        if out_fp8.data_ptr() != out.data_ptr():
+            raise AssertionError("out_fp8 and out must have the same data pointers")
+

     @unittest.skipIf(not PLATFORM_SUPPORTS_MXFP8_GROUPED_GEMM, mxfp8_grouped_mm_skip_msg)
     @parametrize("G", [1, 4, 16])
@@ -716,9 +736,12 @@ def test_float8_scale(self, device) -> None:
     def test_mxfp8_nvfp4_scaled_grouped_mm_2d_2d(self, G, M, N, K, format):
         torch.manual_seed(42)

-        if format == "mxfp4" and SM120OrLater:
+        if (format == "mxfp4") and SM120OrLater and (not PLATFORM_SUPPORTS_MXFP4_GEMM):
             raise unittest.SkipTest("MXFP4 on CUDA only supported on B200/B300")

+        if (format == "mxfp4") and (not PLATFORM_SUPPORTS_MXFP4_GEMM):
+            raise unittest.SkipTest("MXFP4 not supported on this platform - build with MSLK support")
+
         total_K = K  # Alias for clarity, communicating this consists of several groups along this dim
         input_group_end_offsets = generate_jagged_offs(
             G, total_K, multiple_of=32, device="cuda"
@@ -786,8 +809,10 @@ def test_mxfp8_nvfp4_scaled_grouped_mm_2d_2d(self, G, M, N, K, format):
     def test_mxfp8_scaled_grouped_mm_2d_3d(self, G, M, N, K, format):
         torch.manual_seed(42)

-        if format == "mxfp4" and SM120OrLater:
+        if (format == "mxfp4") and SM120OrLater:
             raise unittest.SkipTest("MXFP4 on CUDA only supported on B200/B300")
+        if (format == "mxfp4") and (not PLATFORM_SUPPORTS_MXFP4_GEMM):
+            raise unittest.SkipTest("MXFP4 not supported on this platform - build with MSLK support")

         # Simulate 2d-3d grouped gemm `out = input @ weight.t()`
         # 2D inputs with groups along M, 3D weights.
@@ -1894,7 +1919,7 @@ def test_blockwise_mxfp8_nvfp4_mxfp4_numerics(self, test_case_name, fast_accum,
             raise unittest.SkipTest("nvfp4 not supported on ROCm, skipping")
         if (recipe == "nvfp4" or recipe == "mxfp4") and fast_accum:
             raise unittest.SkipTest("fast_accum not supported in nvfp4/mxfp4 cublas gemm, skipping")
-        if recipe == "mxfp4" and SM120OrLater:
+        if (recipe == "mxfp4") and SM120OrLater or (not PLATFORM_SUPPORTS_MXFP4_GEMM):
             raise unittest.SkipTest("MXFP4 on CUDA only supported on B200/B300")

         device = "cuda"
```

torch/nn/functional.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -6755,6 +6755,8 @@ def scaled_mm(
     output_dtype: torch.dtype | None = torch.bfloat16,
     contraction_dim: list[int] | tuple[int, ...] = (),
     use_fast_accum: bool = False,
+    *,
+    out: Optional[torch.Tensor] = None,
 ) -> Tensor:
     r"""
     scaled_mm(mat_a, mat_b, scale_a, scale_recipe_a, scale_b, scale_recipe_b, swizzle_a, swizzle_b, bias, output_dtype,
@@ -6774,6 +6776,7 @@ def scaled_mm(
         output_dtype: dtype used for the output tensor
         contraction_dim: describe which dimensions are :math:`K` in the matmul.
         use_fast_accum: enable/disable tensor-core fast accumulation (Hopper-GPUs only)
+        out: User-provided output tensor
     """

     def expand_single_value(v: _Any | list[_Any] | None) -> list[_Any]:
@@ -6821,6 +6824,7 @@ def enum_list_as_int_list(l: _Any | list[_Any]) -> list[_Any]:
         output_dtype,
         contraction_dim,
         use_fast_accum,
+        out=out,
     )

     return out
```
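For context, the buffer-reuse contract the new test exercises looks like the following minimal sketch. It is not code from this PR: it is written against the lower-level `torch._scaled_mm` (assuming its existing `out=` overload) on an FP8-capable CUDA device; `F.scaled_mm` now forwards `out=` to the same place.

```python
# Minimal sketch of the out-buffer pattern, not code from this PR.
# Assumes an FP8-capable CUDA device and torch._scaled_mm's out= overload.
import torch

size = (16, 16)
x = torch.full(size, 0.5, device="cuda", dtype=torch.float8_e4m3fn)
# Column-major second operand, as in the new test.
y = torch.full(size, 0.5, device="cuda", dtype=torch.float8_e5m2).t()
scale = torch.tensor(1.0, device="cuda")

# Preallocate the result; the kernel writes into it instead of allocating.
out = torch.empty(size, device="cuda", dtype=torch.bfloat16)
result = torch._scaled_mm(
    x, y, scale_a=scale, scale_b=scale, out_dtype=torch.bfloat16, out=out
)
assert result.data_ptr() == out.data_ptr()  # same storage, no new allocation
```

Reusing a caller-owned buffer this way avoids a fresh allocation per call, which matters in tight inference loops and when capturing CUDA graphs.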

torch/nn/functional.pyi.in

Lines changed: 2 additions & 0 deletions
```diff
@@ -729,6 +729,8 @@ def scaled_mm(
     output_dtype: _dtype = ...,
     contraction_dim: list[int] | tuple[int, ...] = (),
     use_fast_accum: bool = False,
+    *,
+    out: Tensor | None = None,
 ) -> Tensor: ...

 __all__ += ["scaled_mm"]
```

torch/testing/_internal/common_cuda.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -191,7 +191,16 @@ def evaluate_platform_supports_mxfp8_grouped_gemm():
         return built_with_mslk and IS_SM100
     return False

+def evaluate_platform_supports_mxfp4_gemm():
+    if torch.cuda.is_available():
+        built_with_mslk = "USE_MSLK" in torch.__config__.show()
+        return bool(torch.version.hip) or built_with_mslk
+
+    return False
+
+
 PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: evaluate_platform_supports_mx_gemm())
+PLATFORM_SUPPORTS_MXFP4_GEMM: bool = LazyVal(lambda: evaluate_platform_supports_mxfp4_gemm())
 PLATFORM_SUPPORTS_FP8: bool = LazyVal(lambda: evaluate_platform_supports_fp8())
 PLATFORM_SUPPORTS_FP8_GROUPED_GEMM: bool = LazyVal(lambda: evaluate_platform_supports_fp8_grouped_gemm())
 PLATFORM_SUPPORTS_MXFP8_GROUPED_GEMM: bool = LazyVal(lambda: evaluate_platform_supports_mxfp8_grouped_gemm())
```
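The new flag slots into the existing `LazyVal` capability-gate pattern: per the diff, MXFP4 counts as supported on ROCm builds or on CUDA builds compiled with MSLK. As a sketch (the class and test names below are hypothetical, not from this PR), a test consumes it through the same `skipIf` guard the test file already applies for MXFP8 grouped GEMM:

```python
# Sketch: gating a test on the new capability flag, following the
# skipIf pattern used for PLATFORM_SUPPORTS_MXFP8_GROUPED_GEMM above.
# TestMXFP4 / test_mxfp4_roundtrip are hypothetical names.
import unittest

import torch
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_MXFP4_GEMM


class TestMXFP4(unittest.TestCase):
    @unittest.skipIf(
        not PLATFORM_SUPPORTS_MXFP4_GEMM,
        "MXFP4 not supported on this platform - build with MSLK support",
    )
    def test_mxfp4_roundtrip(self):
        # Reaching here implies a ROCm build or a CUDA build with MSLK.
        self.assertTrue(torch.cuda.is_available())
```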
