fix: K=64 block-scaled GEMM TMA layout fixes for SM120

Brandon Music · claude · Brandon Music · commit ede914e4815e · 2026-03-20T20:43:40.000-04:00
Two fixes enabling K=64 block-scaled MoE GEMM on SM120 (99KB SMEM):

1. copy_traits_sm90_tma.hpp: Handle zero-stride basis elements in
   fill_tma_gmem_shape_stride. When a basis element is constant-zero
   (broadcast dimension for SFVectorSize), basis_get returns the entire
   tuple instead of a scalar. Detect is_constant<0> and set shape=1,
   stride=0 directly.

2. sm120_blockscaled_mma_builder.inl: Clamp Blk_SF to
   min(K/SFVectorSize, Blk_SF). When the tile K is too small for the
   default block size and the clamped block is still larger than MMA_NSF,
   fold the clamped block into kBasicBlockShape. This keeps outer K
   dimensions trivial so TMA can construct valid descriptors.

For K=64 with SFVectorSize=32: K/SFVectorSize=2 < Blk_SF=4, so the
integer division K/SFVectorSize/Blk_SF evaluated to 0, which previously
created a zero-size dimension in the scale factor SMEM layout,
triggering "TMA requires CTA_Tile and SLayout top-level size
equivalence."

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/include/cute/atom/copy_traits_sm90_tma.hpp b/include/cute/atom/copy_traits_sm90_tma.hpp
@@ -869,8 +869,14 @@ fill_tma_gmem_shape_stride(Tensor<GEngine,GLayout>   const& gtensor,           /
     if constexpr (tma_i_rank == 1) {
       // Trivial contribution of this gmem mode to this tma mode
       auto ej = unwrap(get<i>(tma_gbasis_stride));
-      gmem_prob_shape[i]  = basis_get(ej, gmem_shape);
-      gmem_prob_stride[i] = basis_get(ej, gmem_stride);
+      if constexpr (cute::is_constant<0, decltype(ej)>::value) {
+        // Zero-stride basis: broadcast dimension (e.g. SFVectorSize), shape=1, stride=0
+        gmem_prob_shape[i]  = 1;
+        gmem_prob_stride[i] = 0;
+      } else {
+        gmem_prob_shape[i]  = basis_get(ej, gmem_shape);
+        gmem_prob_stride[i] = basis_get(ej, gmem_stride);
+      }
     } else {
       // Apply a recurrence to each gmem mode that contributes to this tma mode
       for_each(get<i>(tma_gbasis_stride), [&](auto ej) {
diff --git a/include/cutlass/gemm/collective/builders/sm120_blockscaled_mma_builder.inl b/include/cutlass/gemm/collective/builders/sm120_blockscaled_mma_builder.inl
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2025 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -177,31 +177,53 @@ struct CollectiveBuilder<
   using SmemCopyAtomsB = decltype(cute::make_tuple(SmemCopyAtomB{}, SmemCopyAtomSFB{}));
 
   // Construct SMEM layout for SF
-  // A single indivisible block will hold 4 scale factors of 128 rows/columns (A/B matrix).
-  // 4 is chosen to make consecutive 32bits of data to have scale factors for only a single row (col). 32bits corresponds to the TMEM word size 
+  // A single indivisible block will hold Blk_SF (4) scale factors of 128 rows/columns (A/B matrix).
+  // 4 is chosen to make consecutive 32bits of data to have scale factors for only a single row (col). 32bits corresponds to the TMEM word size
   using Blk_MN    = typename Sm1xxBlkScaledConfig::Blk_MN;
-  using Blk_SF    = typename Sm1xxBlkScaledConfig::Blk_SF; 
-  using Blk_Elems = decltype(Blk_MN{} * Blk_SF{});
+  using Blk_SF    = typename Sm1xxBlkScaledConfig::Blk_SF;
+
+  // For tiles where K/SFVectorSize < Blk_SF (e.g. K=64 with SFVectorSize=32 gives only 2 SF
+  // values along K, but Blk_SF=4), clamp the effective block size to avoid TMA layout issues.
+  // When EffBlk_SF < Blk_SF AND MMA_NSF < EffBlk_SF, we fold EffBlk_SF into the kBasicBlock
+  // so that the outer K shape is trivial (all 1s) and gets collapsed by TMA, avoiding nested
+  // tuple types that can't convert to uint64_t in fill_tma_gmem_shape_stride.
+  static constexpr int NumSFAlongK = size<2>(TileShape_MNK{}) / SFVectorSize;
+  using EffBlk_SF    = Int<(NumSFAlongK < Blk_SF{}) ? NumSFAlongK : int(Blk_SF{})>;
+  using EffBlk_Elems = decltype(Blk_MN{} * EffBlk_SF{});
+
+  // Determine if we need to fold EffBlk_SF into the basic block to keep TMA layouts flat.
+  // This is needed when EffBlk_SF > MMA_NSF (i.e. the outer K shape would be non-trivial).
+  static constexpr bool FoldSFIntoBasicBlock = (NumSFAlongK < Blk_SF{}) && (int(EffBlk_SF{}) > MMA_NSF);
 
   // Basic storage block for new Scaling Factor Layouts
   using mnBasicBlockShape  =  Shape<_32,_4>;
   using mnBasicBlockStride = Stride<_16,_4>;
-  using kBasicBlockShape  = Shape<Int<SFVectorSize>, Int<MMA_NSF>>;
+  // When folding: kBasicBlock absorbs EffBlk_SF, making outer K shape all-1 (trivially collapsed by TMA)
+  // When not folding: original kBasicBlock with MMA_NSF
+  using kBasicBlockShape  = cute::conditional_t<FoldSFIntoBasicBlock,
+      Shape<Int<SFVectorSize>, EffBlk_SF>,
+      Shape<Int<SFVectorSize>, Int<MMA_NSF>>>;
   using kBasicBlockStride = Stride<_0, _1>;
-  
+
+  // Outer K shape: when folded, both dimensions are 1 (trivial); when not folded, original formula
+  using OuterK0 = cute::conditional_t<FoldSFIntoBasicBlock, _1, decltype(EffBlk_SF{}/Int<MMA_NSF>{})>;
+  using OuterK1 = decltype(size<2>(TileShape_MNK{}) / Int<SFVectorSize>{} / EffBlk_SF{});
+  // Outer K stride: first element is EffBlk_SF when folded (doesn't matter since shape=1), MMA_NSF when not
+  using OuterKS0 = cute::conditional_t<FoldSFIntoBasicBlock, EffBlk_SF, Int<MMA_NSF>>;
+
   using sSFA_shapeM       = decltype(prepend(size<0>(TileShape_MNK{}) / Blk_MN{},   mnBasicBlockShape{}));
-  using sSF_strideMN      = decltype(prepend(                        Blk_Elems{},  mnBasicBlockStride{}));
+  using sSF_strideMN      = decltype(prepend(                      EffBlk_Elems{},  mnBasicBlockStride{}));
   using sSFA_strideM      = sSF_strideMN;
-  using sSF_shapeK        = decltype(prepend(make_shape( Blk_SF{}/Int<MMA_NSF>{},   size<2>(TileShape_MNK{}) / Int<SFVectorSize>{} / Blk_SF{}),  kBasicBlockShape{}));
-  
-  using sSFA_strideK      = decltype(prepend(make_stride(         Int<MMA_NSF>{},   size<0>(TileShape_MNK{}) / Blk_MN{} * Blk_Elems{}), kBasicBlockStride{}));
+  using sSF_shapeK        = decltype(prepend(make_shape( OuterK0{},   OuterK1{}),  kBasicBlockShape{}));
+
+  using sSFA_strideK      = decltype(prepend(make_stride(         OuterKS0{},   size<0>(TileShape_MNK{}) / Blk_MN{} * EffBlk_Elems{}), kBasicBlockStride{}));
   using sSFA_shape        = decltype(make_shape(  sSFA_shapeM{},   sSF_shapeK{}));
   using sSFA_stride       = decltype(make_stride(sSFA_strideM{}, sSFA_strideK{}));
   using SmemLayoutAtomSFA = decltype(make_layout(  sSFA_shape{},  sSFA_stride{}));
 
   using sSFB_shapeN       = decltype(prepend(size<1>(TileShape_MNK{}) / Blk_MN{},   mnBasicBlockShape{}));
   using sSFB_strideN      = sSF_strideMN;
-  using sSFB_strideK      = decltype(prepend(make_stride(Int<MMA_NSF>{},   size<1>(TileShape_MNK{}) / Blk_MN{} * Blk_Elems{}), kBasicBlockStride{}));
+  using sSFB_strideK      = decltype(prepend(make_stride(OuterKS0{},   size<1>(TileShape_MNK{}) / Blk_MN{} * EffBlk_Elems{}), kBasicBlockStride{}));
   using sSFB_shape        = decltype(make_shape(  sSFB_shapeN{},   sSF_shapeK{}));
   using sSFB_stride       = decltype(make_stride(sSFB_strideN{}, sSFB_strideK{}));
   using SmemLayoutAtomSFB = decltype(make_layout(  sSFB_shape{},  sSFB_stride{}));