diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir
index 06a6e2279b6a7..e976319b6c72f 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN:   -transform-interpreter -test-transform-dialect-erase-schedule \
+// RUN:   -transform-interpreter="debug-payload-root-tag=payload" -test-transform-dialect-erase-schedule \
 // RUN:   -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN:   -test-lower-to-arm-sme -test-lower-to-llvm | \
 // RUN: %mcr_aarch64_cmd \
@@ -8,6 +8,10 @@
 // RUN:   -shared-libs=%native_mlir_runner_utils,%native_mlir_c_runner_utils,%native_arm_sme_abi_shlib | \
 // RUN: FileCheck %s
 
+module @payload attributes { transform.target_tag = "payload" } {
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+
 func.func @matmul_transpose_a(%A : tensor<?x?xf32>, %B : tensor<?x?xf32>, %C : tensor<?x?xf32>) {
   %res = linalg.matmul_transpose_a ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>)
                                    outs(%C: tensor<?x?xf32>) -> tensor<?x?xf32>
@@ -54,6 +58,8 @@ func.func @main() {
   return
 }
 
+} // end module @payload
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%module : !transform.any_op {transform.readonly}) {
     %matmul_transpose_a = transform.structured.match ops{["linalg.matmul_transpose_a"]} in %module
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir
index 29b0bc0c19606..e6915722b792d 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN:   -transform-interpreter -test-transform-dialect-erase-schedule \
+// RUN:   -transform-interpreter="debug-payload-root-tag=payload" -test-transform-dialect-erase-schedule \
 // RUN:   -test-lower-to-arm-sme -test-lower-to-llvm | \
 // RUN: %mcr_aarch64_cmd \
 // RUN:   -e=main -entry-point-result=void \
@@ -7,6 +7,10 @@
 // RUN:   -shared-libs=%native_mlir_runner_utils,%native_mlir_c_runner_utils,%native_arm_sme_abi_shlib | \
 // RUN: FileCheck %s
 
+module @payload attributes { transform.target_tag = "payload" } {
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+
 func.func @matmul(%A : tensor<?x?xf32>, %B : tensor<?x?xf32>, %C : tensor<?x?xf32>) {
   %res = linalg.matmul ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>)
                        outs(%C: tensor<?x?xf32>) -> tensor<?x?xf32>
@@ -52,6 +56,8 @@ func.func @main() {
   return
 }
 
+} // end module @payload
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%module : !transform.any_op {transform.consumed}) {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %module
@@ -101,5 +107,3 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-
-func.func private @printMemrefF32(%ptr : tensor<*xf32>)
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir
index 78815a38612e9..3400986028c04 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul-mixed-types.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s \
-// RUN:   -transform-interpreter -test-transform-dialect-erase-schedule  \
-// RUN:   -one-shot-bufferize="bufferize-function-boundaries" -canonicalize \
+// RUN:   -transform-interpreter="debug-payload-root-tag=payload" -test-transform-dialect-erase-schedule \
+// RUN:   -canonicalize \
 // RUN:   -test-lower-to-arm-sme -convert-vector-to-llvm="enable-arm-sve" \
 // RUN:   -test-lower-to-llvm | \
 // RUN: %mcr_aarch64_cmd \
@@ -9,6 +9,10 @@
 // RUN:   -shared-libs=%native_mlir_runner_utils,%native_mlir_c_runner_utils,%native_arm_sme_abi_shlib,%native_mlir_arm_runner_utils | \
 // RUN: FileCheck %s
 
+module @payload attributes { transform.target_tag = "payload" } {
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+
 /// This is very similar to the SME multi-tile-matmul.mlir test, except that it
 /// tests a mixed i8 to i32 matmul and outer product fusion which fuses 16
 /// outer products (four per tile) into four 4-way outer products.
@@ -66,6 +70,8 @@ func.func @main() {
   return
 }
 
+} // end module @payload
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%module : !transform.any_op {transform.consumed}) {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %module
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir
index 243f9e5cde9f5..01aae32cb1203 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s \
-// RUN:   -transform-interpreter -test-transform-dialect-erase-schedule  \
-// RUN:   -one-shot-bufferize="bufferize-function-boundaries" -canonicalize \
+// RUN:   -transform-interpreter="debug-payload-root-tag=payload" -test-transform-dialect-erase-schedule \
+// RUN:   -canonicalize \
 // RUN:   -test-lower-to-arm-sme -convert-vector-to-llvm="enable-arm-sve" \
 // RUN:   -test-lower-to-llvm | \
 // RUN: %mcr_aarch64_cmd \
@@ -9,6 +9,11 @@
 // RUN:   -shared-libs=%native_mlir_runner_utils,%native_mlir_c_runner_utils,%native_arm_sme_abi_shlib,%native_mlir_arm_runner_utils | \
 // RUN: FileCheck %s
 
+module @payload attributes { transform.target_tag = "payload" } {
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+func.func private @setArmSVLBits(%bits : i32)
+
 /// This is very similar to the SME matmul.mlir test, except that it uses a tile
 /// size of [8]x[8]xf32, which is larger than a 32-bit SME virtual tile, which
 /// would be [4]x[4]xf32. The [8]x[8] tile will be decomposed into four
@@ -65,6 +70,8 @@ func.func @main() {
   return
 }
 
+} // end module @payload
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%module : !transform.any_op {transform.consumed}) {
     %matmul = transform.structured.match ops{["linalg.matmul"]} in %module
@@ -107,6 +114,3 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-
-func.func private @printMemrefF32(%ptr : tensor<*xf32>)
-func.func private @setArmSVLBits(%bits : i32)