@@ -1011,59 +1011,49 @@ class CompiledOptimizerBitwiseTests(TestCase):
     def _test_optimizer_bitwise(
         test_case,
         optim_cls,
+        kernel_count=None,
         num_steps=10,
         **optim_kwargs,
     ):
         """Helper to test optimizer bitwise equality."""
         torch._dynamo.reset()
+        torch._inductor.metrics.reset()
         torch.manual_seed(42)
 
-        params_eager = [
-            torch.randn(64, 64, device=GPU_TYPE, dtype=torch.float32),
-            torch.randn(32, 32, device=GPU_TYPE, dtype=torch.float32),
-        ]
-        params_compiled = [p.clone() for p in params_eager]
-
-        opt_eager = optim_cls(
-            params_eager,
-            **optim_kwargs,
-        )
-        opt_compiled = optim_cls(
-            params_compiled,
-            **optim_kwargs,
+        input = torch.ones([10, 10], device=GPU_TYPE)
+        model_eager = torch.nn.Sequential(
+            *[torch.nn.Linear(10, 10, device=GPU_TYPE) for _ in range(2)]
         )
+        model_eager(input).sum().backward()
 
-        @torch.compile
-        def compiled_step():
-            opt_compiled.step()
+        model_compiled = deepcopy(model_eager)
+        model_compiled(input).sum().backward()
 
-        for step in range(num_steps):
-            # Generate gradients with consistent seed
-            torch.manual_seed(1000 + step)
-            grads = [torch.randn_like(p) for p in params_eager]
-
-            for p, g in zip(params_eager, grads):
-                p.grad = g.clone()
-            for p, g in zip(params_compiled, grads):
-                p.grad = g.clone()
+        opt_eager = optim_cls(model_eager.parameters(), **optim_kwargs)
+        opt_compiled = optim_cls(model_compiled.parameters(), **optim_kwargs)
+        compiled_step = compile_opt(opt_compiled)
 
-            opt_eager.step()
-            compiled_step()
+        with torch.set_grad_enabled(False):
+            for step in range(num_steps):
+                compiled_step()
+                opt_eager.step()
 
-            # Check bitwise equality
-            for i, (p_eager, p_compiled) in enumerate(
-                zip(params_eager, params_compiled)
-            ):
-                test_case.assertEqual(
-                    p_eager,
-                    p_compiled,
-                    atol=0,
-                    rtol=0,
-                    msg=f"Step {step + 1}, param {i}: params differ",
-                )
+                # Check bitwise equality
+                for i, (p_eager, p_compiled) in enumerate(
+                    zip(model_eager.parameters(), model_compiled.parameters())
+                ):
+                    test_case.assertEqual(
+                        p_eager,
+                        p_compiled,
+                        atol=0,
+                        rtol=0,
+                        msg=f"Step {step + 1}, param {i}: params differ",
+                    )
 
         # Also check optimizer state
-        for p_eager, p_compiled in zip(params_eager, params_compiled):
+        for p_eager, p_compiled in zip(
+            model_eager.parameters(), model_compiled.parameters()
+        ):
             for key in opt_eager.state[p_eager]:
                 eager_val = opt_eager.state[p_eager][key]
                 compiled_val = opt_compiled.state[p_compiled][key]
@@ -1076,6 +1066,14 @@ def compiled_step():
                     msg=f"State '{key}' differs",
                 )
 
+        if kernel_count is not None and test_case.check_kernel_count:
+            if isinstance(kernel_count, types.LambdaType):
+                kernel_count(str(torch._inductor.metrics.generated_kernel_count))
+            else:
+                test_case.assertEqual(
+                    torch._inductor.metrics.generated_kernel_count, kernel_count
+                )
+
 
 for optim_cls, name, kwargs, scheduler_cls in COMPILED_OPT_KWARG_DB:
     setattr(
@@ -1085,19 +1083,20 @@ def compiled_step():
     )
 
 
-def _make_bitwise_test(optim_cls, **optim_kwargs):
+def _make_bitwise_test(optim_cls, kernel_count=None, **optim_kwargs):
     @skipIfRocm(msg="ROCm may have different numerical behavior")
     @requires_cuda_and_triton
     @config.patch(
         {
+            "score_fusion_memory_threshold": 1,
             "eager_numerics.division_rounding": True,
             "eager_numerics.use_pytorch_libdevice": True,
             "emulate_precision_casts": True,
         }
     )
     def test_fn(self):
         CompiledOptimizerBitwiseTests._test_optimizer_bitwise(
-            self, optim_cls, **optim_kwargs
+            self, optim_cls, kernel_count=kernel_count, **optim_kwargs
         )
 
     return test_fn
@@ -1117,6 +1116,7 @@ def test_fn(self):
 # SGD doesn't support capturable but has no item() calls
 # so it compiles without graph breaks and can be tested bitwise.
 _BITWISE_NON_CAPTURABLE_OPTIMS = (SGD,)
+
 for optim_cls, name, kwargs, scheduler_cls in COMPILED_OPT_KWARG_DB:
     if (
         kwargs.get("device") == GPU_TYPE
@@ -1130,13 +1130,24 @@ def test_fn(self):
             or optim_cls in _BITWISE_NON_CAPTURABLE_OPTIMS
         )
     ):
+        bitwise_name = name.replace("test_", "test_bitwise_")
+        # Use the same kernel count as the non-bitwise test, including
+        # any overrides for specific test configurations.
+        if name in KERNEL_COUNT_OVERRIDES:
+            kernel_count = KERNEL_COUNT_OVERRIDES[name]
+        else:
+            kernel_count = (
+                KERNEL_COUNTS[optim_cls].multitensor
+                if kwargs.get("foreach", False)
+                else KERNEL_COUNTS[optim_cls].singletensor
+            )
         optim_kwargs = {
             k: v for k, v in kwargs.items() if k not in ("device", "kernel_count")
         }
         setattr(
             CompiledOptimizerTests,
-            name.replace("test_", "test_bitwise_"),
-            _make_bitwise_test(optim_cls, **optim_kwargs),
+            bitwise_name,
+            _make_bitwise_test(optim_cls, kernel_count=kernel_count, **optim_kwargs),
         )
 
 
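Note on the pattern above: the new helper steps two copies of the same two-layer model, one with the optimizer run eagerly and one with the step compiled, and compares with atol=0/rtol=0, which turns assertEqual into a bitwise check. Below is a minimal standalone sketch of that pattern; the compile_opt here is a hypothetical stand-in for the helper defined elsewhere in this file, and without the diff's eager_numerics/emulate_precision_casts config patches, exact equality is not guaranteed to hold.

from copy import deepcopy

import torch


def compile_opt(opt):
    # Hypothetical stand-in: compile a bare optimizer step.
    @torch.compile
    def step():
        opt.step()

    return step


def check_bitwise(optim_cls, num_steps=3, **optim_kwargs):
    torch.manual_seed(42)
    inp = torch.ones(10, 10, device="cuda")
    model_eager = torch.nn.Sequential(
        *[torch.nn.Linear(10, 10, device="cuda") for _ in range(2)]
    )
    model_eager(inp).sum().backward()

    # Parameter.__deepcopy__ clones weights but drops .grad, so the second
    # backward recreates gradients identical to the eager model's.
    model_compiled = deepcopy(model_eager)
    model_compiled(inp).sum().backward()

    opt_eager = optim_cls(model_eager.parameters(), **optim_kwargs)
    opt_compiled = optim_cls(model_compiled.parameters(), **optim_kwargs)
    compiled_step = compile_opt(opt_compiled)

    # Grad stays disabled: the loop runs optimizer math only, so any
    # divergence is attributable to the compiled step itself.
    with torch.set_grad_enabled(False):
        for step in range(num_steps):
            compiled_step()
            opt_eager.step()
            for i, (p_e, p_c) in enumerate(
                zip(model_eager.parameters(), model_compiled.parameters())
            ):
                # rtol=0 / atol=0 makes the comparison exact (bitwise).
                torch.testing.assert_close(
                    p_e, p_c, rtol=0, atol=0,
                    msg=f"step {step + 1}, param {i} diverged",
                )


check_bitwise(torch.optim.SGD, lr=0.01)

Generating the gradients once before the loop is the design choice worth noting: every iteration reuses the same .grad tensors, so the comparison isolates the optimizer's arithmetic from any forward/backward nondeterminism.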
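The kernel_count plumbing in the later hunks reuses Inductor's generated-kernel counter. The sketch below shows the mechanism, assuming a CUDA device with Triton available and that the compiled step is the only compilation in the process; the expected counts in the generated tests come from KERNEL_COUNTS / KERNEL_COUNT_OVERRIDES, which are defined elsewhere in this file.

import torch
import torch._inductor.metrics as metrics

torch._dynamo.reset()
metrics.reset()  # zero generated_kernel_count before compiling

p = torch.nn.Parameter(torch.randn(8, 8, device="cuda"))
p.grad = torch.randn(8, 8, device="cuda")
opt = torch.optim.SGD([p], lr=0.1)


@torch.compile
def step():
    opt.step()


with torch.no_grad():
    step()  # first call compiles and bumps the kernel counter

# The generated tests assert this equals a per-optimizer expected count,
# or pass it (as a string) to a lambda override, per the diff above.
print(metrics.generated_kernel_count)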