Merge 8b570d6 into 5f038ad

dumko2001 · web-flow · commit 78c5c5594440 · 2026-05-09T16:23:55.000Z
diff --git a/test/inductor/test_efficient_conv_bn_eval.py b/test/inductor/test_efficient_conv_bn_eval.py
@@ -95,6 +95,69 @@ def forward(self, x):
 
 
 class EfficientConvBNEvalTemplate(TestCase):
+    @tf32_on_and_off(0.003)
+    @inductor_config.patch({"efficient_conv_bn_eval_fx_passes": True})
+    def test_functional_batch_norm_defaults(self):
+        class Model(torch.nn.Module):
+            def forward(self, x, mean, var):
+                return torch.nn.functional.batch_norm(x, mean, var)
+
+        mod = Model().eval()
+        x = torch.randn(1, 3, 4, 4)
+        mean = torch.randn(3)
+        var = torch.abs(torch.randn(3))
+
+        device = getattr(self, "device", "cpu")
+        mod.to(device)
+        x = x.to(device)
+        mean = mean.to(device)
+        var = var.to(device)
+
+        opt = torch.compile(mod, backend="inductor")
+        opt(x, mean, var)
+
+    @tf32_on_and_off(0.003)
+    @inductor_config.patch({"efficient_conv_bn_eval_fx_passes": True})
+    def test_fx_graph_batch_norm_defaults(self):
+        """Regression test for issue #169011.
+
+        Tests that torch.compile handles FX graphs containing F.batch_norm
+        with only 3 positional arguments (input, running_mean, running_var).
+        The original bug was an AssertionError: assert len(bn_node.args) == 8.
+        """
+        from torch.fx import Graph, GraphModule
+
+        graph = Graph()
+
+        # Create input placeholders
+        inp = graph.placeholder("input")
+        mean = graph.placeholder("mean")
+        var = graph.placeholder("var")
+
+        # Create F.batch_norm call with only 3 args (the original bug case)
+        z = graph.call_function(torch.nn.functional.batch_norm, args=(inp, mean, var))
+
+        # Create output node
+        graph.output(z)
+
+        # Wrap in a GraphModule
+        gm = GraphModule({}, graph)
+
+        device = getattr(self, "device", "cpu")
+        gm.to(device)
+        gm_compiled = torch.compile(gm, backend="inductor")
+
+        inp_tensor = torch.randn(4, 4, device=device)
+        mean_tensor = torch.randn(4, device=device)
+        var_tensor = torch.abs(torch.randn(4, device=device))  # Must be positive
+
+        # This should not raise AssertionError
+        out = gm_compiled(inp_tensor, mean_tensor, var_tensor)
+
+        # Verify result matches eager evaluation
+        expected = gm(inp_tensor, mean_tensor, var_tensor)
+        self.assertEqual(out, expected)
+
     @tf32_on_and_off(0.003)
     @inductor_config.patch({"efficient_conv_bn_eval_fx_passes": True})
     @functorch_config.patch({"enable_autograd_cache": False})
diff --git a/torch/_inductor/fx_passes/efficient_conv_bn_eval.py b/torch/_inductor/fx_passes/efficient_conv_bn_eval.py
@@ -1,4 +1,6 @@
 # mypy: allow-untyped-defs
+import inspect
+
 import torch
 import torch.nn as nn
 from torch._dynamo.utils import counters
@@ -14,6 +16,11 @@
 from .pre_grad import efficient_conv_bn_eval_pass
 
 
+# Cache the signature of F.batch_norm once at import time so the graph
+# transform below does not re-run inspect.signature each time it is invoked.
+_BATCH_NORM_SIGNATURE = inspect.signature(torch.nn.functional.batch_norm)
+
+
 def efficient_conv_bn_eval(
     bn: nn.modules.batchnorm._BatchNorm, conv: nn.modules.conv._ConvNd, x: torch.Tensor
 ):
@@ -146,17 +153,37 @@ def efficient_conv_bn_eval_decomposed(
     and inductor_config.efficient_conv_bn_eval_fx_passes,
 )
 def efficient_conv_bn_eval_graph_transform_inlined(match: Match, *args, **kwargs):
+    """
+    Graph transformation pass for fusing F.batch_norm with preceding conv operations.
+
+    F.batch_norm calls that omit optional arguments are handled by binding the
+    node's args/kwargs to the F.batch_norm signature and applying defaults. The
+    batch norm is fused with the preceding convolution for efficient evaluation.
+    """
     bn_node = match.nodes[0]
     graph = match.graph
-    assert len(bn_node.args) == 8
+
+    # Normalize arguments by binding them to the cached signature and applying
+    # defaults; this covers calls that omit trailing args or use kwargs.
+    bound_args = _BATCH_NORM_SIGNATURE.bind(*bn_node.args, **bn_node.kwargs)
+    bound_args.apply_defaults()
+    # Use bound_args.args instead of mutating bn_node.args
+    normalized_args = bound_args.args
 
     # We can only use efficient conv-bn for eval mode with track_running_stats
-    # bn_node.args is `training`
-    if bn_node.args[-3]:
+    # normalized_args[5] is the "training" argument
+    training_arg = normalized_args[5]
+
+    # Safety check: if 'training' is a symbolic Node (from tracing/export),
+    # we cannot optimize since we don't know the value at compile time.
+    if isinstance(training_arg, torch.fx.Node):
+        return
+
+    if training_arg:
         return
 
     # Check if the input is Conv
-    input_node = bn_node.args[0]
+    input_node = normalized_args[0]
 
     if input_node.op != "call_function":  # type: ignore[union-attr]
         return
@@ -184,11 +211,11 @@ def efficient_conv_bn_eval_graph_transform_inlined(match: Match, *args, **kwargs
 
     with graph.inserting_before(bn_node):
         # prepare args for the fused function
-        bn_running_mean = bn_node.args[1]
-        bn_running_var = bn_node.args[2]
-        bn_weight = bn_node.args[3]
-        bn_bias = bn_node.args[4]
-        bn_eps = bn_node.args[7]
+        bn_running_mean = normalized_args[1]
+        bn_running_var = normalized_args[2]
+        bn_weight = normalized_args[3]
+        bn_bias = normalized_args[4]
+        bn_eps = normalized_args[7]
         assert len(conv_node.args) >= 2  # type: ignore[union-attr]
         conv_input = conv_node.args[0]  # type: ignore[union-attr]
         conv_weight = conv_node.args[1]  # type: ignore[union-attr]