[inductor] Fix index_reduce_ on view inputs raising AssertionError in assert_functional_graph (#176606)

aorenste · pytorchmergebot · commit 354b0ff88ca5 · 2026-03-06T16:03:36.000Z
The `_index_fill` decomposition restored strides with a mutating `torch.empty_like` + `copy_` sequence whenever `index_copy` returned a contiguous tensor, which broke the functional graph invariant (the AssertionError raised in `assert_functional_graph`). Replace it with the functional `prims.copy_strided` prim, which does the same thing as a single op. Fixes #144846. Authored with Claude. Pull Request resolved: #176606. Approved by: https://github.com/Lucaskabela
diff --git a/test/inductor/pallas_expected_failures/CpuTests.test_index_reduce_on_view_input_cpu b/test/inductor/pallas_expected_failures/CpuTests.test_index_reduce_on_view_input_cpu
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
@@ -15912,6 +15912,19 @@ def run_session(x_param, y_param, size, device):
             out2 = run_session(100, 16, 64, self.device)
             self.assertEqual(out2.device.type, self.device)
 
+    def test_index_reduce_on_view_input(self):
+        # Regression test for https://github.com/pytorch/pytorch/issues/144846 (index_reduce_ on a view input).
+        def fn(x, index, source):
+            return x.index_reduce_(2, index, source, "mean", include_self=False)
+
+        x_base = torch.randn(4, 34, 64, device=self.device)
+        index = torch.randint(0, 34, (64,), device=self.device)
+        source = torch.randn(4, 32, 64, device=self.device)
+
+        expected = fn(x_base.clone()[:, 2:, :], index, source)  # eager reference; fresh sliced clone
+        result = torch.compile(fn)(x_base.clone()[:, 2:, :], index, source)  # compiled; own sliced clone
+        self.assertEqual(result, expected)
+
     # end of class CommonTemplate - add new tests here
 
 
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
@@ -4319,9 +4319,7 @@ def _index_fill(
             out = out.squeeze(0).clone()
         # index_fill preserves the strides. index_copy always returns contiguous tensors
         if out.stride() != x.stride():
-            new_out = torch.empty_like(x)
-            new_out.copy_(out)
-            out = new_out
+            out = prims.copy_strided(out, x.stride())
         return out