Skip to content

Commit 9a7ae22

Browse files
ColinPeppler authored and pytorchmergebot committed
Support negative index slicing with backed symints (#177308)
Pull Request resolved: #177308 Approved by: https://github.com/laithsakka ghstack dependencies: #175819
1 parent 79184f4 commit 9a7ae22

6 files changed

Lines changed: 63 additions & 2 deletions

File tree

test/inductor/test_aot_inductor.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1939,6 +1939,41 @@ def forward(self, x, y):
19391939
shifts = torch.arange(0, 64, 8, device=x.device, dtype=torch.int64)
19401940
return (expanded >> shifts) & 255
19411941

1942+
torch.cuda.caching_allocator_enable(False)
1943+
model = Repro()
1944+
example_inputs = (
1945+
torch.randint(
1946+
0, 256, (200, INNER_DIM), device=self.device, dtype=torch.int64
1947+
),
1948+
torch.randn(50, 8, device=self.device),
1949+
)
1950+
spec = {
1951+
"x": (Dim.DYNAMIC, Dim.STATIC),
1952+
"y": (Dim.DYNAMIC, Dim.STATIC),
1953+
}
1954+
self.check_model(model, example_inputs, dynamic_shapes=spec)
1955+
torch.cuda.caching_allocator_enable(True)
1956+
1957+
@skipIfMPS
1958+
@config.patch({"triton.autotune_at_compile_time": None})
1959+
@torch.fx.experimental._config.patch("backed_size_oblivious", True)
1960+
def test_slice_negative_index_backed_symints_no_unbacked(self):
1961+
# x[-s1:] where x.size(0) = s0-1 should produce Max(s0-1 - s1, 0),
1962+
# not an unbacked symint with a bad fallback value.
1963+
if self.device != GPU_TYPE:
1964+
raise unittest.SkipTest("requires triton")
1965+
1966+
INNER_DIM = 4224
1967+
1968+
class Repro(torch.nn.Module):
1969+
def forward(self, x, y):
1970+
x_trimmed = x[:-1]
1971+
sliced = x_trimmed[-y.size(0) :]
1972+
reshaped = sliced.reshape(-1, 128, 33)
1973+
expanded = reshaped.unsqueeze(3).expand(-1, 128, 33, 8)
1974+
shifts = torch.arange(0, 64, 8, device=x.device, dtype=torch.int64)
1975+
return (expanded >> shifts) & 255
1976+
19421977
torch.cuda.caching_allocator_enable(False)
19431978
try:
19441979
model = Repro()

torch/_inductor/ir.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3643,6 +3643,11 @@ def loader(idx: Sequence[Expr]) -> OpsValue:
36433643

36443644

36453645
class SliceView(View):
3646+
"""View that represents a slice along a single dimension.
3647+
3648+
Corresponds to tensor[..., start:end:step, ...].
3649+
"""
3650+
36463651
@classmethod
36473652
def normalize_start_end(
36483653
cls, x: IRNode, dim: int, start: int, end: int
@@ -3657,6 +3662,14 @@ def normalize_start_end(
36573662
if any(free_unbacked_symbols(x) for x in (start, end, dim_size)):
36583663
min_func = sympy.Min
36593664
max_func = sympy.Max
3665+
elif any(
3666+
# Only needed when backed_size_oblivious is on.
3667+
x.has(sympy.Min, sympy.Max)
3668+
for x in (start, end, dim_size)
3669+
if isinstance(x, Expr)
3670+
):
3671+
min_func = sympy.Min
3672+
max_func = sympy.Max
36603673
else:
36613674
min_func = sizevars.evaluate_min
36623675
max_func = sizevars.evaluate_max

torch/_inductor/lowering.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1432,6 +1432,9 @@ def compute_slice_index(index, size, default=None):
14321432
elif fn(sympy.Ge(index, 0)):
14331433
# If index >= 0, the resolved index is at most min(index, size).
14341434
return sympy.Min(index, size)
1435+
elif fn(sympy.Lt(index, 0)):
1436+
# If index < 0, wrap and clamp: the resolved index is at least 0.
1437+
return sympy.Max(index + size, 0)
14351438
return None
14361439

14371440
start_index, end_index = None, None

torch/_inductor/sizevars.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,7 @@ def evaluate_min(self, left: Expr, right: Expr) -> Expr:
550550
return right
551551

552552
# Min/Max fallback: we can prove Min(a, b) <= c when any arg <= c, but
553-
# sympy doesn't simplify this yet. So, evaluate it here.
553+
# sympy doesn't simplify this yet. So, evaluate it here. Same for Max.
554554
for lhs, rhs in [(left, right), (right, left)]:
555555

556556
def le_rhs(a: Expr) -> bool:
@@ -559,6 +559,9 @@ def le_rhs(a: Expr) -> bool:
559559
# Min(Min(a, b), c) ==> Min(a, b) if (a <= c) or (b <= c).
560560
if isinstance(lhs, sympy.Min) and any(le_rhs(a) for a in lhs.args):
561561
return lhs
562+
# Min(Max(a, b), c) ==> Max(a, b) if (a <= c) and (b <= c).
563+
if isinstance(lhs, sympy.Max) and all(le_rhs(a) for a in lhs.args):
564+
return lhs
562565

563566
raise TypeError(
564567
f"evaluate_min({left}, {right}) with unbacked symints"

torch/_subclasses/fake_impls.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -963,6 +963,8 @@ def _compute_slice_index(size: IntLikeType, index: IntLikeType) -> IntLikeType |
963963
return size
964964
elif guard_or_false(index >= 0):
965965
return torch.sym_min(index, size)
966+
elif guard_or_false(index < 0):
967+
return torch.sym_max(index + size, 0)
966968

967969
return None
968970

@@ -1008,6 +1010,12 @@ def slice_forward(
10081010
new_size = (end_index - start_index + step - 1) // step
10091011
elif guard_or_false(start_index >= end_index):
10101012
new_size = 0
1013+
else:
1014+
# Both indices are resolved but we can't statically determine their
1015+
# ordering (e.g., when they involve Min/Max). Compute the size via
1016+
# max(end - start, 0) to avoid creating an unbacked symint.
1017+
diff = torch.sym_max(end_index - start_index, 0)
1018+
new_size = (diff + step - 1) // step
10111019

10121020
# create unbacked if case unknown
10131021
if new_size is None:

torch/testing/_internal/common_ops_unbacked.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,6 @@ def skip(op_name, variant_name="", *, device_type=None, dtypes=None):
153153
xfail("nn.functional.fractional_max_pool2d"),
154154
xfail("nn.functional.fractional_max_pool3d"),
155155
xfail("nn.functional.gaussian_nll_loss"),
156-
xfail("nn.functional.glu"),
157156
xfail("nn.functional.grid_sample"),
158157
xfail("nn.functional.group_norm"),
159158
xfail("nn.functional.huber_loss"),

0 commit comments

Comments
 (0)