Commit 30d8b30

bobrenjc93 authored and pytorchmergebot committed
refactor tensorify restart logic to use sources (#141517)
Differential Revision: [D67066706](https://our.internmc.facebook.com/intern/diff/D67066706)

Pull Request resolved: #141517
Approved by: https://github.com/ezyang
1 parent bdbdbee · commit 30d8b30

4 files changed: 35 additions & 77 deletions
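For orientation before the per-file diffs: the old implementation keyed restart-time specialization decisions on symfloat names (e.g. "zf0"), which only works while symbols keep consistent names across restarts (an invariant the old code called out explicitly); the new implementation keys on Dynamo sources, which identify where an input comes from and are stable by construction. A minimal sketch of the difference, using a hypothetical LocalSource stand-in rather than the real torch._guards.Source:

from dataclasses import dataclass

# Hypothetical stand-in for a Dynamo source; frozen => hashable and
# compares by value, so it means the same thing on every restart.
@dataclass(frozen=True)
class LocalSource:
    local_name: str

def mint_symbol_names(float_sources):
    # Symbol names are minted in encounter order ("zf0", "zf1", ...),
    # so they only line up across restarts if that order never changes.
    return {f"zf{i}": src for i, src in enumerate(float_sources)}

run1 = mint_symbol_names([LocalSource("x"), LocalSource("y")])
run2 = mint_symbol_names([LocalSource("y"), LocalSource("x")])
assert run1["zf0"] != run2["zf0"]            # name-keyed state goes stale
assert LocalSource("x") == LocalSource("x")  # source-keyed state survives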


torch/_dynamo/output_graph.py

Lines changed: 1 addition & 43 deletions
@@ -40,16 +40,10 @@
     Source,
     TracingContext,
 )
-from torch._subclasses.fake_tensor import FakeTensor
 from torch._utils_internal import signpost_event
 from torch.fx._lazy_graph_module import _make_graph_module  # type: ignore[attr-defined]
 from torch.fx.experimental._backward_state import BackwardState
-from torch.fx.experimental.symbolic_shapes import (
-    free_symbols,
-    guard_scalar,
-    is_symbolic,
-    ShapeEnv,
-)
+from torch.fx.experimental.symbolic_shapes import free_symbols, is_symbolic, ShapeEnv
 from torch.fx.passes.runtime_assert import insert_deferred_runtime_asserts
 from torch.utils._python_dispatch import is_traceable_wrapper_subclass

@@ -1343,8 +1337,6 @@ def compile_and_call_fx_graph(self, tx, rv, root, replaced_outputs):
         ncalls = count_calls(self.graph)
         counters["stats"]["calls_captured"] += ncalls

-        self.remove_tensorify_specialized_graphargs()
-
         # free a bit of memory
         self.real_value_cache.clear()

@@ -1681,40 +1673,6 @@ def update_used_symbols(used_symbols, fake: Union[torch.SymInt, torch.Tensor]):
                 # Make sure we delete later occurrences of the same symbol
                 used_symbols.remove(symbol)

-    def remove_tensorify_specialized_graphargs(self) -> None:
-        # This is a pretty interesting function. Basically we have this problem
-        # where our compiler tends to choke when we have unused inputs. The way
-        # we support dynamic float arguments is by doing a joint fx pass and
-        # tensorifying away as many symfloats as we can. For the remaining symfloats
-        # we have no choice but to specialize... HOWEVER at that point in time
-        # we can no longer remove graph inputs. So our sledgehammer solution is to
-        # save the state of what inputs we should have specialized in dynamo and
-        # restart analysis. This function incorporates this "view from the future"
-        # state and specializes inputs that we know we won't be able to tensorify
-        # away in the joint pass. In principle we shouldn't choke on unused inputs
-        # and so this shouldn't be necessary. In practice CUDA graphs choke on
-        # unused inputs so we need this for now.
-
-        # Import here to prevent circular import
-        from torch._dynamo.symbolic_convert import TensorifyState
-
-        for node in self.graph.nodes:
-            example_value = node.meta.get("example_value")
-            if (
-                isinstance(example_value, FakeTensor)
-                and example_value.item_memo is not None
-                and hasattr(example_value.item_memo.node._expr, "name")
-                and all(u.target == "item" for u in node.users)
-                and TensorifyState.should_specialize(
-                    # We use _expr instead of expr b/c we want the symbol not the replacement
-                    example_value.item_memo.node._expr.name
-                )
-            ):
-                for u in list(node.users):
-                    u.replace_all_uses_with(guard_scalar(example_value.item_memo))
-                    self.remove_node(u)
-                self.remove_node(node)
-
     def add_output_instructions(self, prefix: List[Instruction]) -> None:
         """
         We call this on the creation of a new compiled subgraph that is inserted
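The method removed above was the old integration point: after the joint pass recorded which symfloats it could not tensorify away, this code rewrote the finished graph in place. With source-keyed state the decision is consulted before the graph is built (see the wrap_symfloat change below), so the rewrite step disappears. A rough sketch of the save-state-and-restart protocol the removed comment describes, with hypothetical names and toy logic:

class RestartAnalysis(Exception):
    """Discard the current trace and re-trace the frame from scratch."""

force_specializations: set = set()  # stands in for TensorifyState

def trace_and_compile(frame):
    # Toy stand-in: floats whose sources were flagged on a previous
    # attempt are treated as baked-in constants; any remaining dynamic
    # floats are flagged now and we ask for a restart.
    remaining = frame["float_sources"] - force_specializations
    if remaining:
        force_specializations.update(remaining)  # "view from the future"
        raise RestartAnalysis
    return f"compiled; specialized {sorted(force_specializations)}"

def compile_frame(frame):
    while True:
        try:
            return trace_and_compile(frame)
        except RestartAnalysis:
            continue  # the re-trace sees the recorded state up front

print(compile_frame({"float_sources": {"L['lr']"}}))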

torch/_dynamo/symbolic_convert.py

Lines changed: 5 additions & 6 deletions
@@ -247,17 +247,16 @@ class DistributedState:


 class TensorifyState:
-    # These are the set of string symfloats names (eg. "zf0") that we collect
-    # from the tensorify_python_scalars.py joint fx pass to inform us about
-    # which float inputs we should specialize when we restart analysis.
-    force_specializations: Set[str] = set()
+    # These are the set of source that we collect from the tensorify_python_scalars.py joint
+    # fx pass to inform us about which float inputs we should specialize when we restart analysis.
+    force_specializations: Set[Source] = set()

     @classmethod
-    def specialize(cls, index: str) -> None:
+    def specialize(cls, index: Source) -> None:
         cls.force_specializations.add(index)

     @classmethod
-    def should_specialize(cls, index: str) -> bool:
+    def should_specialize(cls, index: Source) -> bool:
         return index in cls.force_specializations

     @classmethod
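A quick round-trip of the new API. The only requirement this diff places on Source is that it be hashable and compare equal across restarts, since force_specializations is now a Set[Source]; the sketch below uses a hypothetical frozen dataclass in its place:

from dataclasses import dataclass
from typing import Set

@dataclass(frozen=True)  # frozen => hashable, value equality
class FakeSource:  # hypothetical stand-in for torch._guards.Source
    name: str

class TensorifyState:
    force_specializations: Set[FakeSource] = set()

    @classmethod
    def specialize(cls, index: FakeSource) -> None:
        cls.force_specializations.add(index)

    @classmethod
    def should_specialize(cls, index: FakeSource) -> bool:
        return index in cls.force_specializations

TensorifyState.specialize(FakeSource("L['lr']"))
# A freshly constructed source with the same identity still matches
# after a restart, which string symbol names could not guarantee:
assert TensorifyState.should_specialize(FakeSource("L['lr']"))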

torch/_dynamo/variables/builder.py

Lines changed: 4 additions & 0 deletions
@@ -1901,6 +1901,9 @@ def wrap_symint(self, value):
         return unspec_var

     def wrap_symfloat(self, value):
+        # To prevent circular import
+        from ..symbolic_convert import TensorifyState
+
         # SymFloat wrapping is special. We first wrap it in the same way we
         # do an unspecialized primitive, and then we item() it into a
         # SymFloat. Removal of the item() call is left to a later FX pass,

@@ -1932,6 +1935,7 @@ def wrap_symfloat(self, value):
             or torch._inductor.config.triton.cudagraphs
             or justknobs_check("pytorch/compiler:unspecialize_float_killswitch", False)
             or frame_state_entry.scalar is not auto_dynamic
+            or TensorifyState.should_specialize(self.source)
         ):
             self.install_guards(GuardBuilder.CONSTANT_MATCH)
             return ConstantVariable.create(value=value, source=self.source)
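Condensing the gate above: wrap_symfloat falls back to a CONSTANT_MATCH guard (the float stays a Python constant) if any disjunct fires, and the new disjunct consults the restart state by source. A simplified sketch, with illustrative parameter names standing in for the real config and frame state:

def float_should_specialize(
    source,
    cudagraphs_enabled: bool,
    killswitch: bool,
    frame_says_static: bool,
    force_specializations: set,
) -> bool:
    return (
        cudagraphs_enabled
        or killswitch
        or frame_says_static
        or source in force_specializations  # the new source-keyed check
    )

# First compile: nothing recorded yet, the float may stay dynamic.
assert not float_should_specialize("L['eps']", False, False, False, set())
# After the joint pass flags L['eps'] and analysis restarts:
assert float_should_specialize("L['eps']", False, False, False, {"L['eps']"})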

torch/fx/passes/_tensorify_python_scalars.py

Lines changed: 25 additions & 28 deletions
@@ -223,19 +223,25 @@ def _sympy_interp(expr: sympy.Expr) -> MetaProxy:
         val = node.meta.get("val")
         if isinstance(val, FakeTensor):
             for dim in val.shape:
-                if isinstance(dim, torch.SymInt):
-                    for s in dim.node.expr.free_symbols:
-                        name = str(s)
-                        if symbol_is_type(
-                            s, SymT.FLOAT
-                        ) and not TensorifyState.should_specialize(name):
-                            # In principle, we could support float input that
-                            # is used to do size compute. The problem is that
-                            # we don't actually want to tensorify the compute
-                            # in this case, which means we need codegen support for
-                            # all symfloats.
-                            TensorifyState.specialize(name)
-                            should_restart = True
+                if not isinstance(dim, torch.SymInt):
+                    continue
+
+                for symbol in dim.node.expr.free_symbols:
+                    if not symbol_is_type(symbol, SymT.FLOAT):
+                        continue
+
+                    sources = shape_env.var_to_sources.get(symbol)
+                    for source in sources:
+                        if TensorifyState.should_specialize(source):
+                            continue
+
+                        # In principle, we could support float input that
+                        # is used to do size compute. The problem is that
+                        # we don't actually want to tensorify the compute
+                        # in this case, which means we need codegen support
+                        # for all symfloats.
+                        TensorifyState.specialize(source)
+                        should_restart = True

         # Look for functions to convert
         if node.op == "call_function" and (

@@ -322,21 +328,12 @@ def _sympy_interp(expr: sympy.Expr) -> MetaProxy:
             node.replace_all_uses_with(guard_scalar(val))
             graph.erase_node(node)

-    # Sometimes by the time we get to tensorify, there have already been
-    # specializations, eg. in python_arg_parser.h. In these cases,
-    # placeholder nodes no longer have a reference to their original
-    # symfloat and thus we need to deduce specializations have happend
-    # via shape_env.replacements. NB: there's an important invariant here
-    # that symfloats keep consistent names across restarts.
-    for k, v in shape_env.var_to_val.items():
-        if symbol_is_type(k, SymT.FLOAT) and isinstance(v, sympy.core.numbers.Float):
-            name = str(k)
-            if (
-                not TensorifyState.should_specialize(name)
-                and k not in tensorified_symbols
-            ):
-                TensorifyState.specialize(name)
-                should_restart = True
+    for symbol, sources in shape_env.var_to_sources.items():
+        if symbol_is_type(symbol, SymT.FLOAT) and symbol not in tensorified_symbols:
+            for source in sources:
+                if not TensorifyState.should_specialize(source):
+                    TensorifyState.specialize(source)
+                    should_restart = True

     if should_restart:
         # Sledgehammer time. Restart dynamo analysis, keeping track of which input sources
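Both hunks follow the same shape: walk the float symbols (from FakeTensor shapes in the first hunk, from shape_env.var_to_sources in the second), map each symbol back to its sources, and flag any source not yet specialized. A toy version of that sweep, with simplified stand-ins for shape_env and symbol_is_type:

import sympy

zf0, s0 = sympy.Symbol("zf0"), sympy.Symbol("s0")
# Stand-in for shape_env.var_to_sources: symbol -> originating sources.
var_to_sources = {zf0: ["L['scale']"], s0: ["L['x'].size()[0]"]}
force_specializations: set = set()

def is_float_symbol(sym) -> bool:
    # Stand-in for symbol_is_type(sym, SymT.FLOAT).
    return sym.name.startswith("zf")

should_restart = False
shape_exprs = [s0 * 2, zf0 + 1]  # pretend these came from FakeTensor shapes
for expr in shape_exprs:
    for symbol in expr.free_symbols:
        if not is_float_symbol(symbol):
            continue
        for source in var_to_sources[symbol]:
            if source in force_specializations:
                continue
            force_specializations.add(source)
            should_restart = True

print(force_specializations, should_restart)  # {"L['scale']"} True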
