Skip to content

Commit bf52d57

Browse files
Mark Saroufim and pytorchmergebot
authored and committed
torch.save/load torch.compiled models (#97565)
Opening this so I can discuss with @albanD I built a proof of concept of an in place API for an nn.Module that allows us to save and load a torch.compiled model with no issues https://github.com/msaroufim/mlsys-experiments/blob/main/save-compiled-model.py So users can run `model.compile()` and then run `torch.save(model, "model.pt")` and `torch.load(model, "model.pt")` with no issues unlike the rather strange current suggestion we give to users which is `opt_mod = torch.compile(mod); torch.save(mod, "model.pt")` Right now I'm trying to extend this to work for nn.modules more generally TODO: Failing tests * [x] torch.jit.load -> issue was because of aliasing `__call__` to `_call_impl`, _call_impl used to be skipped when now it no longer is so expanded the skip check. I added an explicit `torch.jit.load()` test now which @davidberard98 suggested * [x] functorch seems to be a flake - ran locally and it worked `pytest functorch/test_eager_transforms.py` * [x] a test infra flake - `test_testing.py::TestImports::test_no_mutate_global_logging_on_import_path_functorch` * [x] It seems like I broke inlining in dynamo though `python -m pytest test/dynamo/test_dynamic_shapes.py -k test_issue175` chatting with Voz about it but still not entirely sure how to fix - found a workaround after chatting with @yanboliang * [x] `pytest test/dynamo/test_modules.py` and `test/dynamo/test_dynamic_shapes` `test/dynamo/test_misc.py` seem to be failing in CI but trying it out locally they all pass tests passed with 0 failures * [x] `pytest test/profiler/test_profiler_tree.py` these tests have ProfilerTrees explicitly printed and will now break if __call__ is not in tree - ran with `EXPECT_ACCEPT=1` * [x] `pytest test/test_torch.py::TestTorch::test_typed_storage_deprecation_warning` a flake, ran this locally and it works fine * [x] I reverted my changes to `_dynamo/nn_module.py` since it looks like @wconstab is now directly handling `_call_impl` there but this is triggering an infinite
inlining which is crashing * [x] Tried out to instead override `__call__`, python doesnt like this though #97565 (comment) Pull Request resolved: #97565 Approved by: https://github.com/aaronenyeshi, https://github.com/albanD, https://github.com/voznesenskym
1 parent 2f95380 commit bf52d57

6 files changed

Lines changed: 204 additions & 86 deletions

File tree

test/dynamo/test_compile.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Owner(s): ["module: dynamo"]
2+
3+
import os
4+
import tempfile
5+
import unittest
6+
7+
import torch
8+
from torch._dynamo.testing import CompileCounter
9+
10+
11+
class ToyModel(torch.nn.Module):
    """Minimal Linear+ReLU module used to exercise in-place compilation."""

    def __init__(self):
        super().__init__()
        # Attribute names are load-bearing: the state_dict round-trip test
        # expects keys "linear.*".
        self.linear = torch.nn.Linear(10, 10)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        hidden = self.linear(x)
        return self.relu(hidden)
19+
20+
21+
class InPlaceCompilationTests(unittest.TestCase):
    """Tests for the in-place ``nn.Module.compile()`` API and serialization.

    Each test resets dynamo first so compile counters and caches from other
    tests cannot leak in.
    """

    def test_compilation(self):
        """model.compile() routes the forward through the given backend once."""
        torch._dynamo.reset()
        model = ToyModel()
        cnt = CompileCounter()
        model.compile(backend=cnt)
        x = torch.randn(10, 10)
        model(x)
        self.assertEqual(cnt.frame_count, 1)

    def test_overwrite_call_impl(self):
        """compile() installs _compiled_call_impl on the module."""
        torch._dynamo.reset()
        model = ToyModel()
        # assertIsNone/assertIsNotNone give clearer failure messages than
        # assertTrue(x is None) and report the offending value on failure.
        self.assertIsNone(model._compiled_call_impl)
        model.compile()
        self.assertIsNotNone(model._compiled_call_impl)

    def test_save(self):
        """torch.save/torch.load round-trips a compiled module."""
        torch._dynamo.reset()
        model = ToyModel()
        model.compile()
        model(torch.randn(1, 10))

        with tempfile.TemporaryDirectory() as tmpdirname:
            path = os.path.join(tmpdirname, "model.pt")
            torch.save(model, path)
            loaded_model = torch.load(path)
            # Loaded module must still be callable (compiled wrapper is
            # stripped by __getstate__, plain forward remains).
            loaded_model(torch.randn(1, 10))

    def test_state_dict_save(self):
        """state_dict of a compiled module loads into a fresh module."""
        torch._dynamo.reset()
        model = ToyModel()
        model.compile()
        model(torch.randn(1, 10))
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = os.path.join(tmpdirname, "model.pt")
            torch.save(model.state_dict(), path)
            loaded_model = ToyModel()
            loaded_model.load_state_dict(torch.load(path))
            loaded_model(torch.randn(1, 10))

    def test_jit_save(self):
        """A compiled module can still be scripted, jit-saved, and reloaded."""
        torch._dynamo.reset()
        model = ToyModel()
        model.compile()
        model(torch.randn(1, 10))
        scripted_model = torch.jit.script(model)
        with tempfile.TemporaryDirectory() as tmpdirname:
            path = os.path.join(tmpdirname, "model.pt")
            torch.jit.save(scripted_model, path)
            loaded_model = torch.jit.load(path)
            loaded_model(torch.randn(1, 10))

test/profiler/test_profiler_tree.py

Lines changed: 88 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,10 @@ def test_profiler_experimental_tree(self):
292292
autograd::engine::evaluate_function: torch::autograd::AccumulateGrad
293293
torch::autograd::AccumulateGrad
294294
aten::detach
295-
detach"""
295+
detach
296+
cudaGetDeviceCount
297+
cudaGetDeviceCount
298+
cudaGetDeviceProperties"""
296299
)
297300

298301
@ProfilerTree.test
@@ -542,87 +545,95 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
542545
aten::empty
543546
aten::fill_
544547
nn.Module: MyModule_0
545-
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
546-
test_profiler_tree.py(...): forward
547-
nn.Module: ReLU_0
548-
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
549-
torch/nn/modules/activation.py(...): forward
550-
torch/nn/functional.py(...): relu
551-
<built-in function _has_torch_function_unary>
552-
<built-in method relu of type object at 0xXXXXXXXXXXXX>
553-
aten::relu
554-
aten::clamp_min
555-
nn.Module: Linear_0
556-
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
557-
torch/nn/modules/linear.py(...): forward
558-
torch/nn/modules/module.py(...): __getattr__
559-
torch/nn/modules/module.py(...): __getattr__
560-
<built-in function linear>
561-
aten::linear
562-
aten::t
563-
aten::transpose
564-
aten::as_strided
565-
aten::matmul
566-
aten::unsqueeze
567-
aten::as_strided
568-
aten::mm
569-
aten::resolve_conj
570-
aten::resolve_conj
571-
aten::resolve_conj
572-
aten::squeeze_
573-
aten::as_strided_
574-
aten::add_
575-
nn.Module: ReLU_1
576-
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
577-
torch/nn/modules/activation.py(...): forward
578-
torch/nn/functional.py(...): relu
579-
<built-in function _has_torch_function_unary>
580-
<built-in method relu of type object at 0xXXXXXXXXXXXX>
581-
aten::relu
582-
aten::clamp_min
548+
torch/nn/modules/module.py(...): _call_impl
549+
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
550+
test_profiler_tree.py(...): forward
551+
nn.Module: ReLU_0
552+
torch/nn/modules/module.py(...): _call_impl
553+
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
554+
torch/nn/modules/activation.py(...): forward
555+
torch/nn/functional.py(...): relu
556+
<built-in function _has_torch_function_unary>
557+
<built-in method relu of type object at 0xXXXXXXXXXXXX>
558+
aten::relu
559+
aten::clamp_min
560+
nn.Module: Linear_0
561+
torch/nn/modules/module.py(...): _call_impl
562+
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
563+
torch/nn/modules/linear.py(...): forward
564+
torch/nn/modules/module.py(...): __getattr__
565+
torch/nn/modules/module.py(...): __getattr__
566+
<built-in function linear>
567+
aten::linear
568+
aten::t
569+
aten::transpose
570+
aten::as_strided
571+
aten::matmul
572+
aten::unsqueeze
573+
aten::as_strided
574+
aten::mm
575+
aten::resolve_conj
576+
aten::resolve_conj
577+
aten::resolve_conj
578+
aten::squeeze_
579+
aten::as_strided_
580+
aten::add_
581+
nn.Module: ReLU_1
582+
torch/nn/modules/module.py(...): _call_impl
583+
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
584+
torch/nn/modules/activation.py(...): forward
585+
torch/nn/functional.py(...): relu
586+
<built-in function _has_torch_function_unary>
587+
<built-in method relu of type object at 0xXXXXXXXXXXXX>
588+
aten::relu
589+
aten::clamp_min
583590
<built-in method ones of type object at 0xXXXXXXXXXXXX>
584591
aten::ones
585592
aten::empty
586593
aten::fill_
587594
nn.Module: MyModule_0
588-
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
589-
test_profiler_tree.py(...): forward
590-
nn.Module: ReLU_0
591-
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
592-
torch/nn/modules/activation.py(...): forward
593-
torch/nn/functional.py(...): relu
594-
<built-in function _has_torch_function_unary>
595-
<built-in method relu of type object at 0xXXXXXXXXXXXX>
596-
aten::relu
597-
aten::clamp_min
598-
nn.Module: Linear_0
599-
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
600-
torch/nn/modules/linear.py(...): forward
601-
torch/nn/modules/module.py(...): __getattr__
602-
torch/nn/modules/module.py(...): __getattr__
603-
<built-in function linear>
604-
aten::linear
605-
aten::t
606-
aten::transpose
607-
aten::as_strided
608-
aten::matmul
609-
aten::unsqueeze
610-
aten::as_strided
611-
aten::mm
612-
aten::resolve_conj
613-
aten::resolve_conj
614-
aten::resolve_conj
615-
aten::squeeze_
616-
aten::as_strided_
617-
aten::add_
618-
nn.Module: ReLU_1
619-
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
620-
torch/nn/modules/activation.py(...): forward
621-
torch/nn/functional.py(...): relu
622-
<built-in function _has_torch_function_unary>
623-
<built-in method relu of type object at 0xXXXXXXXXXXXX>
624-
aten::relu
625-
aten::clamp_min
595+
torch/nn/modules/module.py(...): _call_impl
596+
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
597+
test_profiler_tree.py(...): forward
598+
nn.Module: ReLU_0
599+
torch/nn/modules/module.py(...): _call_impl
600+
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
601+
torch/nn/modules/activation.py(...): forward
602+
torch/nn/functional.py(...): relu
603+
<built-in function _has_torch_function_unary>
604+
<built-in method relu of type object at 0xXXXXXXXXXXXX>
605+
aten::relu
606+
aten::clamp_min
607+
nn.Module: Linear_0
608+
torch/nn/modules/module.py(...): _call_impl
609+
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
610+
torch/nn/modules/linear.py(...): forward
611+
torch/nn/modules/module.py(...): __getattr__
612+
torch/nn/modules/module.py(...): __getattr__
613+
<built-in function linear>
614+
aten::linear
615+
aten::t
616+
aten::transpose
617+
aten::as_strided
618+
aten::matmul
619+
aten::unsqueeze
620+
aten::as_strided
621+
aten::mm
622+
aten::resolve_conj
623+
aten::resolve_conj
624+
aten::resolve_conj
625+
aten::squeeze_
626+
aten::as_strided_
627+
aten::add_
628+
nn.Module: ReLU_1
629+
torch/nn/modules/module.py(...): _call_impl
630+
<built-in method _get_tracing_state of PyCapsule object at 0xXXXXXXXXXXXX>
631+
torch/nn/modules/activation.py(...): forward
632+
torch/nn/functional.py(...): relu
633+
<built-in function _has_torch_function_unary>
634+
<built-in method relu of type object at 0xXXXXXXXXXXXX>
635+
aten::relu
636+
aten::clamp_min
626637
torch/profiler/profiler.py(...): __exit__
627638
torch/profiler/profiler.py(...): stop
628639
..."""

torch/_dynamo/eval_frame.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,9 @@ def __call__(self, fn):
246246
filename = None
247247
if (
248248
(filename is None or skipfiles.check(filename))
249-
and (getattr(fn, "__name__", "") != "_call_impl")
249+
and (
250+
getattr(fn, "__name__", "") not in ["_call_impl", "_wrapped_call_impl"]
251+
)
250252
and filename not in DONT_WRAP_FILES
251253
):
252254
# call to a builtin without a frame for us to capture

torch/_dynamo/variables/nn_module.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -318,15 +318,14 @@ def call_function(
318318
# the call_wrapped currently, and maybe other issues too
319319
fn = mod.forward
320320
else:
321-
fn = mod.__call__
321+
fn = mod._call_impl
322322
fn_source = AttrSource(self.source, "__call__")
323323
if istype(fn, types.MethodType):
324324
fn = fn.__func__
325325
fn_source = AttrSource(fn_source, "__func__")
326326
args = [self] + args
327327
else:
328328
assert istype(fn, types.FunctionType)
329-
330329
options["source"] = fn_source
331330
return tx.inline_user_function_return(
332331
variables.UserFunctionVariable(fn, **options),
@@ -374,7 +373,7 @@ def generic_call_method_helper(name):
374373
**options,
375374
)
376375

377-
if name == "_call_impl":
376+
if name in ["_call_impl", "_wrapped_call_impl"]:
378377
# Example: `self.layer.__call__(x)`
379378
# This is used for explicit calling `__call__` in a forward function.
380379
# Dynamo inlines `__call__`, includes hooks.
@@ -683,14 +682,12 @@ def call_function(
683682
) -> "VariableTracker":
684683
options = VariableTracker.propagate(self, args, kwargs.values())
685684
mod = self.value
686-
687685
# see comment on lazy module handling in NNModuleVariable.call_function for context
688686
if is_lazy_module(mod):
689687
if mod.cls_to_become is not None:
690688
self.value_type = mod.cls_to_become
691689
initialize_lazy_module(tx, mod, args, kwargs)
692-
693-
name = "__call__"
690+
name = "_call_impl"
694691
fn = getattr(self.value_type, name)
695692
if self.source:
696693
source = AttrSource(AttrSource(self.source, "__class__"), name)
@@ -711,6 +708,16 @@ def call_method(
711708
from .builder import VariableBuilder
712709

713710
options = VariableTracker.propagate(self, args, kwargs.values())
711+
if name in ["_call_impl", "_wrapped_call_impl"]:
712+
fn = getattr(self.value_type, name)
713+
if self.source:
714+
source = AttrSource(AttrSource(self.source, "__class__"), name)
715+
else:
716+
source = None
717+
718+
return variables.UserFunctionVariable(
719+
fn, source=source, **options
720+
).call_function(tx, [self] + list(args), kwargs)
714721

715722
if name not in getattr(self.value, "__dict__", {}):
716723
try:

torch/jit/_script.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -944,7 +944,7 @@ def fail(self, *args, **kwargs):
944944
return fail
945945

946946
for name, method in _get_methods(torch.nn.Module):
947-
if name.startswith("__"):
947+
if name.startswith("__") or name.endswith("_call_impl"):
948948
continue
949949
if (
950950
name not in RecursiveScriptModule.__dict__

torch/nn/modules/module.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -433,11 +433,15 @@ def forward(self, x):
433433
_load_state_dict_post_hooks: Dict[int, Callable]
434434
_modules: Dict[str, Optional['Module']]
435435
call_super_init: bool = False
436+
_compiled_call_impl : Optional[Callable] = None
437+
438+
436439

437440
def __init__(self, *args, **kwargs) -> None:
438441
"""
439442
Initializes internal Module state, shared by both nn.Module and ScriptModule.
440443
"""
444+
441445
torch._C._log_api_usage_once("python.nn_module")
442446

443447
# Backward compatibility: no args used to be allowed when call_super_init=False
@@ -1491,6 +1495,12 @@ def _slow_forward(self, *input, **kwargs):
14911495
tracing_state.pop_scope()
14921496
return result
14931497

1498+
def _wrapped_call_impl(self, *args, **kwargs):
1499+
if self._compiled_call_impl is not None:
1500+
return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1501+
else:
1502+
return self._call_impl(*args, **kwargs)
1503+
14941504
def _call_impl(self, *args, **kwargs):
14951505
forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.forward)
14961506
# If we don't have any hooks, we want to skip the rest of the logic in
@@ -1572,10 +1582,16 @@ def _call_impl(self, *args, **kwargs):
15721582

15731583
return result
15741584

1575-
__call__ : Callable[..., Any] = _call_impl
1585+
__call__ : Callable[..., Any] = _wrapped_call_impl
1586+
1587+
def __getstate__(self):
1588+
state = self.__dict__.copy()
1589+
state.pop("_compiled_call_impl", None)
1590+
return state
15761591

15771592
def __setstate__(self, state):
15781593
self.__dict__.update(state)
1594+
15791595
# Support loading old checkpoints that don't have the following attrs:
15801596
if '_forward_pre_hooks' not in self.__dict__:
15811597
self._forward_pre_hooks = OrderedDict()
@@ -2420,3 +2436,14 @@ def _replicate_for_data_parallel(self):
24202436
replica._is_replica = True # type: ignore[assignment]
24212437

24222438
return replica
2439+
2440+
def compile(self, *args, **kwargs):
    """
    Compile this Module's forward pass in place via :func:`torch.compile`.

    All positional and keyword arguments are forwarded unchanged to
    :func:`torch.compile`; the resulting callable is stored on the module
    and used by ``__call__`` from then on.

    See :func:`torch.compile` for details on the accepted arguments.
    """
    compiled_forward = torch.compile(self._call_impl, *args, **kwargs)
    self._compiled_call_impl = compiled_forward

0 commit comments

Comments
 (0)