Skip to content

Commit d0ce1d1

Browse files
jjsjann123 authored and pytorchmergebot committed
Nvfuser guard patch
Fixes issue where CudaFusionGuard would return false on backward graph because `requires_grad` flag doesn't match. This is due to the fact that autodiff uses GradMode switch to turn on/off requires_grad, which is not taken into consideration by nvfuser guard. We verified the implementation under `TensorType::matchTensor`. - [x] Add python test to verify no fallback is observed Pull Request resolved: pytorch#75016 Approved by: https://github.com/eellison
1 parent dc71a8f commit d0ce1d1

2 files changed

Lines changed: 30 additions & 2 deletions

File tree

test/test_jit_cuda_fuser.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import torch
1212
from torch.nn import functional
13+
from torch.profiler import profile, ProfilerActivity
1314

1415
from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR # TEST_WITH_ROCM
1516
from torch.testing._internal.common_cuda import TEST_MULTIGPU
@@ -4273,6 +4274,29 @@ def reduce_scalar(temp):
42734274
reduce_scalar(res).backward()
42744275
torch._C._jit_set_nvfuser_guard_mode(old_guard)
42754276

4277+
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
                 "Requires fusion optimization pass to be effective")
def test_cuda_fusion_guard_backward(self):
    # Regression test for the nvfuser guard: the backward graph must not
    # fall back to the interpreter merely because autodiff toggles
    # GradMode (which makes `requires_grad` differ from the profiled type).
    old_guard = torch._C._jit_set_nvfuser_guard_mode(True)

    x = torch.randn(10, device="cuda", requires_grad=True)
    dy = torch.randn(10, device="cuda")

    def fn(t):
        y = t.cos().cos()
        return y

    fn_scripted = torch.jit.script(fn)

    # Run several iterations so profiling/optimization kick in, while
    # recording CPU-side events to inspect afterwards.
    with profile(activities=[ProfilerActivity.CPU]) as prof:
        for _ in range(5):
            x.grad = None
            result = fn_scripted(x)
            result.backward(dy)

    # A "fallback" entry in the profiler table would mean the fused
    # kernel was rejected by the guard — assert it never appears.
    self.assertEqual(prof.events().table().find("fallback"), -1)
    torch._C._jit_set_nvfuser_guard_mode(old_guard)
42764300

42774301
class TestPassManagerCudaFuser(JitTestCase):
42784302

torch/csrc/jit/codegen/cuda/interface.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,10 @@ bool skipNode(const std::string& symbol_str, bool flip) {
114114
//! implementation is actually more relaxed.
115115
//!
116116
//! Things that we check:
117-
//! a. identical rank & scalar type
117+
//! a. identical rank & scalar type & device & requires_grad
118+
//! note that: requires_grad is tricky! because autodiff might be marking
119+
//! gradMode to overwrite it. Look at TensorType::matchTensor
120+
//! for the check condition
118121
//! b. stride check:
119122
//! b.1. identical stride order
120123
//! b.2. identical contiguity
@@ -146,7 +149,8 @@ bool complyWith(
146149
(guard_tensor_type->device().has_value() &&
147150
(guard_tensor_type->device().value() != tensor.device())) ||
148151
(guard_tensor_type->requiresGrad().has_value() &&
149-
guard_tensor_type->requiresGrad().value() != tensor.requires_grad())) {
152+
guard_tensor_type->requiresGrad().value() !=
153+
(tensor.requires_grad() && at::GradMode::is_enabled()))) {
150154
return false;
151155
}
152156

0 commit comments

Comments
 (0)