
Commit 4115354

eellison authored and pytorchmergebot committed
Use wait stream instead of synchronize() in cudagraph warmup (#117578)
Fix for #113895.

There are three phases to cudagraph trees: warmup, recording, and execution. On recording and execution we execute under the current stream. In warmup we execute under a side stream, which we also use for cudagraph recording so as to reuse memory. After executing on the side stream we need to sync the current stream to the side stream. Previously there was a `torch.cuda.synchronize()` but not a `torch.cuda.current_stream().wait_stream(stream)`. This PR removes the global sync and adds the `wait_stream`. I have confirmed that it fixes #113895.

It's not entirely clear to me why `torch.cuda.synchronize()` would be insufficient; I would have thought the global sync would encompass the stream-to-stream sync. However, we do have a number of [instances](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/compile_fx.py#L748-L749) throughout the code base where we do a stream-to-stream sync after the global sync, so clearly I am missing something here. In any case, the stream-to-stream sync is better for perf than a global synchronize.

Pull Request resolved: #117578
Approved by: https://github.com/zdevito
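Below is a minimal, self-contained sketch (not the PR's code) of the pattern this change adopts. The helper name `run_on_side_stream` and the toy workload are illustrative assumptions, but the `wait_stream` calls mirror what warmup now does:

```python
import torch

def run_on_side_stream(fn, *args):
    # Illustrative helper (not from this PR): run `fn` on a side stream,
    # then order the current stream after it.
    side_stream = torch.cuda.Stream()
    # The side stream must first wait for any pending work on the current
    # stream that produced the inputs.
    side_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side_stream):
        out = fn(*args)
    # Stream-to-stream sync: kernels later launched on the current stream
    # are ordered after the side-stream work, without stalling the whole
    # device the way torch.cuda.synchronize() would.
    torch.cuda.current_stream().wait_stream(side_stream)
    return out

# Usage: the result is safe to consume on the current stream afterwards.
x = torch.rand([1024, 1024], device="cuda")
y = run_on_side_stream(torch.matmul, x, x)
z = y + 1  # launched on the current stream, ordered after the matmul
```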
1 parent 560213d commit 4115354

2 files changed

Lines changed: 22 additions & 3 deletions


test/inductor/test_cudagraph_trees.py

Lines changed: 20 additions & 0 deletions
```diff
@@ -708,6 +708,26 @@ def foo(args):
         self.assertEqual(node.cached_tensor_outputs, [None])
         self.assertEqual(node.unaliased_in_all_paths, [False])
 
+    def test_warmup_stream_sync(self):
+        def foo(args):
+            x = args[0]
+            args.clear()
+            x_orig = x
+            for _ in range(100):
+                x = x @ x
+            return (x,)
+
+        inp = torch.rand([4096, 4096], device="cuda")
+        ref = foo([inp])[0]
+        torch.cuda.synchronize()
+
+        user_stream = torch.cuda.Stream()
+        with torch.cuda.stream(user_stream):
+            foo_cg = self.cudagraphify_impl(foo, [inp], (0,))
+            out = foo_cg([inp])[0]
+            y = out + 1
+            self.assertEqual(y, ref + 1)
+
     def test_unaligned_static_parameter(self):
         def gen_inp():
             inp = torch.ones([20], device="cuda")
```

torch/_inductor/cudagraph_trees.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -518,6 +518,8 @@ def _use_cuda_memory_pool_manager(device, mem_pool, stream):
             torch._C._cuda_endAllocateCurrentStreamToPool(device, mem_pool)
             torch._C._cuda_releasePool(device, mem_pool)
 
+    torch.cuda.current_stream().wait_stream(stream)
+
 
 def map_to_ref(t: Optional[Tensor]) -> Optional[StorageWeakRefWrapper]:
     if not isinstance(t, torch.Tensor):
@@ -610,9 +612,6 @@ def get_non_cudagraph_inps():
         ), get_history_recording():
             out = self.wrapped_function.model(new_inputs)
 
-        # sync up stream used in `_use_cuda_memory_pool_manager` - TODO - wait stream instead ?
-        torch.cuda.synchronize()
-
         assert len(new_inputs) == 0
 
         # sdpa returns cpu tensors when not recording cuda graph
```
