
Commit 8292e2e

Update on "[inductor] CUDAGraph P2P pool handling for symm_mem"
Summary: When symm_mem P2P tensors (allocated via `empty_strided_p2p` with `alloc_id`) are inputs to a CUDAGraph partition, the cudagraph tree must handle them specially:

1. `p2p_input_idxs`: detected during node initialization via a `_has_Standard_Deleter` check and added to `static_input_idxs`, so they are passed through without being copied into the cudagraph pool (which would lose the P2P property) and their pointer stability is validated on replay.
2. `check_memory_pool`: filters out P2P allocations (non-standard deleter) before validating against the cudagraph pool, since P2P buffers use cuMemCreate/cuMemMap and are not managed by the CUDA caching allocator.
3. `dealloc_current_path_weakrefs`: skips the standard-deleter assertion for P2P storage wrappers.
4. `test_external_allocation_fallback` updated: now expects success (automatic copy to P2P) instead of a RuntimeError, with codegen and runtime correctness checks.

Differential Revision: https://phabricator.intern.facebook.com/D93914969

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy kadeng muchulee8 amjames chauhang aakhundov coconutruben jataylo mlazos

[ghstack-poisoned]
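The static-input selection in point 1 can be illustrated without CUDA. The sketch below is hypothetical: `FakeTensor` and `has_standard_deleter` are stand-ins for real tensors and `torch._C._has_Standard_Deleter`, used only to make the filtering logic concrete and testable.

```python
from dataclasses import dataclass


@dataclass
class FakeTensor:
    """Stand-in for torch.Tensor; only the fields the filter reads."""
    is_cuda: bool
    standard_deleter: bool  # True -> owned by the CUDA caching allocator


def has_standard_deleter(t: FakeTensor) -> bool:
    # Stand-in for torch._C._has_Standard_Deleter(t.untyped_storage()._cdata)
    return t.standard_deleter


def select_p2p_input_idxs(inputs):
    """Indices of CUDA inputs whose storage is NOT from the caching
    allocator (e.g. symm_mem P2P buffers). These become static inputs:
    passed through by pointer, never copied into the cudagraph pool."""
    return [
        idx
        for idx, t in enumerate(inputs)
        if isinstance(t, FakeTensor)
        and t.is_cuda
        and not has_standard_deleter(t)
    ]


inputs = [
    FakeTensor(is_cuda=True, standard_deleter=True),   # normal CUDA tensor
    FakeTensor(is_cuda=True, standard_deleter=False),  # P2P symm_mem buffer
    FakeTensor(is_cuda=False, standard_deleter=True),  # CPU tensor
]
print(select_p2p_input_idxs(inputs))  # [1]
```

Only index 1 qualifies: it is both CUDA and externally allocated, mirroring the comprehension in `CUDAGraphNode.__init__`.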
2 parents 8431109 + 06e741f commit 8292e2e

3 files changed

Lines changed: 25 additions & 22 deletions

File tree

test/distributed/test_symmetric_memory.py

Lines changed: 3 additions & 7 deletions
```diff
@@ -1593,14 +1593,10 @@ def func(x, w):
     @skip_if_rocm_multiprocess  # requires registered-buffer support
     @skip_if_lt_x_gpu(2)
     @fresh_inductor_cache()
-    def test_cudagraph_p2p_input_passthrough(self):
+    def test_one_shot_all_reduce_with_cudagraph(self):
         """
-        Verify that when a symm_mem collective's input is a cudagraph-managed
-        tensor from a prior compiled graph, the P2P tensor is correctly passed
-        through the cudagraph tree without being copied to the regular pool.
-
-        This tests the p2p_input_idxs mechanism in CUDAGraphNode that adds P2P
-        inputs to static_input_idxs so they are not re-allocated.
+        Verify one_shot_all_reduce correctness under CUDAGraph
+        record + replay (mode="reduce-overhead").
         """
         self._init_process()
```
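The replay-time pointer-stability validation mentioned in the summary can be sketched in plain Python. This is a hedged analogue, not the real `CUDAGraphNode` code: object identity (`id`) stands in for `tensor.data_ptr()`, and the class name is invented for illustration.

```python
class GraphNodeSketch:
    """Records the addresses of static inputs at capture time and asserts
    on every replay that they have not moved (a cudagraph bakes raw
    pointers into the captured kernels, so static inputs must be stable)."""

    def __init__(self, inputs, static_input_idxs):
        self.static_input_idxs = static_input_idxs
        # id() models tensor.data_ptr() for this sketch
        self.recorded_ptrs = {i: id(inputs[i]) for i in static_input_idxs}

    def check_static_inputs_stable(self, inputs):
        for i in self.static_input_idxs:
            if id(inputs[i]) != self.recorded_ptrs[i]:
                raise RuntimeError(
                    f"static input {i} moved; cudagraph replay "
                    "requires a stable address"
                )


buf = bytearray(16)  # stands in for a P2P buffer with a fixed address
node = GraphNodeSketch([buf], static_input_idxs=[0])
node.check_static_inputs_stable([buf])  # ok: same address as at record time
try:
    node.check_static_inputs_stable([bytearray(16)])  # different address
except RuntimeError as e:
    print("caught:", e)
```

Because P2P buffers keep their cuMemMap'd address for their whole lifetime, this check is expected to pass on every replay for `p2p_input_idxs` entries.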

torch/_inductor/cudagraph_trees.py

Lines changed: 22 additions & 14 deletions
```diff
@@ -921,19 +921,15 @@ def __init__(
             if isinstance(t, torch.Tensor) and self._is_cuda_graph_recorded_tensor(t)
         ]

-        # P2P symmetric memory inputs (allocated via empty_strided_p2p with
-        # alloc_id). These have stable addresses and are passed through
-        # without copying into the cudagraph pool.
-        #
-        # Detection: P2P buffers are allocated via cuMemCreate/cuMemMap (not
-        # the CUDA caching allocator), so they lack a standard deleter.
-        # TODO: Replace with a positive is_p2p check on StorageImpl (requires C++ change).
+        # P2P symmetric memory inputs are not from the caching allocator.
+        # They are allocated via empty_strided_p2p and have stable addresses.
+        # Add them to static_input_idxs to prevent re-allocation from the cudagraph pool.
         self.p2p_input_idxs: list[int] = [
             idx
             for idx, t in enumerate(inputs)
             if isinstance(t, torch.Tensor)
             and t.is_cuda
-            and not torch._C._has_Standard_Deleter(t.untyped_storage()._cdata)
+            and _is_external_storage(t.untyped_storage()._cdata)
         ]

         # (depth, offset) of live tensors which are alias of previous graph outputs
@@ -1872,6 +1868,17 @@ def format_tb(frames: list[Any]) -> str:
     return "".join(traceback.format_list(formatted_traceback))


+def _is_external_storage(storage_cdata: int) -> bool:
+    """Check if a storage is not allocated by the CUDA caching allocator.
+    In the cudagraph tree, all standard CUDA tensors use the caching
+    allocator's deleter (raw_deleter). External allocations such as P2P
+    symmetric memory (via cuMemCreate) use a different deleter, so we
+    can distinguish them with _has_Standard_Deleter.
+    TODO: add a positive is_p2p / is_external flag on StorageImpl so we
+    don't rely on deleter identity."""
+    return not torch._C._has_Standard_Deleter(storage_cdata)
+
+
 def check_memory_pool(
     device: int,
     pool_id: tuple[int, int],
@@ -1884,10 +1891,10 @@ def check_memory_pool(
         storage_ptr = stor()
         if storage_ptr is None:
             continue
-        # Skip non-pool allocations (e.g., P2P symmetric memory buffers allocated
-        # via cuMemCreate/cuMemMap). These are not managed by the CUDA caching
-        # allocator and should not be validated against the cudagraph pool.
-        if not torch._C._has_Standard_Deleter(storage_ptr):
+        # Skip non-pool allocations, for example, P2P symmetric memory buffers allocated via cuMemCreate.
+        # They are not managed by the CUDA caching allocator and should not be validated
+        # against the cudagraph pool.
+        if _is_external_storage(storage_ptr):
             continue
         unique_storages.add(stor.data_ptr())

@@ -2731,8 +2738,9 @@ def apply_checkpoint_execution_state_in_allocator(self) -> None:
         for wrapper in live_storages_wrappers:
             storage_ptr = wrapper()
             assert storage_ptr is not None
-            # Skip non-pool allocations (e.g., P2P symmetric memory buffers)
-            if torch._C._has_Standard_Deleter(storage_ptr):
+            # P2P storages are not in the cudagraph pool, so
+            # skip the deallocation check for them.
+            if not _is_external_storage(storage_ptr):
                 assert wrapper.data_ptr() not in ptrs_to_deallocate

     def live_cudagraph_pool_storages_in_curr_execution(
```
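The `check_memory_pool` filtering can be modeled with plain pointers. In this hedged sketch the pool is a set of integers and the deleter check is a set-membership test (the real code calls `torch._C._has_Standard_Deleter` on the storage); all names and addresses are invented for illustration.

```python
def is_external_storage(ptr, standard_deleter_ptrs):
    """External == not owned by the caching allocator's raw_deleter."""
    return ptr not in standard_deleter_ptrs


def validate_pool(live_ptrs, pool_ptrs, standard_deleter_ptrs):
    """Every live caching-allocator storage must belong to the cudagraph
    pool; external storages (e.g. P2P symm_mem buffers) are skipped, since
    they are cuMemCreate'd outside the pool by design."""
    checked = set()
    for ptr in live_ptrs:
        if is_external_storage(ptr, standard_deleter_ptrs):
            continue  # P2P buffer: lives outside the pool, do not validate
        if ptr not in pool_ptrs:
            raise AssertionError(f"storage {ptr:#x} not in cudagraph pool")
        checked.add(ptr)
    return checked


pool = {0x1000, 0x2000}          # addresses owned by the cudagraph pool
standard = {0x1000, 0x2000}      # allocated by the caching allocator
live = [0x1000, 0x9000]          # 0x9000 is an external P2P buffer
print(sorted(validate_pool(live, pool, standard)))  # [4096]
```

Without the skip, the external pointer `0x9000` would fail the pool check even though it is a perfectly valid P2P allocation, which is exactly the failure mode the patch removes.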

torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu

Lines changed: 0 additions & 1 deletion
```diff
@@ -111,7 +111,6 @@ CUDAPeerAllocInfo::~CUDAPeerAllocInfo() {
   if (is_finalizing()) {
     return;
   }
-  // Best-effort free -- ignore errors during process teardown.
   c10::cuda::CUDAGuard guard(local_device_idx_);
   (void)cudaFree(buffers_dev_);
   (void)cudaFree(signal_pads_dev_);
```
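The destructor above follows a common teardown pattern: bail out entirely if the process is finalizing, otherwise free best-effort and ignore errors. A hedged Python analogue (class and callback names are invented; `sys.is_finalizing` mirrors the C++ `is_finalizing()` guard):

```python
import sys


class PeerAllocInfoSketch:
    """Illustrative analogue of CUDAPeerAllocInfo's destructor behavior."""

    def __init__(self, free_fn):
        self._free = free_fn  # stands in for cudaFree of the device arrays

    def __del__(self):
        if sys.is_finalizing():
            # During interpreter teardown the driver/runtime may already be
            # gone; skip cleanup entirely, as the C++ dtor does.
            return
        try:
            self._free()  # best-effort free: errors are swallowed
        except Exception:
            pass


calls = []
info = PeerAllocInfoSketch(lambda: calls.append("free"))
del info  # refcount drops -> __del__ runs immediately in CPython
print(calls)  # ['free']
```

The `(void)cudaFree(...)` casts in the C++ code serve the same purpose as the `try/except` here: the return code is deliberately discarded.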
