
Commit 761237c

kwen2501 authored and pytorchmergebot committed
Enable Copy Engine all-gather in FSDP (pytorch#176613)
Resolves [[RFC] Enable Copy Engine all-gather in FSDP](pytorch#176418).

Productization of the micro-benchmark in pytorch#172714, which showed a 15% end-to-end speedup when the all-gather is overlapped with GEMM, compared to the non-CE case. Basic recipe: pytorch#170265, i.e. using symmetric memory for the all-gather buffer (and turning on the NCCL zero-CTA policy).

## Implementation
- Added a `SymmMemAllocMixin` in FSDP that can allocate symmetric memory for the all-gather buffer.
- To enable reuse of the symmetric buffer, wrapped the allocation in a MemPool. (Verified from the profile below that rendezvous is not repeatedly called.)
- Added a `set_symm_mem_for_comm` API for users to turn on this feature.

## Profile
- Added test `TestFullyShardSymmMem`.
- Flip `PROFILE` to `True` in the TestCase.
- Run: `python test/distributed/_composable/fsdp/test_fully_shard_comm.py TestFullyShardSymmMem.test_fully_shard_symm_mem`

All-gathers are now performed by the Copy Engine:

<img width="1239" height="213" alt="Screenshot 2026-03-05 at 10 41 59 PM" src="https://github.com/user-attachments/assets/885eaf55-5356-43a6-87b4-2faefae2b590" />

## TODO
- Add a similar `SymmMemAllocMixin` for reduce-scatter. That would not trigger the Copy Engine, because reduce-scatter still needs compute, but it will trigger the new symmetric kernel for RS in NCCL 2.29, which is faster and more scalable.

Special thanks to @xuwchen and @qiangyicheng for your help.

Pull Request resolved: pytorch#176613
Approved by: https://github.com/weifengpy
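As a usage sketch (not part of this PR): a minimal example of how a user might enable the feature, assuming a `torchrun` launch and a toy `nn.Sequential` model; the process-group setup follows the zero-CTA recipe from the `set_symm_mem_for_comm` docstring added below.

```python
import os

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.fsdp import fully_shard

# The zero-CTA policy is what lets NCCL route the symmetric-memory all-gather
# to the Copy Engine instead of an SM-based kernel.
opts = dist.ProcessGroupNCCL.Options()
opts.config.cta_policy = dist.ProcessGroupNCCL.NCCL_CTA_POLICY_ZERO
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", pg_options=opts, device_id=device)

# Toy model; any module sharded per block with fully_shard works the same way.
model = nn.Sequential(*[nn.Linear(4096, 4096, device=device) for _ in range(4)])
for layer in model:
    fully_shard(layer)
    layer.set_symm_mem_for_comm()  # stage all-gather buffers in symmetric memory
fully_shard(model)
model.set_symm_mem_for_comm()

loss = model(torch.randn(8, 4096, device=device)).sum()
loss.backward()
```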
1 parent c088f93 commit 761237c

4 files changed

Lines changed: 176 additions & 5 deletions


test/distributed/_composable/fsdp/test_fully_shard_comm.py

Lines changed: 68 additions & 2 deletions
@@ -13,6 +13,8 @@
 import torch.distributed as dist
 import torch.nn as nn
 import torch.nn.functional as F
+from torch._C._autograd import DeviceType
+from torch._C._distributed_c10d import _SymmetricMemory
 from torch.distributed._composable import checkpoint, replicate
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     apply_activation_checkpointing,
@@ -44,8 +46,10 @@
 from torch.distributed.tensor import DTensor
 from torch.distributed.tensor.debug import CommDebugMode
 from torch.distributed.tensor.experimental import implicit_replication
+from torch.testing._internal.common_cuda import SM90OrLater, TEST_MULTIGPU
 from torch.testing._internal.common_distributed import (
-    requires_multicast_support,
+    MultiProcContinuousTest,
+    PLATFORM_SUPPORTS_SYMM_MEM,
     skip_if_lt_x_gpu,
 )
 from torch.testing._internal.common_fsdp import (
@@ -59,7 +63,9 @@
     patch_unshard,
 )
 from torch.testing._internal.common_utils import (
+    requires_cuda_p2p_access,
     run_tests,
+    skip_but_pass_in_sandcastle_if,
     TEST_WITH_ROCM,
     TEST_XPU,
     xfailIf,
@@ -70,6 +76,7 @@
     Transformer,
     TransformerBlock,
 )
+from torch.testing._internal.inductor_utils import skipCUDAIf


 c10d_ops = torch.ops.c10d
@@ -1638,8 +1645,15 @@ def _run(cls, *args, **kwargs):
     @skip_if_lt_x_gpu(2)
     # The NCCL PG refuses to allocate tensors if multicast is unavailable, see
     # https://github.com/pytorch/pytorch/blob/503362d019b3782581492af7767945dbd75ca1c9/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L5634
-    @requires_multicast_support()
     def test_fully_shard_alloc_from_pg(self):
+        # Run this check inside the test instead of using @requires_multicast_support().
+        # The decorator would trigger an initialization of the SymmMem allocator
+        # when Python statically initializes classes in this file, causing
+        # SymmMem to fix the allocation backend to "CUDA". This is unfriendly to
+        # other tests in this file that require the NCCL backend.
+        if not _SymmetricMemory.has_multicast_support(DeviceType.CUDA, 0):
+            self.skipTest("multicast support is not available")
+
         torch.manual_seed(42)
         model_args = ModelArgs()
         model = Transformer(model_args)
@@ -1691,6 +1705,58 @@ def test_exception_when_used_together_with_comm_hooks(self):
         model.set_allocate_memory_from_process_group_for_comm(True)


+@requires_cuda_p2p_access()
+@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "Not enough GPUs to run the test")
+@unittest.skipIf(
+    not PLATFORM_SUPPORTS_SYMM_MEM, "SymmMem is not supported on this platform"
+)
+@skipCUDAIf(TEST_WITH_ROCM, "requires NVIDIA GPUs")
+@skipCUDAIf(not SM90OrLater, "requires sm90+")
+class TestFullyShardSymmMem(MultiProcContinuousTest):
+    @classmethod
+    def backend_str(cls) -> Optional[str]:
+        return "nccl"
+
+    @classmethod
+    def opts(cls):
+        if not dist.is_nccl_available():
+            return None
+        # Enable Zero-CTA policy for CE collectives
+        opts = dist.ProcessGroupNCCL.Options()
+        opts.config.cta_policy = dist.ProcessGroupNCCL.NCCL_CTA_POLICY_ZERO
+        return opts
+
+    @property
+    def device(self) -> torch.device:
+        return torch.device("cuda", self.rank)
+
+    def test_fully_shard_symm_mem(self):
+        torch.manual_seed(42 + self.rank)
+        device = torch.device("cuda", self.rank)
+        torch.cuda.set_device(device)
+        seq_len = 64
+        model_args = ModelArgs()
+        model_args.dim = 4096
+        model_args.max_seq_len = seq_len
+        model = Transformer(model_args).to(device)
+        for module in model.modules():
+            if isinstance(module, TransformerBlock):
+                fully_shard(module)
+                module.set_symm_mem_for_comm()
+        fully_shard(model)
+        model.set_symm_mem_for_comm()
+
+        bs = 4
+        inp = torch.randint(0, model_args.vocab_size, (bs, seq_len), device=device)
+
+        def run():
+            loss = model(inp)
+            loss.sum().backward()
+
+        run()
+        torch.cuda.synchronize(device)
+
+
 class TestFullyShardForceSumReduction(FSDPTest):
     # The messages might change when we move to a different NCCL version.
     # Please update this test if it starts failing.

torch/distributed/fsdp/_fully_shard/_fsdp_collectives.py

Lines changed: 61 additions & 1 deletion
@@ -1,10 +1,11 @@
 import math
 from collections.abc import Callable, Sequence
 from itertools import chain
-from typing import Any, cast, NamedTuple
+from typing import Any, cast, Literal, NamedTuple

 import torch
 import torch.distributed as dist
+import torch.distributed._symmetric_memory as symm_mem
 from torch.distributed.device_mesh import _get_device_handle
 from torch.distributed.distributed_c10d import ReduceOp
 from torch.distributed.fsdp._fully_shard._fsdp_api import AllGather, ReduceScatter
@@ -77,6 +78,36 @@ def allocate(
         return torch.empty(*size, dtype=dtype, device=device)


+class SymmMemAllocMixin:
+    def __init__(
+        self,
+        group: dist.ProcessGroup,
+        backend: Literal["NCCL"] = "NCCL",
+        *args: Any,
+        **kwargs: Any,
+    ):
+        self._group = group
+        symm_mem.set_backend(backend)
+        # Force initialization of the communicator; otherwise, the rendezvous
+        # may see an empty communicator.
+        # TODO: Remove this, maybe by warning the user to perform eager dist init.
+        # For now, it is okay since it is just a one-time cost at init.
+        dist.barrier(group=group)
+
+    def allocate(
+        self,
+        size: Sequence[int | torch.SymInt],
+        *,
+        dtype: torch.dtype,
+        device: torch.device,
+    ) -> torch.Tensor:
+        # Leverage MemPool to reuse the symmetric buffer, avoiding allocation
+        # and rendezvous overhead
+        mempool = symm_mem.get_mem_pool(device)
+        with torch.cuda.use_mem_pool(mempool):
+            return torch.empty(size, dtype=dtype, device=device)
+
+
 class DefaultAllGather(DefaultAllocMixin, AllGather):
     def __call__(
         self,
@@ -112,6 +143,35 @@ def __call__(
         )


+class SymmMemAllGather(SymmMemAllocMixin, AllGather):
+    def __init__(
+        self,
+        group: dist.ProcessGroup,
+        backend: Literal["NCCL"] = "NCCL",
+    ) -> None:
+        super().__init__(group, backend)
+
+    def __call__(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        group: dist.ProcessGroup,
+        async_op: bool = False,
+    ) -> dist.Work | None:
+        # We are doing an in-place all-gather, so we only need to rendezvous the output tensor
+        symm_mem.rendezvous(output_tensor, group=group.group_name)
+        # Calling a regular all-gather already causes libraries like NCCL to
+        # use their optimized all-gather implementations for symmetric memory:
+        # - Copy Engine All-Gather (when the zero-CTA policy is enabled)
+        # - Symmetric Kernel All-Gather (when the zero-CTA policy is not enabled)
+        return dist.all_gather_into_tensor(
+            output_tensor,
+            input_tensor,
+            group=group,
+            async_op=async_op,
+        )
+
+
 class DefaultReduceScatter(DefaultAllocMixin, ReduceScatter):
     def __call__(
         self,

torch/distributed/fsdp/_fully_shard/_fsdp_param_group.py

Lines changed: 12 additions & 1 deletion
@@ -3,7 +3,7 @@

 import contextlib
 import logging
-from typing import Any, cast, NamedTuple, TYPE_CHECKING
+from typing import Any, cast, Literal, NamedTuple, TYPE_CHECKING

 import torch
 import torch.distributed as dist
@@ -29,6 +29,7 @@
     ProcessGroupAllocAllGather,
     ProcessGroupAllocReduceScatter,
     ReduceScatter,
+    SymmMemAllGather,
 )
 from ._fsdp_common import (
     _dynamo_disable,
@@ -275,6 +276,16 @@ def lazy_init(self):
         self._init_mp_dtypes()
         self._register_state_dict_hooks()

+    def set_symm_mem(self, backend: Literal["NCCL"] = "NCCL") -> None:
+        if not isinstance(self._all_gather_comm, (DefaultAllGather | SymmMemAllGather)):
+            raise AssertionError(
+                "cannot call set_symm_mem() "
+                f"when all gather comm is custom: {self._all_gather_comm.__class__.__name__}"
+            )
+        self._all_gather_comm = SymmMemAllGather(
+            self._all_gather_process_group, backend
+        )
+
     def set_allocate_memory_from_process_group(self, enable: bool) -> None:
         """
         Whether to (try to) use the ProcessGroup's allocate_tensor method for

torch/distributed/fsdp/_fully_shard/_fully_shard.py

Lines changed: 35 additions & 1 deletion
@@ -5,7 +5,7 @@

 import functools
 from contextlib import contextmanager
-from typing import Any, cast, NoReturn, overload, TYPE_CHECKING
+from typing import Any, cast, Literal, NoReturn, overload, TYPE_CHECKING
 from typing_extensions import deprecated

 import torch
@@ -629,6 +629,40 @@ def set_allocate_memory_from_process_group_for_comm(self, enable: bool) -> None:
         for fsdp_param_group in state._fsdp_param_groups:
             fsdp_param_group.set_allocate_memory_from_process_group(enable)

+    def set_symm_mem_for_comm(self, backend: Literal["NCCL"] = "NCCL") -> None:
+        """
+        Sets the symmetric memory (``symm_mem``) backend for allocating the
+        staging buffers used in all-gather collectives. This allows NCCL to use
+        optimized all-gather implementations via symmetric memory. The chosen
+        optimization may depend on the topology of the system: for a single
+        node, Copy Engine All-Gather may be used; for multi-node, Symmetric
+        Kernel All-Gather may be used.
+
+        To enable Copy Engine All-Gather, you need to set up the NCCL process
+        group with the zero-CTA policy:
+        ```python
+        opts = dist.ProcessGroupNCCL.Options()
+        opts.config.cta_policy = dist.ProcessGroupNCCL.NCCL_CTA_POLICY_ZERO
+        dist.init_process_group(backend="nccl", pg_options=opts, device_id=device)
+        ```
+        Alternatively, you can set the environment variable `NCCL_CTA_POLICY` to 2:
+        ```bash
+        export NCCL_CTA_POLICY=2
+        ```
+        For more details, see [Copy Engine
+        Collectives](https://docs.pytorch.org/docs/2.11/symmetric_memory.html#copy-engine-collectives).
+
+        This cannot be used together with :meth:`set_custom_all_gather` or
+        :meth:`set_custom_reduce_scatter`.
+
+        Args:
+            backend (str): The symmetric memory backend to use. Defaults to
+                ``"NCCL"``. Currently, only ``"NCCL"`` is supported.
+        """
+        state = self._get_fsdp_state()
+        for fsdp_param_group in state._fsdp_param_groups:
+            fsdp_param_group.set_symm_mem(backend)
+
     def _set_unshard_async_op(self, async_op: bool):
         """
         Sets whether to use ``async_op=True`` or ``False`` for the pre-forward
