
Commit 9b53d31

Wanchao Liang authored and pytorchmergebot committed
Implement gather primitive for ProcessGroupNCCL (#66745)
Summary:
Pull Request resolved: #66745

This PR implements NCCL gather and adds gather to ProcessGroupNCCL using the NCCL send/recv API. NCCL does not provide a gather primitive directly, so gather has to be implemented on top of NCCL's send/recv API.

1. In ProcessGroupNCCL.cpp, the outputTensors are first flattened, then inputTensors and outputFlattened are passed by the collective class to the gather() function in nccl.cpp.
2. In nccl.cpp, gather is implemented using ncclSend/ncclRecv: every rank sends its inputTensor to the root rank, and the root rank receives these inputTensors in a loop.

ghstack-source-id: 147754838

Test Plan:
test_gather_ops
test_gather_checks
test_gather_stress

Reviewed By: pritamdamania87

Differential Revision: D29616361

fbshipit-source-id: b500d9b8e67113194c5cc6575fb0e5d806dc7782
(cherry picked from commit d560ee7)
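For reference, here is a minimal usage sketch (not part of this commit's diff) of what the change enables from the Python side: calling gather through torch.distributed with the NCCL backend, where only the root rank supplies a gather_list. The run() helper, the init_process_group arguments, and the launch/environment details are illustrative assumptions, not code from this PR.

import torch
import torch.distributed as dist

def run(rank, world_size):
    # Assumes MASTER_ADDR/MASTER_PORT are set and one process per GPU.
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    # Each rank contributes one CUDA tensor holding its own rank id.
    inp = torch.tensor([rank], device="cuda")

    root = 0
    if rank == root:
        # Only the root passes a gather_list: world_size tensors with the
        # same shape/dtype as the input, all on this rank's GPU.
        gather_list = [torch.empty_like(inp) for _ in range(world_size)]
        dist.gather(inp, gather_list=gather_list, dst=root)
        # gather_list now holds [tensor([0]), tensor([1]), ...] on the root GPU.
    else:
        # Non-root ranks pass no gather_list; they only send to the root.
        dist.gather(inp, dst=root)

    dist.destroy_process_group()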
1 parent 0a8b391 commit 9b53d31

7 files changed

Lines changed: 276 additions & 38 deletions


docs/source/distributed.rst

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ MPI supports CUDA only if the implementation used to build PyTorch supports it.
 +----------------+-----+-----+-----+-----+-----+-----+
 | all_gather |||| ? |||
 +----------------+-----+-----+-----+-----+-----+-----+
-| gather |||| ? || |
+| gather |||| ? || |
 +----------------+-----+-----+-----+-----+-----+-----+
 | scatter |||| ? |||
 +----------------+-----+-----+-----+-----+-----+-----+

test/distributed/test_c10d_nccl.py

Lines changed: 136 additions & 2 deletions
@@ -422,7 +422,7 @@ def test_allgather_ops(self):
 
         def allgather(output_ts, input_ts):
             work = pg.allgather(output_ts, input_ts)
-            work.wait()
+            return work.wait()
 
         tensors = [torch.empty(2, 2).fill_(2).cuda(device=i) for i in local_device_ids]
         output_tensors = []
@@ -435,7 +435,7 @@ def allgather(output_ts, input_ts):
             output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu])
             expected_output.append([t.cuda(device=gpu) for t in expected_per_gpu])
 
-        allgather(output_tensors, tensors)
+        result = allgather(output_tensors, tensors)
 
         # Verification
         self.assertEqual(output_tensors, expected_output)
@@ -495,6 +495,140 @@ def allgather_base(output_t, input_t):
             # fails the check because the dtype is different
             allgather_base(output_t, tensor)
 
+    @requires_nccl()
+    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
+    def test_gather_ops(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        pg = self._create_process_group_nccl(store, self.opts())
+        local_device_ids = self.rank_to_GPU[self.rank]
+        num_gpus = len(local_device_ids)
+
+        def gather(output_t, input_t, rootRank):
+            opts = c10d.GatherOptions()
+            opts.rootRank = rootRank
+            if rootRank == self.rank:
+                work = pg.gather(output_t, input_t, opts)
+            else:
+                work = pg.gather([], input_t, opts)
+            work.wait()
+
+        # init input
+        tensors = []
+        for device_id in local_device_ids:
+            tensors.append(torch.tensor([self.rank]).cuda(device_id))
+
+        # init output
+        output_ts = []
+        for idx in range(num_gpus):
+            gpu_idx = local_device_ids[idx]
+            output_ts.append([])
+            for rank in range(self.world_size):
+                output_ts[idx].append(torch.tensor([-1]).cuda(gpu_idx))
+
+        expected = [[torch.tensor([rank]) for rank in range(self.world_size)]]
+        for rank in range(self.world_size):
+            gather(output_ts, tensors, rank)
+            if rank == self.rank:
+                self.assertEqual(expected, output_ts)
+
+    @requires_nccl()
+    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
+    def test_gather_stress(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        pg = self._create_process_group_nccl(store, self.opts())
+        local_device_ids = self.rank_to_GPU[self.rank]
+        num_gpus = len(local_device_ids)
+
+        def gather(output_t, input_t, rootRank):
+            opts = c10d.GatherOptions()
+            opts.rootRank = rootRank
+            if rootRank == self.rank:
+                work = pg.gather(output_t, input_t, opts)
+            else:
+                work = pg.gather([], input_t, opts)
+            work.wait()
+
+        stress_length = 1000
+
+        # init input
+        tensors = []
+        for i in range(stress_length):
+            tensors.append([])
+            for device_id in local_device_ids:
+                tensors[i].append(torch.tensor([self.rank]).cuda(device_id))
+
+        # init output
+        output_ts = []
+        for i in range(stress_length):
+            output_ts.append([[] for _ in range(num_gpus)])
+            for idx, ls in enumerate(output_ts[i]):
+                gpu_idx = local_device_ids[idx]
+                for _ in range(self.world_size):
+                    ls.append(torch.tensor([-1]).cuda(gpu_idx))
+
+        expected = [[torch.tensor([rank]) for rank in range(self.world_size)]]
+        for i in range(stress_length):
+            for rank in range(self.world_size):
+                gather(output_ts[i], tensors[i], rank)
+                # Verification
+                if rank == self.rank:
+                    self.assertEqual(output_ts[i], expected)
+
+    @requires_nccl()
+    @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
+    def test_gather_checks(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        pg = self._create_process_group_nccl(store, self.opts())
+        local_device_ids = self.rank_to_GPU[self.rank]
+        num_gpus = len(local_device_ids)
+
+        # init input
+        tensors = []
+        for device_id in local_device_ids:
+            tensors.append(torch.tensor([self.rank]).cuda(device_id))
+
+        # init output
+        output_ts = []
+        for idx in range(num_gpus):
+            gpu_idx = local_device_ids[idx]
+            output_ts.append([])
+            for rank in range(self.world_size):
+                output_ts[idx].append(torch.tensor([-1]).cuda(gpu_idx))
+
+        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
+            opts = c10d.GatherOptions()
+            opts.rootRank = -1
+            pg.gather(output_ts, tensors, opts)
+
+        with self.assertRaisesRegex(TypeError, "incompatible function arguments"):
+            pg.gather(output_ts, tensors, 0)
+
+        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
+            opts = c10d.GatherOptions()
+            opts.rootRank = self.world_size
+            pg.gather(output_ts, tensors, opts)
+
+        with self.assertRaisesRegex(
+            RuntimeError, "Tensor list must be nonempty"
+        ):
+            opts = c10d.GatherOptions()
+            opts.rootRank = 0
+            pg.gather(output_ts, [], opts)
+
+        with self.assertRaisesRegex(
+            RuntimeError, "Tensors must be on distinct GPU devices"
+        ):
+            # init input
+            tensors2 = []
+            for device_id in local_device_ids:
+                tensors2.append(torch.tensor([self.rank]).cuda(device_id))
+                tensors2.append(torch.tensor([self.rank]).cuda(device_id))
+
+            opts = c10d.GatherOptions()
+            opts.rootRank = 0
+            pg.gather(output_ts, tensors2, opts)
+
+
     @requires_nccl()
     @sandcastle_skip_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
     def test_reduce_scatter_base_basics(self):

torch/csrc/cuda/nccl.cpp

Lines changed: 48 additions & 0 deletions
@@ -809,6 +809,54 @@ void recv(
 #endif
 }
 
+
+void gather(
+    const at::Tensor& inputs,
+    std::vector<at::Tensor>& outputs,
+    ncclComm_t _comm,
+    at::cuda::CUDAStream& stream,
+    int32_t root) {
+#ifdef USE_NCCL
+#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && (NCCL_MAJOR * 10 + NCCL_MINOR) >= 27
+  using namespace torch::cuda::nccl::detail;
+
+  auto comm = to_nccl_comm(_comm);
+  int numranks, cur_rank;
+  NCCL_CHECK(ncclCommCount(comm, &numranks));
+  NCCL_CHECK(ncclCommUserRank(comm, &cur_rank));
+
+  size_t count = inputs.numel();
+  auto type = to_nccl_data_type(inputs);
+  const auto* sendbuff = reinterpret_cast<char*>(inputs.data_ptr());
+
+  NCCL_CHECK(ncclGroupStart());
+
+  if (cur_rank == root)
+  {
+    for (int r = 0; r < numranks; r++)
+    {
+      if (r != root) {
+        auto* recvbuff = reinterpret_cast<char*>(outputs[r].data_ptr());
+        NCCL_CHECK(ncclRecv(recvbuff, count, type, r, comm, stream));
+      } else {
+        // on its own rank, simply copy from the input
+        outputs[r].copy_(inputs);
+      }
+    }
+  } else {
+    NCCL_CHECK(ncclSend(sendbuff, count, type, root, comm, stream));
+  }
+  NCCL_CHECK(ncclGroupEnd());
+
+#else
+  AT_ERROR("gather is only supported for NCCL lib version >= 2.7.0");
+#endif
+#else
+  AT_ERROR("PyTorch built without NCCL support");
+#endif
+}
+
+
 } // namespace nccl
 } // namespace cuda
 } // namespace torch

torch/csrc/cuda/nccl.h

Lines changed: 7 additions & 0 deletions
@@ -150,6 +150,13 @@ TORCH_CUDA_CPP_API void all_gather(
     const stream_list& streams = {},
     const comm_list& user_comms = {});
 
+TORCH_CUDA_CPP_API void gather(
+    const at::Tensor& inputs,
+    std::vector<at::Tensor>& outputs,
+    ncclComm_t comm,
+    at::cuda::CUDAStream& stream,
+    int32_t root = 0);
+
 TORCH_CUDA_CPP_API void all2all_single_equal_split(
     at::Tensor& input,
     at::Tensor& output,

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 70 additions & 4 deletions
@@ -2213,10 +2213,76 @@ void ProcessGroupNCCL::groupEnd() {
 }
 
 c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupNCCL::gather(
-    std::vector<std::vector<at::Tensor>>& /* unused */,
-    std::vector<at::Tensor>& /* unused */,
-    const GatherOptions& /* unused */) {
-  TORCH_CHECK(false, "ProcessGroupNCCL does not support gather");
+    std::vector<std::vector<at::Tensor>>& outputTensors,
+    std::vector<at::Tensor>& inputTensors,
+    const GatherOptions& opts) {
+  static auto invalidArgument = [](const std::string& msg) {
+    TORCH_CHECK(false, "ProcessGroupNCCL::gather: " + msg);
+  };
+
+  assertRootRank(invalidArgument, opts.rootRank, size_);
+  check_gpu_tensors_different_devices(inputTensors);
+  assertSingleElementInput(invalidArgument, inputTensors);
+
+  // @lint-ignore CLANGTIDY
+  auto tensor = inputTensors.back();
+  RECORD_PARAM_COMMS(
+      rank_, // rank
+      "gather", // colName
+      tensor.numel(), // inSize
+      tensor.numel() *
+          this->getSize(), // outSize
+      tensor.scalar_type(), // dType
+      std::vector<int64_t>(), // inSplitSizes
+      std::vector<int64_t>()); // outSplitSize
+
+  std::vector<at::Tensor> outputs;
+
+  if (getRank() == opts.rootRank) {
+    if (outputTensors.size() != 1) {
+      std::stringstream ss;
+      ss << "requires a single-element output list containing a list with "
+         << getSize() << " tensors.";
+      invalidArgument(ss.str());
+    } else if (outputTensors[0].size() != static_cast<size_t>(getSize())) {
+      std::stringstream ss;
+      ss << "Incorrect output list size " << outputTensors[0].size()
+         << ". Output list size should be " << getSize()
+         << ", same as size of the process group.";
+      invalidArgument(ss.str());
+    }
+
+    const auto& options = inputTensors[0].options();
+    const auto& sizes = inputTensors[0].sizes();
+    assertTypeAndSizesMatch(invalidArgument, outputTensors[0], options, sizes);
+    outputs = outputTensors[0];
+  } else {
+    // if not in the root rank, initialize outputs as empty list
+    if (outputTensors.size() != 0) {
+      invalidArgument("requires empty output on non-root");
+    }
+    outputs = {};
+  }
+
+  return collective(
+      inputTensors,
+      outputs,
+      [&](at::Tensor& /* unused */,
+          at::Tensor& /* unused */,
+          ncclComm_t comm,
+          at::cuda::CUDAStream& stream) {
+        const auto root = opts.rootRank;
+        if (getRank() == root) {
+          for(auto output: outputs) {
+            c10::cuda::CUDACachingAllocator::recordStream(
+                output.storage().data_ptr(), stream);
+          }
+        }
+        torch::cuda::nccl::gather(inputTensors[0], outputs, comm, stream, root);
+        return ncclSuccess;
+      },
+      OpType::GATHER,
+      "nccl:gather");
 }
 
 c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupNCCL::scatter(

torch/testing/_internal/common_distributed.py

Lines changed: 0 additions & 1 deletion
@@ -69,7 +69,6 @@ class DistTestCases:
     # Backends that do not support a specific collective
     skip_collective = {}
     skip_collective["allgather_coalesced"] = {"nccl", "mpi"}
-    skip_collective["gather"] = {"nccl"}
    skip_collective["scatter"] = {"nccl"}
     skip_collective["reduce"] = set()
     skip_collective["sendrecv anysource"] = {"nccl"}

0 commit comments
