Avoid watchdog polling during graph capture in ROCm

chinmaydk99 · AMD · commit 23503e4a7df7 · 2026-03-12T14:31:25.000Z
diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp
@@ -22,6 +22,17 @@ static bool _cuda_graphs_debug = false;
 static std::mutex _currently_capturing_graphs_mutex;
 static ska::flat_hash_map<CaptureId_t, CUDAGraph*> _currently_capturing_graphs;
 
+
+#if defined(USE_ROCM)
+// Returns true when at least one CUDAGraph capture is currently active in this
+// process. Uses the same mutex-protected capture map as capture lifecycle
+// bookkeeping.
+bool is_graph_capture_active() {
+  std::unique_lock<std::mutex> lock(_currently_capturing_graphs_mutex);
+  return !_currently_capturing_graphs.empty();
+}
+#endif // defined(USE_ROCM)
+
 MempoolId_t graph_pool_handle() {
   // Sets just the second value, to distinguish it from MempoolId_ts created from
   // cudaStreamGetCaptureInfo id_s in capture_begin.
@@ -140,15 +151,18 @@ void CUDAGraph::capture_end() {
   TORCH_CHECK(stream.stream() == capture_stream_.stream(),
               "Capture must end on the same stream it began on.");
 
-  AT_CUDA_CHECK(cudaStreamEndCapture(capture_stream_, &graph_));
-
+  // Capture is over once cudaStreamEndCapture returns (success or failure).
+  // Clear bookkeeping before propagating the return status so watchdog-side
+  // checks cannot observe stale "capture active" state on error paths.
+  cudaError_t endCaptureErr = cudaStreamEndCapture(capture_stream_, &graph_);
   {
     std::unique_lock<std::mutex> lock(_currently_capturing_graphs_mutex);
     TORCH_CHECK(
         _currently_capturing_graphs.count(capture_id_),
         "capture_end() called before capture_begin().");
     _currently_capturing_graphs.erase(capture_id_);
   }
+  AT_CUDA_CHECK(endCaptureErr);
 
   c10::cuda::CUDACachingAllocator::endAllocateToPool(capture_dev_, mempool_id_);
   at::getHostAllocator(at::kCUDA)->end_allocate_to_pool(mempool_id_);
diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h
@@ -30,6 +30,16 @@ namespace cuda {
 // to CUDAGraph::capture_begin
 TORCH_CUDA_CPP_API MempoolId_t graph_pool_handle();
 
+// Returns true if any CUDAGraph capture is currently active in this process.
+// Used by ProcessGroupNCCL's ROCm watchdog workaround to avoid calling
+// hipEventQuery during active capture on HIP runtimes without the
+// event-query capture-mode fix (https://github.com/ROCm/clr/pull/3176).
+// Not needed on CUDA/NVIDIA where cross-thread event query does not have this
+// restriction.
+#if defined(USE_ROCM)
+TORCH_CUDA_CPP_API bool is_graph_capture_active();
+#endif // defined(USE_ROCM)
+
 struct TORCH_CUDA_CPP_API CUDAGraph {
   CUDAGraph(bool keep_graph=false);
   ~CUDAGraph();
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -11,8 +11,10 @@
 #include <utility>
 
 #include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAGraph.h>
 #include <c10/core/DeviceType.h>
 #include <c10/cuda/CUDAAllocatorConfig.h>
+#include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/Exception.h>
@@ -221,6 +223,65 @@ std::string getExceptionMsgFromExceptionPtr(
   }
 }
 
+#ifdef USE_ROCM
+// Indicates that we're in the watchdog's event-query phase. This allows ROCm
+// workaround behavior to be applied only to watchdog-side queries, while
+// preserving existing behavior for user/main-thread `WorkNCCL::isCompleted()`
+// and `wait()` calls.
+thread_local bool g_in_rocm_watchdog_event_query_context = false;
+
+struct RocmWatchdogEventQueryContextGuard {
+  RocmWatchdogEventQueryContextGuard()
+      : previous_(g_in_rocm_watchdog_event_query_context) {
+    g_in_rocm_watchdog_event_query_context = true;
+  }
+  ~RocmWatchdogEventQueryContextGuard() {
+    g_in_rocm_watchdog_event_query_context = previous_;
+  }
+
+ private:
+  bool previous_;
+};
+#endif // USE_ROCM
+
+#ifdef USE_ROCM
+// Watchdog-side cudaEventQuery workaround for HIP runtimes without the
+// capture-mode fix.
+// TODO: Remove once all supported runtimes include
+// https://github.com/ROCm/rocm-systems/pull/3176
+bool queryEventWithRocmWatchdogCaptureWorkaround(
+    const std::shared_ptr<at::cuda::CUDAEvent>& event) {
+  if (!event->isCreated()) {
+    return true;
+  }
+
+  // Must unconditionally return false here during watchdog + active capture:
+  // on affected HIP runtimes, even calling cudaEventQuery from the watchdog
+  // thread while another thread has GLOBAL capture active can invalidate that
+  // capture and cause downstream failures. Skip the query entirely and report
+  // "not complete yet"; the watchdog will re-poll once capture ends. Timeout
+  // enforcement is also deferred during this window (see the
+  // is_graph_capture_active() gate in the watchdog loop).
+  if (g_in_rocm_watchdog_event_query_context &&
+      at::cuda::is_graph_capture_active()) {
+    return false;
+  }
+
+  const cudaError_t err =
+      C10_CUDA_ERROR_HANDLED(cudaEventQuery(event->event()));
+  if (err == cudaSuccess) {
+    return true;
+  } else if (err != cudaErrorNotReady) {
+    C10_CUDA_CHECK(err);
+  } else {
+    // ignore and clear the error if not ready
+    (void)cudaGetLastError();
+  }
+
+  return false;
+}
+#endif // USE_ROCM
+
 inline void errorIfCapturingNonCapturableNCCL(c10::cuda::CaptureStatus status) {
   // parentheses avoid some compiler warnings
   static const uint64_t min_version =
@@ -644,7 +705,11 @@ bool ProcessGroupNCCL::WorkNCCL::startedGPUExecutionInternal() const {
     return false;
   }
   // Checking the work's corresponding CUDA event's status
+#ifdef USE_ROCM
+  if (!queryEventWithRocmWatchdogCaptureWorkaround(ncclStartEvent_)) {
+#else
   if (!ncclStartEvent_->query()) {
+#endif
     return false;
   }
   return true;
@@ -657,7 +722,11 @@ bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const {
   // hang if another thread is holding the CUDA global context lock. For
   // example, when doing a `cudaDeviceSynchronize` or even
   // `cudaStreamSynchronize`.
+#ifdef USE_ROCM
+  if (!queryEventWithRocmWatchdogCaptureWorkaround(ncclEndEvent_)) {
+#else
   if (!ncclEndEvent_->query()) {
+#endif
     return false;
   }
   return true;
@@ -2291,9 +2360,23 @@ void ProcessGroupNCCL::Watchdog::runLoop() {
         }
       }
 
-      // Then check if work has timed out
-      // Skip if work has encountered an error
-      bool timedout = !work.exception() && work.checkTimeout();
+      // Then check if work has timed out.
+      // Skip if work has encountered an error.
+
+      bool timedout = false;
+#ifdef USE_ROCM
+      // On ROCm, watchdog event queries may be intentionally skipped during
+      // active graph capture to avoid HIP runtime capture invalidation.
+      // In that window, timeout checks can report false positives for
+      // otherwise-complete work, so we defer timeout enforcement.
+      // TODO: Remove once all supported HIP runtimes include:
+      // https://github.com/ROCm/clr/pull/3176
+      if (!at::cuda::is_graph_capture_active()) {
+        timedout = !work.exception() && work.checkTimeout();
+      }
+#else
+      timedout = !work.exception() && work.checkTimeout();
+#endif
 
       // Report desync state in case of timeout (if TORCH_NCCL_DESYNC_DEBUG is
       // turned on; otherwise, run() is no-op)
@@ -2358,7 +2441,11 @@ void ProcessGroupNCCL::Watchdog::runLoop() {
       // allow watchdog to do an event query on a side thread
       at::cuda::CUDAGuard device_guard(work.ncclEndEvent_->device_index());
       at::cuda::CUDAStreamCaptureModeGuard g{cudaStreamCaptureModeThreadLocal};
-
+#ifdef USE_ROCM
+      // Mark this thread/scope as watchdog event-query context so the ROCm
+      // workaround applies only here (not to main-thread wait()/isCompleted()).
+      RocmWatchdogEventQueryContextGuard watchdog_event_query_context_guard;
+#endif
       // a work could be started but not completed, so we should not update
       // lastStartedSeq and lastStartedOpName if the work state is checked
       // multiple times after the start