Skip to content

Commit d33a5e2

Browse files
jeffdaily authored and pytorchmergebot committed
[ROCm] fastSpecializedAtomicAdd for MI300 (pytorch#135770)
MI300 adds HW support for packed bfloat16 and fp16. Enable via existing fastSpecializedAtomicAdd. Pull Request resolved: pytorch#135770 Approved by: https://github.com/xw285cornell, https://github.com/jianyuh
1 parent c9653bf commit d33a5e2

1 file changed

Lines changed: 89 additions & 8 deletions

File tree

aten/src/ATen/native/cuda/KernelUtils.cuh

Lines changed: 89 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,74 @@
55
#include <cuda_bf16.h>
66
#endif
77

8+
// ROCm 6.3 is planned to have these functions, but until then here they are.
9+
#if defined(USE_ROCM) && ROCM_VERSION >= 60201
10+
#include <hip/hip_bf16.h>
11+
#include <hip/hip_fp16.h>
12+
13+
// Packed bfloat16x2 atomic add (preview of the ROCm 6.3 unsafeAtomicAdd).
// On gfx940/941/942 with the compiler builtin available, both 16-bit lanes
// are added by a single HW instruction; on every other target the add is
// emulated with a relaxed 32-bit compare-exchange loop at agent scope.
// Returns the packed value that was stored at *address before the add.
__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
  // The builtin operates on an ext_vector_type(2) of unsigned short;
  // type-pun the raw bf16x2 payload through a union.
  typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
  static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
  union pun_t {
    __hip_bfloat162_raw raw;
    vec_short2 vec;
  } pun{static_cast<__hip_bfloat162_raw>(value)};
  pun.vec = __builtin_amdgcn_flat_atomic_fadd_v2bf16(
      reinterpret_cast<vec_short2*>(address), pun.vec);
  return static_cast<__hip_bfloat162>(pun.raw);
#else
  // CAS-loop fallback: reinterpret the bf16x2 pair as one 32-bit word.
  static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw));
  union bits_t {
    __hip_bfloat162_raw val;
    unsigned int bits;
  };
  bits_t observed, desired;
  observed.bits = __hip_atomic_load(
      reinterpret_cast<unsigned int*>(address),
      __ATOMIC_RELAXED,
      __HIP_MEMORY_SCOPE_AGENT);
  bool swapped = false;
  while (!swapped) {
    desired.val = __hadd2(observed.val, value);
    // On failure, observed.bits is refreshed with the current contents of
    // *address, so the next iteration recomputes the sum from fresh data.
    swapped = __hip_atomic_compare_exchange_strong(
        reinterpret_cast<unsigned int*>(address),
        &observed.bits,
        desired.bits,
        __ATOMIC_RELAXED,
        __ATOMIC_RELAXED,
        __HIP_MEMORY_SCOPE_AGENT);
  }
  return observed.val;
#endif
}
40+
41+
// Packed fp16x2 atomic add (preview of the ROCm 6.3 unsafeAtomicAdd).
// On gfx940/941/942 with the compiler builtin available, both half lanes
// are added by a single HW instruction; otherwise the add is emulated with
// a relaxed 32-bit compare-exchange loop at agent scope.
// Returns the packed value that was stored at *address before the add.
__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) {
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16)
  // The builtin operates on an ext_vector_type(2) of _Float16;
  // type-pun the raw half2 payload through a union.
  typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162;
  static_assert(sizeof(vec_fp162) == sizeof(__half2_raw));
  union pun_t {
    __half2_raw raw;
    vec_fp162 vec;
  } pun{static_cast<__half2_raw>(value)};
  pun.vec = __builtin_amdgcn_flat_atomic_fadd_v2f16(
      reinterpret_cast<vec_fp162*>(address), pun.vec);
  return static_cast<__half2>(pun.raw);
#else
  // CAS-loop fallback: reinterpret the half2 pair as one 32-bit word.
  static_assert(sizeof(__half2_raw) == sizeof(unsigned int));
  union bits_t {
    __half2_raw val;
    unsigned int bits;
  };
  bits_t observed, desired;
  observed.bits = __hip_atomic_load(
      reinterpret_cast<unsigned int*>(address),
      __ATOMIC_RELAXED,
      __HIP_MEMORY_SCOPE_AGENT);
  bool swapped = false;
  while (!swapped) {
    desired.val = __hadd2(observed.val, value);
    // On failure, observed.bits is refreshed with the current contents of
    // *address, so the next iteration recomputes the sum from fresh data.
    swapped = __hip_atomic_compare_exchange_strong(
        reinterpret_cast<unsigned int*>(address),
        &observed.bits,
        desired.bits,
        __ATOMIC_RELAXED,
        __ATOMIC_RELAXED,
        __HIP_MEMORY_SCOPE_AGENT);
  }
  return observed.val;
#endif
}
69+
#define ATOMICADD preview_unsafeAtomicAdd
70+
#define NATIVE_ZERO_BF16 __float2bfloat16(0.0f)
71+
#else
72+
#define ATOMICADD atomicAdd
73+
#define NATIVE_ZERO_BF16 __int2bfloat16_rz(0)
74+
#endif
75+
876
namespace at::native {
977

1078
__device__ __forceinline__ size_t
@@ -47,7 +115,7 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
47115
const index_t numel,
48116
scalar_t value) {
49117
#if ( \
50-
(defined(USE_ROCM)) || \
118+
(defined(USE_ROCM) && ROCM_VERSION < 60201) || \
51119
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
52120
gpuAtomicAddNoReturn(
53121
reinterpret_cast<at::Half*>(tensor) + index,
@@ -61,17 +129,22 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
61129
__half2 value2;
62130
value2.x = static_cast<__half>(value);
63131
value2.y = __int2half_rz(0);
64-
atomicAdd(reinterpret_cast<__half2*>(target_addr), value2);
132+
ATOMICADD(reinterpret_cast<__half2*>(target_addr), value2);
65133

66134
} else if (!low_byte && index > 0) {
67135
__half2 value2;
68136
value2.x = __int2half_rz(0);
69137
value2.y = static_cast<__half>(value);
70-
atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2);
138+
ATOMICADD(reinterpret_cast<__half2*>(target_addr - 1), value2);
71139

72140
} else {
141+
#ifdef USE_ROCM
142+
gpuAtomicAddNoReturn(
143+
reinterpret_cast<at::Half*>(tensor) + index, static_cast<at::Half>(value));
144+
#else
73145
atomicAdd(
74146
reinterpret_cast<__half*>(tensor) + index, static_cast<__half>(value));
147+
#endif
75148
}
76149
#endif
77150
}
@@ -87,7 +160,7 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
87160
const index_t numel,
88161
scalar_t value) {
89162
#if ( \
90-
(defined(USE_ROCM)) || \
163+
(defined(USE_ROCM) && ROCM_VERSION < 60201) || \
91164
(defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
92165
gpuAtomicAddNoReturn(
93166
reinterpret_cast<at::BFloat16*>(tensor) + index,
@@ -100,18 +173,23 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
100173
if (low_byte && index < (numel - 1)) {
101174
__nv_bfloat162 value2;
102175
value2.x = *reinterpret_cast<__nv_bfloat16*>(&value);
103-
value2.y = __int2bfloat16_rz(0);
104-
atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr), value2);
176+
value2.y = NATIVE_ZERO_BF16;
177+
ATOMICADD(reinterpret_cast<__nv_bfloat162*>(target_addr), value2);
105178

106179
} else if (!low_byte && index > 0) {
107180
__nv_bfloat162 value2;
108-
value2.x = __int2bfloat16_rz(0);
181+
value2.x = NATIVE_ZERO_BF16;
109182
value2.y = *reinterpret_cast<__nv_bfloat16*>(&value);
110-
atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2);
183+
ATOMICADD(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2);
111184

112185
} else {
186+
#ifdef USE_ROCM
187+
gpuAtomicAddNoReturn(
188+
reinterpret_cast<at::BFloat16*>(tensor) + index, static_cast<at::BFloat16>(value));
189+
#else
113190
atomicAdd(
114191
reinterpret_cast<__nv_bfloat16*>(tensor) + index, *reinterpret_cast<__nv_bfloat16*>(&value));
192+
#endif
115193
}
116194
#endif
117195
}
@@ -144,4 +222,7 @@ __device__ __forceinline__ void fastAtomicAdd(
144222
}
145223
}
146224

225+
#undef ATOMICADD
226+
#undef NATIVE_ZERO_BF16
227+
147228
} // namespace at::native

0 commit comments

Comments
 (0)