Use Intel x86 intrinsics for half/float conversion (#1789)

Michael Gschwind · facebook-github-bot · commit deb24a849aad · 2024-01-31T18:43:51.000-08:00
Summary: Pull Request resolved: #1789 Use Intel x86 intrinsics for half/float conversion with GCC and clang Reviewed By: cpuhrsch Differential Revision: D53283798 fbshipit-source-id: a2365485258304b6b0b67b22d766ede7d4ba6994
diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h
@@ -12,6 +12,16 @@
 #include <cstdint>
 #include <cstring>
 
+#if defined(__GNUC__) || defined(__clang__)
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \
+    defined(_M_IX86)
+#if defined(__AVX2__)
+#define X86_F16 1
+#include <immintrin.h> // import conversion ops from f16cintrin.h
+#endif // __AVX2__
+#endif // __x86_64__ || _M_X64 || __i386 || _M_IX86
+#endif // __GNUC__ || __clang__
+
 namespace torch {
 namespace executor {
 
@@ -165,6 +175,10 @@ inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) {
  * between integer and floating-point variables.
  */
 inline float fp16_ieee_to_fp32_value(uint16_t h) {
+#ifdef X86_F16
+  return _cvtsh_ss(h);
+#else
+
   /*
    * Extend the half-precision floating-point number to 32 bits and shift to the
    * upper part of the 32-bit word:
@@ -287,6 +301,8 @@ inline float fp16_ieee_to_fp32_value(uint16_t h) {
       (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value)
                                    : fp32_to_bits(normalized_value));
   return fp32_from_bits(result);
+
+#endif // not X86_F16
 }
 
 /*
@@ -299,6 +315,10 @@ inline float fp16_ieee_to_fp32_value(uint16_t h) {
  * between integer and floating-point variables.
  */
 inline uint16_t fp16_ieee_from_fp32_value(float f) {
+#ifdef X86_F16
+  return _cvtss_sh(f, _MM_FROUND_TO_NEAREST_INT);
+#else
+
   // const float scale_to_inf = 0x1.0p+112f;
   // const float scale_to_zero = 0x1.0p-110f;
   constexpr uint32_t scale_to_inf_bits = (uint32_t)239 << 23;
@@ -332,6 +352,7 @@ inline uint16_t fp16_ieee_from_fp32_value(float f) {
   return static_cast<uint16_t>(
       (sign >> 16) |
       (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign));
+#endif // not X86_F16
 }
 
 } // namespace internal