Fix overflow in torch.remainder when dividend is very large (#37758)

xwang233 · facebook-github-bot · commit 63b1ae69831c · 2020-05-08T16:46:55.000-07:00
Summary: This fixes the GPU implementation of `torch.remainder` for the problems reported in #37743 and #24861. Please also check my [comment](#37743 (comment)). The fixed `remainder_kernel` follows the same approach as the NumPy implementation; see https://github.com/numpy/numpy/blob/79d7bc276afbe89c746e462d28d4bfbb4fc56148/numpy/core/src/npymath/npy_math_internal.h.src#L649-L658 I also slightly updated the doc for `torch.remainder` to make it consistent with the doc for `torch.fmod`. I'm not sure how to modify the Vec256 code of the CPU `remainder_kernel`, so I left it unchanged. Pull Request resolved: #37758 Differential Revision: D21388417 Pulled By: ngimel fbshipit-source-id: 770ba5801cf34619b2b68b8b0cf95d8cfa52e6f6
diff --git a/aten/src/ATen/native/cuda/BinaryArithmeticKernel.cu b/aten/src/ATen/native/cuda/BinaryArithmeticKernel.cu
@@ -77,7 +77,9 @@ void remainder_kernel_cuda(TensorIterator& iter) {
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "remainder_cuda", [&]() {
       gpu_kernel_with_scalars(iter,
         []GPU_LAMBDA(scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
-          return a - b * static_cast<scalar_t>(std::floor(a / b));
+          auto mod = ::fmod(a, b);
+          if ((mod != 0) && ((b < 0) != (mod < 0))) mod += b;
+          return mod;
         });
     });
   }
diff --git a/test/test_torch.py b/test/test_torch.py
@@ -15449,6 +15449,27 @@ def test_remainder(self, device, dtype):
                 long_res1 = long_m1.clone()
                 long_res1.remainder_(long_qs.unsqueeze(0).expand_as(long_res1))
 
+    # TODO: remove @onlyCUDA once the CPU implementation of remainder_kernel is fixed
+    @onlyCUDA
+    @dtypes(torch.float, torch.double)
+    def test_remainder_fmod_large_dividend(self, device, dtype):
+        alarge = 1e9
+        pi = 3.14159265358979
+        for avalue in [alarge, -alarge]:
+            for bvalue in [pi, -pi]:
+                a = torch.tensor([avalue], dtype=dtype, device=device)
+                b = torch.tensor([bvalue], dtype=dtype, device=device)
+                c = torch.remainder(a, b)
+                d = torch.fmod(a, b)
+                self.assertTrue((b[0] > 0) == (c[0] > 0))  # remainder has same sign as divisor
+                self.assertTrue((a[0] > 0) == (d[0] > 0))  # fmod has same sign as dividend
+                self.assertTrue(abs(c[0]) < abs(b[0]))     # remainder is within range of divisor
+                self.assertTrue(abs(d[0]) < abs(b[0]))     # fmod is within range of divisor
+                if ((a[0] > 0) == (b[0] > 0)):
+                    self.assertTrue(c[0] == d[0])   # remainder is same as fmod
+                else:
+                    self.assertTrue(abs(c[0] - d[0]) == abs(b[0]))  # differ by one divisor
+
     @dtypes(torch.int64, torch.float64)
     def test_remainder_edge_cases(self, device, dtype):
         # Test variations of negative values used as input
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
@@ -4902,8 +4902,8 @@ def merge_dicts(*dicts):
 
 Computes the element-wise remainder of division.
 
-The divisor and dividend may contain both for integer and floating point
-numbers. The remainder has the same sign as the divisor.
+The dividend and divisor may contain both integer and floating point
+numbers. The remainder has the same sign as the divisor :attr:`other`.
 
 When :attr:`other` is a tensor, the shapes of :attr:`input` and
 :attr:`other` must be :ref:`broadcastable <broadcasting-semantics>`.