
Commit 7eade66

crcrpar authored and facebook-github-bot committed
[PyTorch] Reduce errors of foreach functions (#56993)
Summary:
This is based on #48224. To make `foreach` more flexible, this PR pushes unsupported cases to the slow path. It also adds tests to verify that:
- `foreach` functions work with tensors of different dtypes and/or memory layouts (7bd4b2c)
- `foreach` functions work with tensors on different devices within a list, as long as corresponding tensors (those sharing an index across lists) are on the same device (def4b9b)

Future plans:
1. Improve unit test coverage by using the `ops` decorator, updating `foreach_unary_op_db`, and creating `foreach_(binary|pointwise|minmax)_db`.
2. Support broadcasting in the slow path. Ref: #52448
3. Support type promotion in the fast path. Ref: #52449

CC: ngimel mcarilli ptrblck

Pull Request resolved: #56993
Reviewed By: zou3519
Differential Revision: D28630580
Pulled By: ngimel
fbshipit-source-id: e26ee74a39a591025e18c1ead48948cb7ec53c19
1 parent 8a28bbe commit 7eade66

6 files changed

Lines changed: 176 additions & 64 deletions
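As a rough, user-level sketch of what this change enables (not part of the diff; assumes a CUDA build of PyTorch at this revision, where `torch._foreach_add` is the exposed binary foreach op):

    import torch

    # A tensor list mixing dtypes used to fail the up-front dtype check;
    # after this change it is routed to the slow (per-tensor) path instead.
    xs = [torch.ones(3, device="cuda", dtype=torch.float32),
          torch.ones(3, device="cuda", dtype=torch.float16)]
    ys = [torch.ones(3, device="cuda", dtype=torch.float32),
          torch.ones(3, device="cuda", dtype=torch.float16)]

    out = torch._foreach_add(xs, ys)  # no error; each pair handled per tensor
    assert [t.dtype for t in out] == [torch.float32, torch.float16]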


aten/src/ATen/native/ForeachUtils.h

Lines changed: 32 additions & 24 deletions
@@ -6,17 +6,23 @@
 namespace at {
 namespace native {
 namespace {
+// Check if the tensor list has either a boolean tensor or an integer tensor
+bool has_integral_tensor(TensorList tensors, const bool includeBool) {
+  return std::any_of(tensors.begin(), tensors.end(),
+    [&includeBool](const auto & t) { return at::isIntegralType(t.scalar_type(), includeBool); });
+}
+// Check if the tensor list has bool tensors
+bool has_bool_tensor(TensorList tensors) {
+  return std::any_of(tensors.begin(), tensors.end(),
+    [](const auto & t) -> bool { return t.scalar_type() == ScalarType::Bool; });
+}
+
 // Check foreach API restrictions
 // - Tensor lists must be non-empty.
-// - All tensors in all lists must have the same dtype.
 // - All TensorLists and ScalarLists must have the same number of elements.
 // - Corresponding tensors must have the same size.
 void check_foreach_api_restrictions(TensorList tensors) {
   TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor.");
-  auto expected_dtype = tensors[0].dtype();
-  for (const auto& t : tensors) {
-    TORCH_CHECK(t.dtype() == expected_dtype, "All tensors in the tensor list must have the same dtype.");
-  }
 }
 
 void check_foreach_api_restrictions(TensorList tensors, ArrayRef<Scalar> scalars) {
@@ -29,11 +35,7 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) {
   TORCH_CHECK(tensors2.size() > 0, "Tensor list must have at least one tensor.");
   TORCH_CHECK(tensors1.size() == tensors2.size(), "Tensor lists must have the same number of tensors, got ", tensors1.size(), " and ", tensors2.size());
 
-  auto expected_dtype = tensors1[0].dtype();
-
   for (const auto i : c10::irange(tensors1.size())) {
-    TORCH_CHECK(tensors1[i].dtype() == expected_dtype, "All tensors in the tensor list must have the same dtype.");
-    TORCH_CHECK(tensors2[i].dtype() == expected_dtype, "All tensors in the tensor list must have the same dtype.");
     TORCH_CHECK(tensors1[i].sizes() == tensors2[i].sizes(), "Corresponding tensors in lists must have the same size, got ", tensors1[i].sizes(), " and ", tensors2[i].sizes());
   }
 }
@@ -45,11 +47,7 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2, Te
   TORCH_CHECK(tensors1.size() == tensors2.size(), "Tensor lists must have the same number of tensors, got ", tensors1.size(), " and ", tensors2.size());
   TORCH_CHECK(tensors1.size() == tensors3.size(), "Tensor lists must have the same number of tensors, got ", tensors1.size(), " and ", tensors3.size());
 
-  auto expected_dtype = tensors1[0].dtype();
-
   for (const auto i : c10::irange(tensors1.size())) {
-    TORCH_CHECK(tensors1[i].dtype() == expected_dtype, "All tensors in the tensor list must have the same dtype.");
-    TORCH_CHECK(tensors2[i].dtype() == expected_dtype, "All tensors in the tensor list must have the same dtype.");
     TORCH_CHECK(tensors1[i].sizes() == tensors2[i].sizes(), "Corresponding tensors in lists must have the same size, got ", tensors1[i].sizes(), " and ", tensors2[i].sizes());
     TORCH_CHECK(tensors1[i].sizes() == tensors3[i].sizes(), "Corresponding tensors in lists must have the same size, got ", tensors1[i].sizes(), " and ", tensors3[i].sizes());
   }
@@ -61,20 +59,24 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2, Te
 }
 
 // To go via 'fast' path, several conditions must be satisfied
+// - All tensors in all lists must have the same dtype.
 // - All tensors must be on the same device
 // - All tensors must have strided layout
 // - All tensors must be non-overlapping and dense
 // - Resulting tensor must have the same dtype as the input one
 
+// TODO(mkozuki): Consider whether we really need this function or not.
+// Note that there is a possibility that the foreach fast path supports type promotion in the future,
+// which might complicate the functionality this function should provide.
+// However, as of now, the check of division ops with integer inputs is duplicated:
+// `check_fast_path_restrictions` does the same thing before calling this function.
 bool will_promote_tensor(const Tensor& tensor, const Scalar& scalar, bool does_op_promote_integer_inputs_to_float = false) {
   // In case of division, integer inputs will result in float
-  if (does_op_promote_integer_inputs_to_float) {
-    if (at::isIntegralType(tensor.scalar_type(), /*includeBool*/ true)) {
-      return true;
-    }
+  if (does_op_promote_integer_inputs_to_float &&
+      at::isIntegralType(tensor.scalar_type(), /* includeBool */ true)) {
+    return true;
   }
-  auto result_dtype = at::result_type(tensor, scalar);
-  return result_dtype != tensor.scalar_type();
+  return tensor.scalar_type() != at::native::result_type(scalar, tensor);
 }
 
 // Please, make sure to call check_foreach_api_restrictions before calling this method.
@@ -83,10 +85,12 @@ bool check_fast_path_restrictions(
     ArrayRef<TensorList> tensorLists,
     ArrayRef<Scalar> scalarList = {},
     bool does_op_promote_integer_inputs_to_float = false) {
-  auto expected_device = tensorLists[0][0].device();
+  const auto expected_dtype = tensorLists[0][0].dtype();
+  const auto expected_device = tensorLists[0][0].device();
 
   auto is_tensor_okay = [&](const Tensor& tensor) {
-    return tensor.device() == expected_device &&
+    return tensor.dtype() == expected_dtype &&
+           tensor.device() == expected_device &&
            tensor.layout() == at::kStrided &&
            tensor.is_non_overlapping_and_dense();
   };
@@ -108,9 +112,11 @@ bool check_fast_path_restrictions(
     }
   }
 
-  // For all j, tensorList[j][0] have the same shape and dtype. (this was a precondition
-  // checked by `check_foreach_api_restrictions`). This means we only need to check if
-  // {tensorList[0][0], tensorList[0][1], tensorList[0][2], ...} do type promotion with scalarList.
+  // This function has already checked that `tensorList[j][i]` has the same dtype for all j, i
+  // using the `is_tensor_okay` lambda above
+  // (shape equality was checked earlier by `check_foreach_api_restrictions`).
+  // This means we only need to check if {tensorList[0][0], tensorList[0][1], tensorList[0][2], ...}
+  // do type promotion with scalarList.
   for (int i=0; i < tensorLists[0].size(); i++) {
     if (does_op_promote_integer_inputs_to_float) {
       if (at::isIntegralType(tensorLists[0][i].scalar_type(), /*includeBool*/ true)) {
@@ -123,6 +129,8 @@ bool check_fast_path_restrictions(
         return false;
       }
     } else if (scalarList.size() > 1) {
+      // FIXME(mkozuki): Consider specializing `TensorListScalarListMetadata` for complex dtypes
+      // to address the following comment.
       // Complex scalar list is not supported due to the limit for kernel launch argument (4KB)
       if (scalarList[i].isComplex()) {
        return false;
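The `will_promote_tensor` helper above has a direct Python analogue via `torch.result_type`. A minimal sketch (a hypothetical re-implementation for illustration, not code from this commit; `op_promotes_int_to_float` mirrors `does_op_promote_integer_inputs_to_float`):

    import torch

    def will_promote_tensor(tensor, scalar, op_promotes_int_to_float=False):
        # Division-like ops turn integer (and bool) inputs into floats,
        # so such inputs can never stay on the fast path.
        if op_promotes_int_to_float and not (tensor.dtype.is_floating_point or
                                             tensor.dtype.is_complex):
            return True
        # Otherwise the fast path requires that combining with the scalar
        # leaves the tensor's dtype unchanged.
        return torch.result_type(tensor, scalar) != tensor.dtype

    t = torch.ones(2, dtype=torch.int64)
    print(will_promote_tensor(t, 1.5))  # True: int64 + float scalar -> float32
    print(will_promote_tensor(t, 1))    # False: dtype is preserved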

aten/src/ATen/native/cuda/AmpKernels.cu

Lines changed: 3 additions & 1 deletion
@@ -113,6 +113,7 @@ void _amp_foreach_non_finite_check_and_unscale_cuda_(TensorList scaled_grads,
   // - all scaled_grads are strided
   // - all scaled_grads are non overlapping and dense
   // - all scaled_grads are on the same device
+  // - all scaled_grads are of the same dtype
   TORCH_CHECK(scaled_grads[0].is_cuda(), "scaled_grads must be CUDA tensors.");
   // Sets up MTA launch to use scaled_grads as-is.
   tensor_lists.emplace_back(scaled_grads.vec());
@@ -126,12 +127,13 @@ void _amp_foreach_non_finite_check_and_unscale_cuda_(TensorList scaled_grads,
     tensor_lists.resize(1);
     tensor_lists[0].reserve(scaled_grads.size());
     auto expected_device = scaled_grads[0].device();
+    const auto expected_dtype = scaled_grads[0].scalar_type();
     for (const Tensor& t : scaled_grads) {
       // Ensures GradScaler filtered scaled_grads by device.
       TORCH_CHECK(t.is_cuda(), "one of scaled_grads was not a CUDA tensor.");
       TORCH_CHECK(t.device() == expected_device, "scaled_grads must be on the same device.");
       TORCH_CHECK(t.layout() == at::kStrided, "one of scaled_grads was not a strided tensor.");
-      if (!t.is_non_overlapping_and_dense()) {
+      if (!t.is_non_overlapping_and_dense() || t.scalar_type() != expected_dtype) {
        // t is acceptable but not MTA-safe. Falls back to single-tensor TensorIterator kernel.
        _amp_non_finite_check_and_unscale_cuda_(const_cast<Tensor&>(t),
                                                found_inf,
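In Python terms, the new fallback behaves roughly like this (a sketch mirroring the updated test further below; assumes a CUDA device):

    import torch

    found_inf = torch.zeros((1,), device="cuda")
    inv_scale = torch.full((1,), 0.25, device="cuda")

    # fp32 and fp16 grads in one list: the mismatched-dtype tensor is no
    # longer an error; it is unscaled by the single-tensor fallback kernel.
    grads = [torch.full((4,), 4.0, device="cuda"),
             torch.full((4,), 4.0, device="cuda", dtype=torch.float16)]
    torch._amp_foreach_non_finite_check_and_unscale_(grads, found_inf, inv_scale)
    assert all(torch.allclose(g.float(), torch.ones(4, device="cuda"))
               for g in grads)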

aten/src/ATen/native/cuda/ForeachPointwiseOp.cu

Lines changed: 9 additions & 5 deletions
@@ -105,7 +105,7 @@ std::vector<Tensor> foreach_pointwise_op(TensorList input, TensorList tensors1,
 std::vector<Tensor> foreach_tensor_##NAME##_scalar_cuda(TensorList input, TensorList tensors1, TensorList tensors2, const Scalar& scalar) { \
   check_foreach_api_restrictions(input, tensors1, tensors2); \
 \
-  if (!can_use_fast_route({input, tensors1, tensors2}, scalar)) { \
+  if (!can_use_fast_route({input, tensors1, tensors2}, scalar) || has_integral_tensor(input, /* includeBool */ true)) { \
     return at::native::foreach_tensor_##NAME##_scalar_slow(input, tensors1, tensors2, scalar); \
   } \
 \
@@ -115,7 +115,7 @@ std::vector<Tensor> foreach_tensor_##NAME##_scalar_cuda(TensorList input, Tensor
 void foreach_tensor_##NAME##_scalar_cuda_(TensorList input, TensorList tensors1, TensorList tensors2, const Scalar& scalar) { \
   check_foreach_api_restrictions(input, tensors1, tensors2); \
 \
-  if (!can_use_fast_route({input, tensors1, tensors2}, scalar)) { \
+  if (!can_use_fast_route({input, tensors1, tensors2}, scalar) || has_integral_tensor(input, /* includeBool */ true)) { \
     return at::native::foreach_tensor_##NAME##_scalar_slow_(input, tensors1, tensors2, scalar); \
   } \
 \
@@ -127,7 +127,7 @@ void foreach_tensor_##NAME##_scalar_cuda_(TensorList input, TensorList tensors1,
 std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_cuda(TensorList input, TensorList tensors1, TensorList tensors2, at::ArrayRef<Scalar> scalars) { \
   check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \
 \
-  if (!can_use_fast_route({input, tensors1, tensors2}, scalars)) { \
+  if (!can_use_fast_route({input, tensors1, tensors2}, scalars) || has_integral_tensor(input, /* includeBool */ true)) { \
     return at::native::foreach_tensor_##NAME##_scalarlist_slow(input, tensors1, tensors2, scalars); \
   } \
 \
@@ -137,7 +137,7 @@ std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_cuda(TensorList input, Te
 void foreach_tensor_##NAME##_scalarlist_cuda_(TensorList input, TensorList tensors1, TensorList tensors2, at::ArrayRef<Scalar> scalars) { \
   check_foreach_api_restrictions(input, tensors1, tensors2, scalars); \
 \
-  if (!can_use_fast_route({input, tensors1, tensors2}, scalars)) { \
+  if (!can_use_fast_route({input, tensors1, tensors2}, scalars) || has_integral_tensor(input, /* includeBool */ true)) { \
     return at::native::foreach_tensor_##NAME##_scalarlist_slow_(input, tensors1, tensors2, scalars); \
   } \
 \
@@ -149,10 +149,14 @@ FOREACH_POINTWISE_OP_SCALAR(addcdiv, std::divides);
 FOREACH_POINTWISE_OP_SCALARLIST(addcmul, std::multiplies);
 FOREACH_POINTWISE_OP_SCALARLIST(addcdiv, std::divides);
 
+
+// Why are bool tensors pushed to the slow path?
+// Because `AT_DISPATCH_ALL_TYPES_AND` is used below.
+// TODO(mkozuki): Check whether it's possible to handle bool tensors in the fast path.
 #define FOREACH_MAXIMUM_MINIMUM_OP(NAME, OP) \
 std::vector<Tensor> foreach_tensor_##NAME##_cuda(TensorList tensors1, TensorList tensors2) { \
   check_foreach_api_restrictions(tensors1, tensors2); \
-  if (!can_use_fast_route({tensors1, tensors2})) { \
+  if (!can_use_fast_route({tensors1, tensors2}) || has_bool_tensor(tensors1)) { \
    return at::native::foreach_tensor_##NAME##_slow(tensors1, tensors2); \
  } \
 \
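Sketched in Python (assumes a CUDA device and the behavior at this revision), integral and bool inputs to these ops now reach the slow path rather than hitting the fast-path kernels:

    import torch

    # Integral inputs to pointwise ops take the per-tensor slow path.
    ints = [torch.ones(3, dtype=torch.int64, device="cuda")]
    out = torch._foreach_addcmul(ints, ints, ints, 2)  # 1 + 2 * 1 * 1
    assert out[0].tolist() == [3, 3, 3]

    # Bool tensors skip the fast path because the CUDA kernels are
    # instantiated via AT_DISPATCH_ALL_TYPES_AND without bool; the slow
    # path dispatches to at::maximum per tensor pair.
    a = [torch.tensor([True, False], device="cuda")]
    b = [torch.tensor([False, True], device="cuda")]
    print(torch._foreach_maximum(a, b))  # [tensor([True, True], ...)]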

aten/src/ATen/native/cuda/ForeachUnaryOp.cu

Lines changed: 8 additions & 16 deletions
@@ -133,14 +133,14 @@ struct functor_name { \
 #define OP_CUSTOM_FUNCTOR(function, op_name, functor_name) \
 std::vector<Tensor> foreach_tensor_##op_name##_cuda(TensorList tensors) { \
   check_foreach_api_restrictions(tensors); \
-  if (!can_use_fast_route(tensors)) { \
+  if (!can_use_fast_route(tensors) || has_integral_tensor(tensors, /* includeBool */ true)) { \
     return at::native::foreach_tensor_##op_name##_slow(tensors); \
   } \
   return function<functor_name>(tensors); \
 } \
 void foreach_tensor_##op_name##_cuda_(TensorList tensors) { \
   check_foreach_api_restrictions(tensors); \
-  if (!can_use_fast_route(tensors)) { \
+  if (!can_use_fast_route(tensors) || has_integral_tensor(tensors, /* includeBool */ true)) { \
     return at::native::foreach_tensor_##op_name##_slow_(tensors); \
   } \
 \
@@ -247,13 +247,9 @@ struct Abs {
 
 std::vector<Tensor> foreach_tensor_abs_cuda(TensorList tensors) {
   check_foreach_api_restrictions(tensors);
-  bool has_complex = false;
-  for (auto t : tensors) {
-    if (at::isComplexType(t.scalar_type())) {
-      has_complex = true;
-    }
-  }
-
+  const bool has_complex = std::any_of(
+      tensors.begin(), tensors.end(),
+      [](const auto & t) { return at::isComplexType(t.scalar_type()); });
   if (!can_use_fast_route(tensors) || has_complex) {
     return at::native::foreach_tensor_abs_slow(tensors);
   }
@@ -263,13 +259,9 @@ std::vector<Tensor> foreach_tensor_abs_cuda(TensorList tensors) {
 
 void foreach_tensor_abs_cuda_(TensorList tensors) {
   check_foreach_api_restrictions(tensors);
-  bool has_complex = false;
-  for (auto t : tensors) {
-    if (at::isComplexType(t.scalar_type())) {
-      has_complex = true;
-    }
-  }
-
+  const bool has_complex = std::any_of(
+      tensors.begin(), tensors.end(),
+      [](const auto & t) { return at::isComplexType(t.scalar_type()); });
  if (!can_use_fast_route(tensors) || has_complex) {
    return at::native::foreach_tensor_abs_slow_(tensors);
  }
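For example (a sketch, assuming a CUDA device), integral and complex inputs now go through the per-tensor fallback:

    import torch

    # Integer input to a unary op: the slow path computes at::sqrt per
    # tensor, which promotes int64 to the default float dtype.
    out = torch._foreach_sqrt([torch.tensor([4, 9], device="cuda")])
    print(out[0])  # tensor([2., 3.], device='cuda:0')

    # abs of a complex tensor is real-valued, so it cannot satisfy the
    # same-dtype fast-path restriction and falls back as well.
    z = [torch.tensor([3 + 4j], device="cuda")]
    print(torch._foreach_abs(z))  # [tensor([5.], device='cuda:0')]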

test/test_cuda.py

Lines changed: 9 additions & 6 deletions
@@ -1908,13 +1908,16 @@ def test_grad_scaling_unscale(self, dtype=torch.float):
         for grad in grads:
             self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7))
 
-        # Passing lists with mismatched devices or dtypes to a raw
+        # When passing lists with mismatched dtypes to a raw
+        # _amp_foreach_non_finite_check_and_unscale_ call,
+        # it's expected to fall back to the single-tensor TensorIterator kernel.
+        grads = [g.clone(), g.to(dtype=torch.float16)]
+        torch._amp_foreach_non_finite_check_and_unscale_(grads, found_inf, inv_scale)
+        for grad in grads:
+            self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7))
+
+        # Passing lists with mismatched devices to a raw
         # _amp_foreach_non_finite_check_and_unscale_ call should raise errors.
-        with self.assertRaisesRegex(RuntimeError, r"must have the same dtype"):
-            torch._amp_foreach_non_finite_check_and_unscale_([g.clone(), g.to(dtype=torch.float16)],
-                                                             found_inf,
-                                                             inv_scale)
-
         if TEST_MULTIGPU:
             with self.assertRaisesRegex(RuntimeError, r"Expected all tensors to be on the same device"):
                 torch._amp_foreach_non_finite_check_and_unscale_([g.clone(), g.to(device="cuda:1")],
