
Commit ec277ac

Update on "Make storage access error NotImplementedError"
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Differential Revision: [D27036573](https://our.internmc.facebook.com/intern/diff/D27036573)
[ghstack-poisoned]
2 parents 4b1083e + d408369 commit ec277ac

9 files changed

Lines changed: 258 additions & 180 deletions


aten/src/ATen/native/BinaryOps.h

Lines changed: 9 additions & 0 deletions
@@ -25,6 +25,15 @@ inline void sub_check(const Tensor& self, const Tensor& other) {
               "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead.");
 }
 
+inline void sub_check(const Tensor& self, const Scalar& scalar) {
+  TORCH_CHECK(self.scalar_type() != kBool || !scalar.isBoolean(),
+              "Subtraction, the `-` operator, with two bool tensors is not supported. "
+              "Use the `^` or `logical_xor()` operator instead.");
+  TORCH_CHECK(self.scalar_type() != kBool && !scalar.isBoolean(),
+              "Subtraction, the `-` operator, with a bool tensor is not supported. "
+              "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead.");
+}
+
 using structured_binary_fn_alpha = void(*)(TensorIteratorBase&, Scalar alpha);
 
 using binary_fn_alpha = void(*)(TensorIterator&, Scalar alpha);
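
Note on the new overload: the two checks are ordered so the more specific error fires first. If both the tensor and the scalar are boolean, the first TORCH_CHECK throws the "two bool tensors" message; if exactly one operand is boolean, the first check passes and the second throws. A minimal sketch of the resulting behavior (assumes an ATen build where this internal header is reachable; demo_sub_check is a hypothetical helper, not part of this commit):

#include <ATen/ATen.h>
#include <ATen/native/BinaryOps.h>

void demo_sub_check() {
  at::Tensor int_t  = at::zeros({2}, at::kInt);
  at::Tensor bool_t = at::zeros({2}, at::kBool);
  at::native::sub_check(int_t, at::Scalar(1));        // passes: no bool operand
  // at::native::sub_check(bool_t, at::Scalar(true)); // throws: "two bool tensors"
  // at::native::sub_check(bool_t, at::Scalar(1));    // throws: "a bool tensor"
  // at::native::sub_check(int_t, at::Scalar(true));  // throws: "a bool tensor"
}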

aten/src/ATen/native/ForeachUtils.h

Lines changed: 21 additions & 7 deletions
@@ -64,7 +64,13 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2, Te
 // - All tensors must be non-overlapping and dense
 // - Resulting tensor must have the same dtype as the input one
 
-bool will_promote_tensor(const Tensor& tensor, Scalar scalar) {
+bool will_promote_tensor(const Tensor& tensor, Scalar scalar, bool does_op_promote_integer_inputs_to_float = false) {
+  // In the case of division, integer inputs will result in float.
+  if (does_op_promote_integer_inputs_to_float) {
+    if (at::isIntegralType(tensor.scalar_type(), /*includeBool*/ true)) {
+      return true;
+    }
+  }
   auto result_dtype = at::result_type(tensor, scalar);
   return result_dtype != tensor.scalar_type();
 }
@@ -73,7 +79,8 @@ bool will_promote_tensor(const Tensor& tensor, Scalar scalar) {
 // There is a set of preconditions that have to be satisfied.
 bool check_fast_path_restrictions(
     ArrayRef<TensorList> tensorLists,
-    ArrayRef<Scalar> scalarList = {}) {
+    ArrayRef<Scalar> scalarList = {},
+    bool does_op_promote_integer_inputs_to_float = false) {
   auto expected_device = tensorLists[0][0].device();
 
   auto is_tensor_okay = [&](const Tensor& tensor) {
@@ -103,12 +110,18 @@ bool check_fast_path_restrictions(
   // checked by `check_foreach_api_restrictions`). This means we only need to check if
   // {tensorList[0][0], tensorList[0][1], tensorList[0][2], ...} do type promotion with scalarList.
   for (int i = 0; i < tensorLists[0].size(); i++) {
+    if (does_op_promote_integer_inputs_to_float) {
+      if (at::isIntegralType(tensorLists[0][i].scalar_type(), /*includeBool*/ true)) {
+        return false;
+      }
+    }
     if (scalarList.size() == 1) {
       if (will_promote_tensor(tensorLists[0][i], scalarList[0])) {
         return false;
       }
     } else if (scalarList.size() > 1) {
-      // Complex scalar list is not supported.
+      // A complex scalar list is not supported, due to the 4KB limit on kernel launch arguments.
       if (scalarList[i].isComplex()) {
         return false;
       }
@@ -123,19 +136,20 @@ bool check_fast_path_restrictions(
 }
 
 bool can_use_fast_route(ArrayRef<TensorList> tensorLists,
-                        ArrayRef<Scalar> scalarList = {}) {
+                        ArrayRef<Scalar> scalarList = {},
+                        bool does_op_promote_integer_inputs_to_float = false) {
 #ifdef __HIP_PLATFORM_HCC__
   return false;
 #else
-  return check_fast_path_restrictions(tensorLists, scalarList);
+  return check_fast_path_restrictions(tensorLists, scalarList, does_op_promote_integer_inputs_to_float);
 #endif
 }
 
-bool can_use_fast_route(TensorList tensors1, TensorList tensors2) {
+bool can_use_fast_route(TensorList tensors1, TensorList tensors2, bool does_op_promote_integer_inputs_to_float = false) {
 #ifdef __HIP_PLATFORM_HCC__
   return false;
 #else
-  return can_use_fast_route({tensors1, tensors2}, {});
+  return can_use_fast_route({tensors1, tensors2}, {}, does_op_promote_integer_inputs_to_float);
 #endif
 }
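
Why the extra flag exists: for true division, at::result_type(int_tensor, int_scalar) is still an integral type, but the op produces a float result, so the dtype comparison alone would wrongly admit integer inputs to the fast path, which must write results of the input dtype. A standalone sketch of the test (would_promote is a hypothetical name mirroring will_promote_tensor above):

#include <ATen/ATen.h>

bool would_promote(const at::Tensor& t, const at::Scalar& s,
                   bool does_op_promote_integer_inputs_to_float) {
  // Division turns any integral (or bool) input into a float result.
  if (does_op_promote_integer_inputs_to_float &&
      at::isIntegralType(t.scalar_type(), /*includeBool=*/true)) {
    return true;
  }
  // Otherwise promotion happens only if the scalar widens the tensor dtype.
  return at::result_type(t, s) != t.scalar_type();
}

// would_promote(int_tensor, 2, /*div*/ true)    -> true  (int / 2 yields float)
// would_promote(float_tensor, 2, /*div*/ true)  -> false (float stays float)
// would_promote(int_tensor, 2.5, /*add*/ false) -> true  (int + 2.5 yields float)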

aten/src/ATen/native/cuda/ForeachBinaryOpList.cu

Lines changed: 5 additions & 5 deletions
@@ -49,10 +49,10 @@ void foreach_tensor_list_op_(TensorList tensors1, TensorList tensors2, Scalar al
     });
 }
 
-#define FOREACH_BINARY_OP_LIST(NAME, OP) \
+#define FOREACH_BINARY_OP_LIST(NAME, OP, DIVISION_OP) \
 void foreach_tensor_##NAME##_list_kernel_cuda_(TensorList tensors1, TensorList tensors2) { \
     check_foreach_api_restrictions(tensors1, tensors2); \
-    if (!can_use_fast_route({tensors1, tensors2})) { \
+    if (!can_use_fast_route(tensors1, tensors2, DIVISION_OP)) { \
         return at::native::foreach_tensor_##NAME##_list_kernel_slow_(tensors1, tensors2); \
     } \
@@ -61,7 +61,7 @@ void foreach_tensor_##NAME##_list_kernel_cuda_(TensorList tensors1, TensorList t
 \
 std::vector<Tensor> foreach_tensor_##NAME##_list_kernel_cuda(TensorList tensors1, TensorList tensors2) { \
     check_foreach_api_restrictions(tensors1, tensors2); \
-    if (!can_use_fast_route({tensors1, tensors2})) { \
+    if (!can_use_fast_route(tensors1, tensors2, DIVISION_OP)) { \
         return at::native::foreach_tensor_##NAME##_list_kernel_slow(tensors1, tensors2); \
     } \
@@ -89,7 +89,7 @@ std::vector<Tensor> foreach_tensor_##NAME##_list_kernel_cuda(TensorList tensors1
 
 FOREACH_BINARY_OP_LIST_ALPHA(add, std::plus);
 FOREACH_BINARY_OP_LIST_ALPHA(sub, std::minus);
-FOREACH_BINARY_OP_LIST(mul, std::multiplies);
-FOREACH_BINARY_OP_LIST(div, std::divides);
+FOREACH_BINARY_OP_LIST(mul, std::multiplies, /*division_op*/ false);
+FOREACH_BINARY_OP_LIST(div, std::divides, /*division_op*/ true);
 
 }} // namespace at::native
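
For reference, a rough expansion of FOREACH_BINARY_OP_LIST(div, std::divides, /*division_op*/ true) under the new signature (line-continuation backslashes elided; in-place variant only):

void foreach_tensor_div_list_kernel_cuda_(TensorList tensors1, TensorList tensors2) {
    check_foreach_api_restrictions(tensors1, tensors2);
    // DIVISION_OP == true: integer inputs would promote to float, so they are
    // routed to the slow per-tensor path instead of the fused kernel.
    if (!can_use_fast_route(tensors1, tensors2, /*does_op_promote_integer_inputs_to_float=*/true)) {
        return at::native::foreach_tensor_div_list_kernel_slow_(tensors1, tensors2);
    }
    foreach_tensor_list_op_<std::divides>(tensors1, tensors2);
}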

aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu

Lines changed: 33 additions & 8 deletions
@@ -1,7 +1,7 @@
 #include <ATen/Dispatch.h>
 #include <ATen/native/ForeachUtils.h>
 #include <ATen/native/cuda/ForeachFunctors.cuh>
-
+#include <ATen/native/BinaryOps.h>
 namespace at { namespace native {
 
 template<template<class> class Op>
@@ -46,10 +46,10 @@ void foreach_binary_op_(TensorList tensors, Scalar scalar) {
     });
 }
 
-#define FOREACH_BINARY_OP_SCALAR(NAME, OP) \
+#define FOREACH_BINARY_OP_SCALAR(NAME, OP, DIVISION_OP) \
 void foreach_tensor_##NAME##_scalar_kernel_cuda_(TensorList tensors, Scalar scalar) { \
     check_foreach_api_restrictions(tensors); \
-    if (!can_use_fast_route(tensors, scalar)) { \
+    if (!can_use_fast_route(tensors, scalar, DIVISION_OP)) { \
         return at::native::foreach_tensor_##NAME##_scalar_kernel_slow_(tensors, scalar); \
     } \
@@ -58,16 +58,41 @@ void foreach_tensor_##NAME##_scalar_kernel_cuda_(TensorList tensors, Scalar scal
 \
 std::vector<Tensor> foreach_tensor_##NAME##_scalar_kernel_cuda(TensorList tensors, Scalar scalar) { \
     check_foreach_api_restrictions(tensors); \
-    if (!can_use_fast_route(tensors, scalar)) { \
+    if (!can_use_fast_route(tensors, scalar, DIVISION_OP)) { \
         return at::native::foreach_tensor_##NAME##_scalar_kernel_slow(tensors, scalar); \
     } \
 \
     return foreach_binary_op<OP>(tensors, scalar); \
 }
 
-FOREACH_BINARY_OP_SCALAR(add, std::plus);
-FOREACH_BINARY_OP_SCALAR(sub, std::minus);
-FOREACH_BINARY_OP_SCALAR(mul, std::multiplies);
-FOREACH_BINARY_OP_SCALAR(div, std::divides);
+FOREACH_BINARY_OP_SCALAR(add, std::plus, false);
+FOREACH_BINARY_OP_SCALAR(mul, std::multiplies, false);
+
+// In the case of division, integer inputs will result in float.
+// Currently, multi-tensor apply can only return a result of the same type as the input.
+FOREACH_BINARY_OP_SCALAR(div, std::divides, true);
+
+// In the case of subtraction, we don't allow a boolean scalar, following the torch.sub logic.
+void foreach_tensor_sub_scalar_kernel_cuda_(TensorList tensors, Scalar scalar) {
+    check_foreach_api_restrictions(tensors);
+    at::native::sub_check(tensors[0], scalar);
+
+    if (!can_use_fast_route(tensors, scalar)) {
+        return at::native::foreach_tensor_sub_scalar_kernel_slow_(tensors, scalar);
+    }
+
+    foreach_binary_op_<std::minus>(tensors, scalar);
+}
+
+std::vector<Tensor> foreach_tensor_sub_scalar_kernel_cuda(TensorList tensors, Scalar scalar) {
+    check_foreach_api_restrictions(tensors);
+    at::native::sub_check(tensors[0], scalar);
+
+    if (!can_use_fast_route(tensors, scalar)) {
+        return at::native::foreach_tensor_sub_scalar_kernel_slow(tensors, scalar);
+    }
+
+    return foreach_binary_op<std::minus>(tensors, scalar);
+}
 
 }} // namespace at::native
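
What the routing means for callers, as a hedged sketch against the ATen C++ surface (assumes a CUDA build; which route runs is an internal detail, but the observable dtype and error behavior is as below):

#include <ATen/ATen.h>

void demo_foreach_scalar_ops() {
  std::vector<at::Tensor> xs = {
      at::ones({4}, at::TensorOptions().dtype(at::kInt).device(at::kCUDA))};

  // Division promotes integer inputs to float; since the fused kernel must
  // produce the input dtype, these tensors take the slow route, and the
  // result comes back as float.
  auto out = at::_foreach_div(xs, 2);
  // out[0].scalar_type() == at::kFloat

  // Subtraction with a boolean scalar is rejected up front by sub_check:
  // at::_foreach_sub(xs, at::Scalar(true));  // would throw
}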

aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu

Lines changed: 37 additions & 9 deletions
@@ -1,6 +1,7 @@
 #include <ATen/Dispatch.h>
 #include <ATen/native/ForeachUtils.h>
 #include <ATen/native/cuda/ForeachFunctors.cuh>
+#include <ATen/native/BinaryOps.h>
 
 namespace at { namespace native {
 
@@ -16,7 +17,7 @@ std::vector<Tensor> foreach_binary_op(TensorList tensors, at::ArrayRef<Scalar> s
     tensor_lists.emplace_back(tensors.vec());
     tensor_lists.emplace_back(vec_res);
 
-    AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() {
         using opmath_t = get_opmath_t<scalar_t>::opmath_t;
         multi_tensor_apply<2, opmath_t>(tensor_lists,
                                         scalars,
@@ -35,7 +36,7 @@ void foreach_binary_op_(TensorList tensors, at::ArrayRef<Scalar> scalars) {
     std::vector<std::vector<at::Tensor>> tensor_lists;
     tensor_lists.emplace_back(tensors.vec());
 
-    AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() {
         using opmath_t = get_opmath_t<scalar_t>::opmath_t;
         multi_tensor_apply<1, opmath_t>(tensor_lists,
                                         scalars,
@@ -47,10 +48,10 @@ void foreach_binary_op_(TensorList tensors, at::ArrayRef<Scalar> scalars) {
     });
 }
 
-#define FOREACH_BINARY_OP_SCALARLIST(NAME, OP) \
+#define FOREACH_BINARY_OP_SCALARLIST(NAME, OP, DIV_OP) \
 void foreach_tensor_##NAME##_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef<Scalar> scalars) { \
     check_foreach_api_restrictions(tensors, scalars); \
-    if (!can_use_fast_route(tensors, scalars)) { \
+    if (!can_use_fast_route(tensors, scalars, DIV_OP)) { \
         return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow_(tensors, scalars); \
     } \
@@ -59,16 +60,43 @@ void foreach_tensor_##NAME##_scalarlist_kernel_cuda_(TensorList tensors, at::Arr
 \
 std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_kernel_cuda(TensorList tensors, at::ArrayRef<Scalar> scalars) { \
     check_foreach_api_restrictions(tensors, scalars); \
-    if (!can_use_fast_route(tensors, scalars)) { \
+    if (!can_use_fast_route(tensors, scalars, DIV_OP)) { \
         return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow(tensors, scalars); \
     } \
 \
     return foreach_binary_op<OP>(tensors, scalars); \
 }
 
-FOREACH_BINARY_OP_SCALARLIST(add, std::plus);
-FOREACH_BINARY_OP_SCALARLIST(sub, std::minus);
-FOREACH_BINARY_OP_SCALARLIST(mul, std::multiplies);
-FOREACH_BINARY_OP_SCALARLIST(div, std::divides);
+FOREACH_BINARY_OP_SCALARLIST(add, std::plus, /*div_op*/ false);
+FOREACH_BINARY_OP_SCALARLIST(mul, std::multiplies, /*div_op*/ false);
+FOREACH_BINARY_OP_SCALARLIST(div, std::divides, /*div_op*/ true);
+
+// This does not use FOREACH_BINARY_OP_SCALARLIST because, in the case of subtraction,
+// we don't allow a boolean scalar, following the torch.sub logic.
+void foreach_tensor_sub_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef<Scalar> scalars) {
+    check_foreach_api_restrictions(tensors, scalars);
+    for (int i = 0; i < tensors.size(); i++) {
+        sub_check(tensors[i], scalars[i]);
+    }
+
+    if (!can_use_fast_route({tensors}, scalars)) {
+        return at::native::foreach_tensor_sub_scalarlist_kernel_slow_(tensors, scalars);
+    }
+
+    foreach_binary_op_<std::minus>(tensors, scalars);
+}
+
+std::vector<Tensor> foreach_tensor_sub_scalarlist_kernel_cuda(TensorList tensors, at::ArrayRef<Scalar> scalars) {
+    check_foreach_api_restrictions(tensors, scalars);
+    for (int i = 0; i < tensors.size(); i++) {
+        sub_check(tensors[i], scalars[i]);
+    }
+
+    if (!can_use_fast_route({tensors}, scalars)) {
+        return at::native::foreach_tensor_sub_scalarlist_kernel_slow(tensors, scalars);
+    }
+
+    return foreach_binary_op<std::minus>(tensors, scalars);
+}
 
 }} // namespace at::native
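
Unlike the single-scalar variant above, which validates only tensors[0] against the one scalar, the scalarlist path pairs tensor i with scalar i, so sub_check must run once per pair. A hedged caller-level sketch (assumes a CUDA build):

#include <ATen/ATen.h>

void demo_foreach_sub_scalarlist() {
  auto opts = at::TensorOptions().device(at::kCUDA);
  std::vector<at::Tensor> xs = {at::ones({2}, opts.dtype(at::kFloat)),
                                at::ones({2}, opts.dtype(at::kInt))};
  std::vector<at::Scalar> ss = {1.5, 2};
  auto out = at::_foreach_sub(xs, ss);  // every (tensor, scalar) pair passes sub_check

  // A boolean scalar anywhere in the list trips the per-pair check:
  // at::_foreach_sub(xs, {at::Scalar(1.5), at::Scalar(true)});  // would throw
}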
