[iOS GPU] Support element-wise broadcasting for binary ops in shaders (#53949)

xta0 · facebook-github-bot · commit 530dc828ae11 · 2021-03-14T22:12:52.000-07:00
Summary: Pull Request resolved: #53949 As title says ghstack-source-id: 123849745 Test Plan: `buck test pp-mac` ``` 2021-03-11 18:25:07.922375-0800 PyTorchPlayground[8324:5122672] [bool test_add()],[1 180 12 12 ],[SUCCEED] 2021-03-11 18:25:07.960812-0800 PyTorchPlayground[8324:5122672] [bool test_add_broadcast()],[2 17 58 67 ],[SUCCEED] 2021-03-11 18:25:07.978399-0800 PyTorchPlayground[8324:5122672] [bool test_add_broadcast2()],[2 17 1 67 ],[SUCCEED] 2021-03-11 18:25:08.021570-0800 PyTorchPlayground[8324:5122672] [bool test_sub()],[5 3 167 222 ],[SUCCEED] 2021-03-11 18:25:08.034218-0800 PyTorchPlayground[8324:5122672] [bool test_sub_broadcast()],[1 3 1 1 ],[SUCCEED] 2021-03-11 18:25:08.069419-0800 PyTorchPlayground[8324:5122672] [bool test_sub_broadcast2()],[3 3 192 192 ],[SUCCEED] 2021-03-11 18:25:08.112967-0800 PyTorchPlayground[8324:5122672] [bool test_mul()],[2 7 262 119 ],[SUCCEED] 2021-03-11 18:25:08.136691-0800 PyTorchPlayground[8324:5122672] [bool test_mul_broadcast()],[4 3 192 192 ],[SUCCEED] 2021-03-11 18:25:08.148920-0800 PyTorchPlayground[8324:5122672] [bool test_mul_broadcast2()],[1 3 192 192 ],[SUCCEED] ``` Reviewed By: SS-JIA Differential Revision: D27000487 fbshipit-source-id: f86fca5ac1960ca0a56636da17ae05020c1a4138
diff --git a/aten/src/ATen/native/metal/MetalShaders.h b/aten/src/ATen/native/metal/MetalShaders.h
@@ -18,73 +18,106 @@ constant ushort ushort_arg_9[[function_constant(9)]];
 constant float float_arg_0 [[function_constant(10)]];
 constant float float_arg_1 [[function_constant(11)]];
 
-
 inline constexpr ushort divRoundUp(ushort x, ushort y) { return (x + (y - 1)) / y; }
 
+enum broadcastOp {  // selects the arithmetic applied by elementwise_broadcast / elementwise_broadcast_nonarray
+    Add,  // in0 + in1
+    Sub,  // in0 - in1
+    Mul,  // in0 * in1
+    Div,  // in0 / in1 (handled by both helpers; no kernel visible in this section passes it)
+};
+
+// Applies `op` per texel across plain 2D textures, broadcasting any input whose width or
+// height is 1 (reads along that axis always come from texel 0).
+void elementwise_broadcast_nonarray(texture2d<half, access::read> in0,
+                                   texture2d<half, access::read> in1,
+                                   texture2d<half, access::write> out,
+                                   ushort2 gid,
+                                   broadcastOp op) {
+    // Threads that fall outside the output's width/height have nothing to write.
+    const bool outside = gid.x >= out.get_width() || gid.y >= out.get_height();
+    if (outside) { return; }
+    // Per-axis multiplier: 0 pins the read to texel 0 when the extent is 1, 1 otherwise.
+    const ushort2 scale0 = ushort2(in0.get_width() > 1, in0.get_height() > 1);
+    const ushort2 scale1 = ushort2(in1.get_width() > 1, in1.get_height() > 1);
+    const ushort2 pos0 = gid.xy * scale0;
+    const ushort2 pos1 = gid.xy * scale1;
+    const half4 lhs = in0.read(pos0);
+    const half4 rhs = in1.read(pos1);
+    switch (op) {
+        case Add: out.write(lhs + rhs, gid); break;
+        case Sub: out.write(lhs - rhs, gid); break;
+        case Mul: out.write(lhs * rhs, gid); break;
+        case Div: out.write(lhs / rhs, gid); break;
+    }
+}
+
+// Applies `op` per texel across 2D texture arrays, broadcasting any input whose width or
+// height is 1. gid.xy addresses the texel, gid.z the array slice (not bounds-checked here).
+void elementwise_broadcast(texture2d_array<half, access::read> in0,
+                           texture2d_array<half, access::read> in1,
+                           texture2d_array<half, access::write> out,
+                           ushort3 gid,
+                           broadcastOp op) {
+    // Threads that fall outside the output's width/height have nothing to write.
+    const bool outside = gid.x >= out.get_width() || gid.y >= out.get_height();
+    if (outside) { return; }
+    // Per-axis multiplier: 0 pins the read to texel 0 when the extent is 1, 1 otherwise.
+    const ushort2 scale0 = ushort2(in0.get_width() > 1, in0.get_height() > 1);
+    const ushort2 scale1 = ushort2(in1.get_width() > 1, in1.get_height() > 1);
+    const ushort2 pos0 = gid.xy * scale0;
+    const ushort2 pos1 = gid.xy * scale1;
+    const half4 lhs = in0.read(pos0, gid.z);
+    const half4 rhs = in1.read(pos1, gid.z);
+    // A value of `op` not listed below leaves the output texel unwritten.
+    switch (op) {
+        case Add: out.write(lhs + rhs, gid.xy, gid.z); break;
+        case Sub: out.write(lhs - rhs, gid.xy, gid.z); break;
+        case Mul: out.write(lhs * rhs, gid.xy, gid.z); break;
+        case Div: out.write(lhs / rhs, gid.xy, gid.z); break;
+    }
+}
+
 kernel void elementwise_add_nonarray(texture2d<half, access::read> in0[[texture(0)]],
                                      texture2d<half, access::read> in1[[texture(1)]],
                                      texture2d<half, access::write> out[[texture(2)]],
                                      ushort2 gid[[thread_position_in_grid]]) {
-    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
-        return;
-    }
-    out.write(in0.read(gid) + in1.read(gid), gid);
+    elementwise_broadcast_nonarray(in0, in1, out, gid, Add);  // in0 + in1 per texel; an input with width/height 1 is broadcast along that axis
 }
 
 kernel void elementwise_add(texture2d_array<half, access::read> in0[[texture(0)]],
                             texture2d_array<half, access::read> in1[[texture(1)]],
                             texture2d_array<half, access::write> out[[texture(2)]],
                             ushort3 gid[[thread_position_in_grid]]) {
-    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
-        return;
-    }
-    ushort2 gid_ = gid.xy;
-    out.write(in0.read(gid_, gid.z) + in1.read(gid_, gid.z), gid_, gid.z);
+    elementwise_broadcast(in0, in1, out, gid, Add);  // in0 + in1 per texel of slice gid.z; an input with width/height 1 is broadcast along that axis
 }
 
 kernel void elementwise_sub_nonarray(texture2d<half, access::read> in0[[texture(0)]],
                                      texture2d<half, access::read> in1[[texture(1)]],
                                      texture2d<half, access::write> out[[texture(2)]],
                                      ushort2 gid[[thread_position_in_grid]]) {
-    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
-        return;
-    }
-    ushort2 gid2{0,0};
-    out.write(in0.read(gid) - in1.read(gid2), gid);
+    elementwise_broadcast_nonarray(in0, in1, out, gid, Sub);  // in0 - in1 per texel; an input with width/height 1 is broadcast along that axis
 }
 
 kernel void elementwise_sub(texture2d_array<half, access::read> in0[[texture(0)]],
                             texture2d_array<half, access::read> in1[[texture(1)]],
                             texture2d_array<half, access::write> out[[texture(2)]],
                             ushort3 gid[[thread_position_in_grid]]) {
-    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
-        return;
-    }
-    ushort2 gid1 = gid.xy;
-    ushort2 gid2{0,0};
-    out.write(in0.read(gid1, gid.z) - in1.read(gid2, gid.z), gid1, gid.z);
+    elementwise_broadcast(in0, in1, out, gid, Sub);  // in0 - in1 per texel of slice gid.z; an input with width/height 1 is broadcast along that axis
 }
+
 kernel void elementwise_mul_nonarray(texture2d<half, access::read> in0[[texture(0)]],
                                      texture2d<half, access::read> in1[[texture(1)]],
                                      texture2d<half, access::write> out[[texture(2)]],
                                      ushort2 gid[[thread_position_in_grid]]) {
-    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
-        return;
-    }
-    ushort2 gid2{0,0};
-    out.write(in0.read(gid) * in1.read(gid2), gid);
+    elementwise_broadcast_nonarray(in0, in1, out, gid, Mul);  // in0 * in1 per texel; an input with width/height 1 is broadcast along that axis
 }
 
 kernel void elementwise_mul(texture2d_array<half, access::read> in0[[texture(0)]],
                             texture2d_array<half, access::read> in1[[texture(1)]],
                             texture2d_array<half, access::write> out[[texture(2)]],
                             ushort3 gid[[thread_position_in_grid]]) {
-    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
-        return;
-    }
-    ushort2 gid1 = gid.xy;
-    ushort2 gid2{0,0};
-    out.write(in0.read(gid1, gid.z) * in1.read(gid2, gid.z), gid1, gid.z);
+    elementwise_broadcast(in0, in1, out, gid, Mul);  // in0 * in1 per texel of slice gid.z; an input with width/height 1 is broadcast along that axis
 }
 
 kernel void copy_nchw_to_metal(constant float* in[[buffer(0)]],
diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.h b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.h
@@ -13,6 +13,7 @@ bool test_relu();
 bool test_addmm();
 bool test_add();
 bool test_add_broadcast();
+bool test_add_broadcast2();
 bool test_sub();
 bool test_sub_broadcast();
 bool test_sub_broadcast2();
diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm
@@ -76,9 +76,28 @@ bool TEST(const std::vector<int64_t>& sizes, std::string name, Func block) {
   return b;
 }
 
+// Debug helper: writes `name`, then every element of `tensor` formatted as "%.2f, ", to stdout.
+void PRINT_TENSOR(std::string name, const at::Tensor& tensor){
+    std::string str = name + ": ";
+    auto print = [&](const at::Tensor& t){
+        // numel() returns int64_t, so index with int64_t instead of a narrower signed int.
+        for(int64_t i=0; i<t.numel(); ++i){
+            NSString* sf = [NSString stringWithFormat:@"%.2f",t.data_ptr<float>()[i]];
+            str.append(sf.UTF8String).append(", ");
+        }
+        std::cout<<str<<std::endl;
+    };
+    if(tensor.is_metal()){
+        MPSImage* image = at::native::metal::imageFromTensor(tensor);  // Metal path: elements are read from the tensor's MPSImage
+        auto t = at::native::metal::staticImageToTensor(image);
+        print(t);
+    } else {
+        print(tensor);
+    }
+}
+
 }
 
- using namespace at::native::metal;
+using namespace at::native::metal;
 
 bool test_synchronization() {
   __block std::vector<int64_t> size{1, 3, 2, 2};
@@ -324,6 +343,21 @@ bool test_add_broadcast() {
   });
 }
 
+// Checks at::add on Metal against CPU when the first operand has size 1 along dim 2.
+bool test_add_broadcast2() {
+  __block std::vector<int64_t> narrowShape{2, 17, 1, 67};
+  __block std::vector<int64_t> wideShape{2, 17, 58, 67};
+  return TEST(narrowShape, __PRETTY_FUNCTION__, ^bool {
+    auto lhs = at::rand(narrowShape, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+    auto rhs = at::rand(wideShape, at::TensorOptions(at::kCPU).dtype(at::kFloat));
+    auto expected = at::add(lhs, rhs);
+    auto lhsMetal = lhs.metal();
+    auto rhsMetal = rhs.metal();
+    auto actual = at::add(lhsMetal, rhsMetal).cpu();
+    return almostEqual(expected, actual);
+  });
+}
+
 bool test_sub() {
   __block std::vector<int64_t> x{5, 3, 167, 222};
   return TEST(x, __PRETTY_FUNCTION__, ^bool {
@@ -338,8 +372,8 @@ bool test_sub() {
 }
 
 bool test_sub_broadcast() {
-  __block std::vector<int64_t> x1{3, 3, 1, 1};
-  __block std::vector<int64_t> x2{3, 3, 192, 192};
+  __block std::vector<int64_t> x1{1, 3, 1, 1};
+  __block std::vector<int64_t> x2{1, 3, 192, 192};
   return TEST(x1, __PRETTY_FUNCTION__, ^bool {
     auto X1 = at::rand(x1, at::TensorOptions(at::kCPU).dtype(at::kFloat));
     auto X2 = at::rand(x2, at::TensorOptions(at::kCPU).dtype(at::kFloat));
@@ -393,8 +427,8 @@ bool test_mul_broadcast() {
 }
 
 bool test_mul_broadcast2() {
-  __block std::vector<int64_t> x1{4, 3, 192, 1};
-  __block std::vector<int64_t> x2{4, 3, 192, 192};
+  __block std::vector<int64_t> x2{1, 3, 192, 1};
+  __block std::vector<int64_t> x1{1, 3, 192, 192};
   return TEST(x1, __PRETTY_FUNCTION__, ^bool {
     auto X1 = at::rand(x1, at::TensorOptions(at::kCPU).dtype(at::kFloat));
     auto X2 = at::rand(x2, at::TensorOptions(at::kCPU).dtype(at::kFloat));
diff --git a/aten/src/ATen/native/metal/ops/MetalBinaryElementwise.mm b/aten/src/ATen/native/metal/ops/MetalBinaryElementwise.mm
@@ -165,13 +165,12 @@ Tensor binaryElementwiseMPSCNNKernel(
 Tensor add_Tensor(const Tensor& input1, const Tensor& input2, Scalar alpha) {
   TORCH_CHECK(input1.is_metal());
   TORCH_CHECK(input1.dim() == input2.dim());
+  TORCH_CHECK(input1.sizes()[0] == input2.sizes()[0]);
+  TORCH_CHECK(input1.sizes()[1] == input2.sizes()[1]);
   auto input2_ = input2.is_metal() ? input2 : input2.metal();
   if (@available(iOS 11.3, *)) {
     return binaryElementwiseMPSCNNKernel<MPSCNNAdd>(input1, input2_);
   } else {
-    // TODO: support broadcast in shader functions for iOS 10 users
-    TORCH_CHECK(input1.sizes()[2] == input2.sizes()[2]);
-    TORCH_CHECK(input1.sizes()[3] == input2.sizes()[3]);
     return binaryElementwiseShaderKernel(
         input1, input2_, @"elementwise_add", @"elementwise_add_nonarray");
   }
@@ -180,13 +179,12 @@ Tensor add_Tensor(const Tensor& input1, const Tensor& input2, Scalar alpha) {
 Tensor& add__Tensor(Tensor& input1, const Tensor& input2, Scalar alpha) {
   TORCH_CHECK(input1.is_metal());
   TORCH_CHECK(input1.dim() == input2.dim());
+  TORCH_CHECK(input1.sizes()[0] == input2.sizes()[0]);
+  TORCH_CHECK(input1.sizes()[1] == input2.sizes()[1]);
   auto input2_ = input2.is_metal() ? input2 : input2.metal();
   if (@available(iOS 11.3, *)) {
     return binaryElementwiseMPSCNNKernel_<MPSCNNAdd>(input1, input2_);
   } else {
-    // TODO: support broadcast in for iOS 10 users
-    TORCH_CHECK(input1.sizes()[2] == input2.sizes()[2]);
-    TORCH_CHECK(input1.sizes()[3] == input2.sizes()[3]);
     return binaryElementwiseShaderKernel_(
         input1, input2_, @"elementwise_add", @"elementwise_add_nonarray");
   }
@@ -195,12 +193,12 @@ Tensor add_Tensor(const Tensor& input1, const Tensor& input2, Scalar alpha) {
 Tensor sub_Tensor(const Tensor& input1, const Tensor& input2, Scalar alpha) {
   TORCH_CHECK(input1.is_metal());
   TORCH_CHECK(input1.dim() == input2.dim());
+  TORCH_CHECK(input1.sizes()[0] == input2.sizes()[0]);
+  TORCH_CHECK(input1.sizes()[1] == input2.sizes()[1]);
   auto input2_ = input2.is_metal() ? input2 : input2.metal();
   if (@available(iOS 11.3, *)) {
     return binaryElementwiseMPSCNNKernel<MPSCNNSubtract>(input1, input2_);
   } else {
-    // TODO: support non-broadcast for iOS 10 users
-    TORCH_CHECK(input2.sizes()[2] == input2.sizes()[3] == 1);
     return binaryElementwiseShaderKernel(
         input1, input2_, @"elementwise_sub", @"elementwise_sub_nonarray");
   }
@@ -209,12 +207,12 @@ Tensor sub_Tensor(const Tensor& input1, const Tensor& input2, Scalar alpha) {
 Tensor mul_Tensor(const Tensor& input1, const Tensor& input2) {
   TORCH_CHECK(input1.is_metal());
   TORCH_CHECK(input1.dim() == input2.dim());
+  TORCH_CHECK(input1.sizes()[0] == input2.sizes()[0]);
+  TORCH_CHECK(input1.sizes()[1] == input2.sizes()[1]);
   auto input2_ = input2.is_metal() ? input2 : input2.metal();
   if (@available(iOS 11.3, *)) {
     return binaryElementwiseMPSCNNKernel<MPSCNNMultiply>(input1, input2_);
   } else {
-    // TODO: support non-broadcast for iOS 10 users
-    TORCH_CHECK(input2.sizes()[2] == input2.sizes()[3] == 1);
     return binaryElementwiseShaderKernel(
         input1, input2_, @"elementwise_mul", @"elementwise_mul_nonarray");
   }