Skip to content

Commit 8bf9e99

Browse files
valentinandrei authored and pytorchmergebot committed
[pytorch][cuda] Some speedup on depth wise convolution 2D forward (#125362)
This PR does a few things:

- Adds a generic implementation for `conv_depthwise2d` when the filter size is non-standard. This implementation works faster because it doesn't do edge-condition checks inside the innermost loops. We avoid the checks by calculating the boundaries ahead of the loop.
- Hints to nvcc to minimize the register usage so that we squeeze more memory bandwidth.
- Adds filter size 5 as a common size where we can use the template implementation to improve unrolling and generate more efficient code.

The implementation doesn't completely fix the issue described in #18631. For that we need to rewrite the kernel using the suggestions described in the issue chat. This PR uses the same order of accessing the tensor as before but just removes overhead instructions in the inner loops to get the speedup.

Before:
```
conv2d-performance:
       B      C      iH      iW    kH    kW  native (cpu)  conv2d (cuda)  conv2d-fp16 (cuda)
0    8.0   64.0  1024.0  1008.0   5.0   5.0    149.052643      24.982176            3.236192
1    8.0   64.0  1008.0  1008.0   5.0   5.0    150.810333      24.643536            3.237760
2    4.0   48.0   720.0   539.0   6.0   1.0     15.747776       2.636320            1.788672
3    4.0  120.0   379.0   283.0   6.0   1.0     12.234080       1.791712            1.231360
4    4.0   32.0   713.0   532.0   6.0   1.0     10.362272       1.731584            1.170544
5    4.0    3.0   712.0   542.0  31.0  31.0     24.965248       3.406304            4.165440
6    4.0  120.0   379.0   288.0   1.0   6.0     10.772512       1.215616            0.939936
7 1024.0  384.0     1.0   928.0   1.0   3.0     60.051582       7.594256            2.861344
8    4.0   24.0   687.0   512.0   6.0   1.0     10.231536       1.196704            0.818432
9   96.0   96.0   112.0   112.0   5.0   5.0     21.025631       5.110096            0.715520
10  96.0   80.0    56.0    56.0   5.0   5.0      9.730064       1.016080            0.207424
11  64.0  128.0    64.0    84.0   3.0   3.0     18.759552       0.616736            0.200832
12  16.0  960.0     7.0     7.0   5.0   5.0      0.274880       0.020288            0.014688
13  16.0   64.0   112.0   112.0   3.0   3.0      6.425696       0.189088            0.053728
```

After:
```
       B      C      iH      iW    kH    kW  native (cpu)  conv2d (cuda)  conv2d-fp16 (cuda)
0    8.0   64.0  1024.0  1008.0   5.0   5.0    122.534370      12.915648            3.269936
1    8.0   64.0  1008.0  1008.0   5.0   5.0    126.026978      12.826848            3.236608
2    4.0   48.0   720.0   539.0   6.0   1.0     14.488160       1.803424            1.794368
3    4.0  120.0   379.0   283.0   6.0   1.0     11.556304       1.251200            1.240736
4    4.0   32.0   713.0   532.0   6.0   1.0      9.737841       1.186240            1.174128
5    4.0    3.0   712.0   542.0  31.0  31.0     19.394785       2.017056            2.310368
6    4.0  120.0   379.0   288.0   1.0   6.0      9.586752       0.828736            0.843712
7 1024.0  384.0     1.0   928.0   1.0   3.0     48.939903       5.529312            2.860768
8    4.0   24.0   687.0   512.0   6.0   1.0     13.474000       0.831920            0.825280
9   96.0   96.0   112.0   112.0   5.0   5.0     15.439168       2.611616            0.724864
10  96.0   80.0    56.0    56.0   5.0   5.0      5.991968       0.520352            0.207456
11  64.0  128.0    64.0    84.0   3.0   3.0      9.381472       0.609680            0.202832
12  16.0  960.0     7.0     7.0   5.0   5.0      0.265504       0.015680            0.014496
13  16.0   64.0   112.0   112.0   3.0   3.0      2.384832       0.187168            0.053280
```

Pull Request resolved: #125362 Approved by: https://github.com/ezyang
1 parent 1370f3a commit 8bf9e99

1 file changed

Lines changed: 120 additions & 3 deletions

File tree

aten/src/ATen/native/cuda/DepthwiseConv2d.cu

Lines changed: 120 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,120 @@ PackedTensorAccessor32<scalar_t, ndim, PtrTraits> dummy_packed_accessor32() {
2929
return {nullptr, zeros.data(), zeros.data()};
3030
}
3131

32+
// Generic depthwise conv2d forward kernel, used when the filter size does not
// match one of the fixed-size template instantiations. Each linearIndex
// visited by CUDA_KERNEL_LOOP_TYPE produces one output element (NCHW layout).
// Instead of bounds-checking every tap inside the inner loops, the range of
// in-bounds kernel taps [kHmin, kHmax) x [kWmin, kWmax) is computed once per
// output element, which removes the per-fetch edge-condition branches.
template <typename scalar_t, typename index_t>
__global__ void
#if !defined(USE_ROCM)
// Cap registers per thread to keep occupancy (and thus memory bandwidth) high.
C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS)
#endif
conv_depthwise2d_forward_kernel_generic(
    const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> input,
    PackedTensorAccessor32<scalar_t, 4, DefaultPtrTraits> output,
    const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> weight,
    const PackedTensorAccessor32<const scalar_t, 1, DefaultPtrTraits> bias,
    bool biasEnabled,
    index_t totalElements,
    const int outputChannels,
    const int depthwiseMultiplier,
    const int inputWidth, const int inputHeight,
    const int outputWidth, const int outputHeight,
    const int kernelWidth, const int kernelHeight,
    const int strideWidth, const int strideHeight,
    const int padWidth, const int padHeight,
    const int dilationWidth, const int dilationHeight) {
  // Accumulate in a wider type than scalar_t (e.g. float for half inputs).
  using acc_t = at::acc_type<scalar_t, true>;

  CUDA_KERNEL_LOOP_TYPE(linearIndex, totalElements, index_t) {
    // Decompose linearIndex into (n, c, h, w). Equivalent to:
    //   n = linearIndex / (outputChannels * outputHeight * outputWidth)
    //   c = (linearIndex / (outputHeight * outputWidth)) % outputChannels
    //   h = (linearIndex / outputWidth) % outputHeight
    //   w = linearIndex % outputWidth
    // but with each modulo replaced by a divide plus multiply-subtract.
    int quotient = linearIndex / outputWidth;
    const int w = linearIndex - quotient * outputWidth;
    int next = quotient / outputHeight;
    const int h = quotient - next * outputHeight;
    quotient = next;
    next = quotient / outputChannels;
    const int c = quotient - next * outputChannels;
    const int n = next;

    // Map the output channel back to the input channel it was produced from.
    int inputChannel = c;
    int inputChannels = outputChannels;
    if (depthwiseMultiplier != 1) {
      inputChannel /= depthwiseMultiplier;
      inputChannels /= depthwiseMultiplier;
    }

    const int weightOffset = c * kernelHeight * kernelWidth;

    // By precisely computing the filtering boundaries up front, we avoid
    // repeating the edge-condition checks for every fetched item. When the
    // input element is L1-resident, those extra branches and comparisons cost
    // about as many cycles as the data fetch itself, so hoisting them out of
    // the loops gives a significant performance boost.
    int kHmin = 0, kHmax = kernelHeight;
    int kWmin = 0, kWmax = kernelWidth;

    // Top edge: smallest kH with h_in_start + kH * dilationHeight >= 0.
    const int h_in_start = -padHeight + h * strideHeight;
    if (h_in_start < 0) {
      kHmin = (-h_in_start) / dilationHeight;
      if ((-h_in_start) % dilationHeight > 0) {
        ++kHmin;
      }
    }

    // Bottom edge: how far the last tap overshoots row inputHeight - 1.
    const int h_overshoot =
        h_in_start + (kernelHeight - 1) * dilationHeight - inputHeight + 1;
    if (h_overshoot >= 0) {
      kHmax = kernelHeight - h_overshoot / dilationHeight;
      if (h_overshoot % dilationHeight > 0) {
        --kHmax;
      }
    }

    // Left edge: smallest kW with w_in_start + kW * dilationWidth >= 0.
    const int w_in_start = -padWidth + w * strideWidth;
    if (w_in_start < 0) {
      kWmin = (-w_in_start) / dilationWidth;
      if ((-w_in_start) % dilationWidth > 0) {
        ++kWmin;
      }
    }

    // Right edge: how far the last tap overshoots column inputWidth - 1.
    const int w_overshoot =
        w_in_start + (kernelWidth - 1) * dilationWidth - inputWidth + 1;
    if (w_overshoot >= 0) {
      kWmax = kernelWidth - w_overshoot / dilationWidth;
      if (w_overshoot % dilationWidth > 0) {
        --kWmax;
      }
    }

    // Seed the accumulator with the bias (if enabled), then sum only the
    // in-bounds weight * input products.
    acc_t value = biasEnabled ? static_cast<acc_t>(bias.data()[c]) : acc_t(0);
    const index_t offset0 =
        (n * inputChannels + inputChannel) * inputHeight * inputWidth;

    for (int kH = kHmin; kH < kHmax; ++kH) {
      const int h_in = h_in_start + kH * dilationHeight;
      for (int kW = kWmin; kW < kWmax; ++kW) {
        const int w_in = w_in_start + kW * dilationWidth;
        const index_t offset = offset0 + h_in * inputWidth + w_in;
        value += static_cast<acc_t>(weight.data()[weightOffset + kH * kernelWidth + kW]) *
                 static_cast<acc_t>(input.data()[offset]);
      }
    }
    output.data()[linearIndex] = static_cast<scalar_t>(value);
  }
}
32139

33140
template <int kSize, typename scalar_t, typename index_t>
34-
__global__ void conv_depthwise2d_forward_kernel(
141+
__global__ void
142+
#if !defined(USE_ROCM)
143+
C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS)
144+
#endif
145+
conv_depthwise2d_forward_kernel(
35146
const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> input,
36147
PackedTensorAccessor32<scalar_t, 4, DefaultPtrTraits> output,
37148
const PackedTensorAccessor32<const scalar_t, 4, DefaultPtrTraits> weight,
@@ -315,7 +426,13 @@ void conv_depthwise2d_forward_out(
315426
const auto bias_a = has_bias ?
316427
bias.packed_accessor32<const scalar_t, 1>() :
317428
dummy_packed_accessor32<const scalar_t, 1>();
318-
if (kW == 3 && kH == 3) {
429+
if (kW == 5 && kH == 5) {
430+
conv_depthwise2d_forward_kernel<5> <<<grid, block, 0, stream>>>(
431+
input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier,
432+
width, height, outputWidth, outputHeight,
433+
kW, kH, dW, dH, padW, padH, dilationW, dilationH);
434+
C10_CUDA_KERNEL_LAUNCH_CHECK();
435+
} else if (kW == 3 && kH == 3) {
319436
conv_depthwise2d_forward_kernel<3> <<<grid, block, 0, stream>>>(
320437
input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier,
321438
width, height, outputWidth, outputHeight,
@@ -328,7 +445,7 @@ void conv_depthwise2d_forward_out(
328445
kW, kH, dW, dH, padW, padH, dilationW, dilationH);
329446
C10_CUDA_KERNEL_LAUNCH_CHECK();
330447
} else {
331-
conv_depthwise2d_forward_kernel<0> <<<grid, block, 0, stream>>>(
448+
conv_depthwise2d_forward_kernel_generic<<<grid, block, 0, stream>>>(
332449
input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier,
333450
width, height, outputWidth, outputHeight,
334451
kW, kH, dW, dH, padW, padH, dilationW, dilationH);

0 commit comments

Comments
 (0)