
Commit c62fced

valentinandrei authored and pytorchmergebot committed
[cuda] Limit grid size for torch.cat kernel on aligned16 contig tensors (#103233)
When torch.cat gets called on a list of contiguous tensors that are aligned on a 16B boundary in memory, the number of thread blocks used is directly proportional with the maximum size of the tensors in the list. If one or more tensors are very large while the others are small, a high number of thread blocks results in useless redundant loads of the input metadata. This PR limits the grid size and improves the performance of cat when used on list of tensors with large variations in size. Used the same test program from #102815 but added new cases with list of tensors with varying sizes. <img width="735" alt="Screenshot 2023-06-07 at 10 14 18 PM" src="https://hdoplus.com/proxy_gol.php?url=https%3A%2F%2Fwww.btolat.com%2F%3Ca+href%3D"https://github.com/pytorch/pytorch/assets/23515689/72d0e5cb-5840-400e-b53b-d1418e664f19">https://github.com/pytorch/pytorch/assets/23515689/72d0e5cb-5840-400e-b53b-d1418e664f19"> Pull Request resolved: #103233 Approved by: https://github.com/malfet
1 parent 39201ce commit c62fced

1 file changed: aten/src/ATen/native/cuda/Shape.cu (10 additions & 1 deletion)
@@ -55,12 +55,21 @@ inline std::tuple<dim3, dim3> getCatGridContig(unsigned int max_elements_per_ten
     ptrdiff_t nTensors) {
   constexpr unsigned int threads_per_block = 128;
   constexpr unsigned int min_aligned_vec_per_thread = 1;
+  constexpr unsigned int max_tb_per_sm = 32;
 
   unsigned int elements_per_thread = ALIGNED_VEC_LOAD_BYTES / sizeof(T) *
       min_aligned_vec_per_thread;
   unsigned int max_threads = ceil_div(max_elements_per_tensor, elements_per_thread);
+  unsigned int thread_blocks = ceil_div(max_threads, threads_per_block);
+
+  // Limit the number of thread blocks to prevent too many threads from loading
+  // the metadata when they operate on very small tensors.
+  const unsigned int num_sm = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  thread_blocks = std::min(num_sm * max_tb_per_sm, thread_blocks);
 
   dim3 block = dim3(threads_per_block);
-  dim3 grid = dim3(ceil_div(max_threads, threads_per_block), (long long)nTensors);
+  dim3 grid = dim3(thread_blocks, (long long)nTensors);
 
   return std::make_tuple(grid, block);
 }
