Skip to content

Commit d9227bb

Browse files
zasdfgbnm authored and facebook-github-bot committed
Target 4096 blocks instead of split to large grid for large reduction (#35997)
Summary: Pull Request resolved: #35997 When the number of blocks is large enough, we are already achieving balanced SM allocation. But we still should keep the number of inputs per thread large, because thread reduce is cheap. Benchmark for Half on V100: https://github.com/zasdfgbnm/things/blob/master/2020Q2/reduction-benchmark.ipynb On large tensor, it is: 1.37ms vs 1.25ms Test Plan: Imported from OSS Differential Revision: D20927533 Pulled By: ngimel fbshipit-source-id: 40df52e439cc1c01cda66c6195b600f301c5e984
1 parent 2f5b523 commit d9227bb

1 file changed

Lines changed: 19 additions & 5 deletions

File tree

aten/src/ATen/native/cuda/Reduce.cuh

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -789,15 +789,29 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
789789
config.output_mult[1] = config.split_output(block_height);
790790
}
791791

792-
if (config.input_mult[1] != 0 && config.values_per_thread() >= 256 && num_outputs <= 4096) {
792+
constexpr int min_values_per_thread = 16;
793+
constexpr int max_values_per_thread = 256;
794+
const int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / (block_width * block_height);
795+
const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
796+
const int target_grid_size = num_mp * blocks_per_sm;
797+
int grid = config.grid().x;
798+
if (config.input_mult[1] != 0 && config.values_per_thread() >= max_values_per_thread && grid <= target_grid_size) {
793799
// Divide the input across thread-blocks if the amount of work per-thread
794800
// is large enough and the size of the output is small enough. This will
795801
// require a reduction using global memory.
796-
config.ctas_per_output = div_up(config.values_per_thread(), 16);
797-
if (config.ctas_per_output > 65535) {
798-
config.ctas_per_output = 65535;
802+
// If we decide to split input across blocks, as long as we can get enough
803+
// number of blocks (`target_grid_size`) to balance SM, we should still
804+
// make the number of values per thread large for best performance.
805+
int ctas_per_output1 = div_up(target_grid_size, grid);
806+
int ctas_per_output2 = div_up(config.values_per_thread(), min_values_per_thread);
807+
int ctas_per_output3 = div_up(config.values_per_thread(), max_values_per_thread);
808+
// We want the minimum of ctas_per_output1 and ctas_per_output2, so that each thread can have
809+
// a large number of values to deal with. But we don't want values_per_thread to be larger than
810+
// max_values_per_thread
811+
config.ctas_per_output = std::max(std::min<int>(ctas_per_output1, ctas_per_output2), ctas_per_output3);
812+
if (config.ctas_per_output > 1) {
813+
config.input_mult[2] = config.split_input(config.ctas_per_output);
799814
}
800-
config.input_mult[2] = config.split_input(config.ctas_per_output);
801815
}
802816

803817
at::DataPtr buffer;

0 commit comments

Comments (0)