While optimizing a kernel that computes a set of convolutions over an image with an arbitrary number of layers/channels, I realized that conceptually you should be able to stage the image into shared memory one channel at a time. However, I can't get Halide to generate the right code.
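To make the goal concrete, here is a hand-written CUDA sketch of the structure I'd like Halide to approximate (illustrative only, not generated code; it assumes 32x32 thread blocks, a planar (w, h, c) layout, and the same 5x5 box filter with clamped borders as the repro below):

// Each block stages one channel of its tile (plus a 2-pixel halo) into shared
// memory, convolves it, then moves on to the next channel, so the shared-memory
// footprint is one channel's tile rather than all channels at once.
__global__ void conv_per_channel(const float *in, float *out,
                                 int w, int h, int channels) {
    __shared__ float tile[36][36];          // 32x32 output tile + 2-pixel halo
    int x = blockIdx.x * 32 + threadIdx.x;
    int y = blockIdx.y * 32 + threadIdx.y;
    for (int c = 0; c < channels; c++) {    // layer per layer
        // Cooperatively load this channel's tile (with halo) into shared memory.
        for (int ly = threadIdx.y; ly < 36; ly += 32)
            for (int lx = threadIdx.x; lx < 36; lx += 32) {
                int gx = min(max(blockIdx.x * 32 + lx - 2, 0), w - 1);
                int gy = min(max(blockIdx.y * 32 + ly - 2, 0), h - 1);
                tile[ly][lx] = in[(c * h + gy) * w + gx];
            }
        __syncthreads();
        if (x < w && y < h) {
            float acc = 0.0f;
            for (int ry = -2; ry <= 2; ry++)
                for (int rx = -2; rx <= 2; rx++)
                    acc += tile[threadIdx.y + 2 + ry][threadIdx.x + 2 + rx];
            out[(c * h + y) * w + x] = acc;
        }
        __syncthreads();                    // reuse the tile for the next channel
    }
}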
Here is a repro:
// vim: foldmethod=syntax
#include <Halide.h>

using namespace Halide;

// clang-format off
class LayerPerLayer : public Generator<LayerPerLayer> {
public:
    Input<Buffer<float>> input{"input", 3};    // (w, h, c)
    Output<Buffer<float>> output{"output", 3}; // (w, h, c)

    Var x{"x"}, y{"y"}, cf{"cf"}, ci{"ci"};
    Func clamped_input{"clamped_input"};
    Func conv{"conv"};
    RDom rdom_conv;

    void generate() {
        int kernel_size = 5;
        int taps_per_side = kernel_size / 2;
        rdom_conv = {{{-taps_per_side, int(kernel_size)},
                      {-taps_per_side, int(kernel_size)}}, "rdom_conv"};

        clamped_input(x, y, ci) = Halide::BoundaryConditions::repeat_edge(input)(x, y, ci);

        conv(x, y, ci) = 0.0f;
        conv(x, y, ci) += clamped_input(x + rdom_conv.x, y + rdom_conv.y, ci);

        output(x, y, ci) = conv(x, y, ci);

        Expr w = input.dim(0).extent();
        Expr h = input.dim(1).extent();
        Expr d = input.dim(2).extent();
        input.dim(0).set_bounds(0, w);
        input.dim(1).set_bounds(0, h);
        input.dim(2).set_bounds(0, d);
        output.dim(0).set_bounds(0, w);
        output.dim(1).set_bounds(0, h);
        output.dim(2).set_bounds(0, d);
    }

    void schedule() {
        Var xo{"xo"}, xi{"xi"};
        Var yo{"yo"}, yi{"yi"};

        output
            .reorder(ci, x, y)
            .partition(x, Partition::Never)
            .partition(y, Partition::Never)
            .gpu_tile(x, y, xo, yo, xi, yi, 32, 32)
            ;

        conv
            .update()
            .partition(rdom_conv.x, Partition::Never)
            .partition(rdom_conv.y, Partition::Never)
            ;

        if constexpr (false) {
            // Here, we prefetch all channels of the tile into shared memory.
            // This works, but loads all channels at once, requiring a lot of
            // shared memory.
            clamped_input.in(conv)
                .compute_at(output, xo)
                .partition(x, Partition::Never)
                .partition(y, Partition::Never)
                // As the convolution requires more input pixels than output pixels (and
                // threads), let's make a single thread load 8 numbers instead.
                .tile(x, y, xo, yo, xi, yi, 4, 2)
                .gpu_threads(xo, yo)
                .vectorize(xi) // Generate ld.global.v4.f32 to get 128 bit loads.
                .unroll(yi)
                ;
        } else {
            // Let's try to prefetch only one channel of a tile into shared memory.
            clamped_input.in(conv)
                .compute_at(output, ci)
                .partition(x, Partition::Never)
                .partition(y, Partition::Never)
                .tile(x, y, xo, yo, xi, yi, 4, 2)
                .gpu_threads(xo, yo)
                .vectorize(xi)
                .unroll(yi)
                ;
        }

        output.print_loop_nest();
    }
};

HALIDE_REGISTER_GENERATOR(LayerPerLayer, layer_per_layer)

And the convenience Makefile (update HALIDE_DISTRIB):
HALIDE_DISTRIB=/home/martijn/3rd/halide/distrib
GEN_GEN=$(HALIDE_DISTRIB)/tools/GenGen.cpp
CC=clang++-17

generator: layer_per_layer.cpp
	$(CC) layer_per_layer.cpp $(GEN_GEN) -g -O1 -std=c++17 -fno-rtti -lpthread -ldl -L$(HALIDE_DISTRIB)/lib/ -lHalide -I$(HALIDE_DISTRIB)/include -o generator

stmt: generator
	LD_LIBRARY_PATH=$(HALIDE_DISTRIB)/lib ./generator -g layer_per_layer -e conceptual_stmt,conceptual_stmt_html -o . target=host-cuda

And run make stmt.
The loop nest it prints is:
produce output:
  gpu_block y.yo<Default_GPU>:
    gpu_block x.xo<Default_GPU>:
      gpu_thread y.yi in [0, 31]<Default_GPU>:
        gpu_thread x.xi in [0, 31]<Default_GPU>:
          for ci: // layer per layer
            produce clamped_input_in_conv:
              // load the current layer into shared memory
              gpu_thread y.yo in [0, 2]<Default_GPU>:
                gpu_thread x.xo in [0, 1]<Default_GPU>:
                  unrolled y.yi in [0, 1]:
                    vectorized x.xi in [0, 3]:
                      clamped_input_in_conv(...) = ...
            consume clamped_input_in_conv:
              produce conv:
                conv(...) = ...
                for rdom_conv in [-2, 2]:
                  for rdom_conv in [-2, 2]:
                    conv(...) = ...
              consume conv:
                output(...) = ...
Internal Error at /home/martijn/3rd/halide/src/OffloadGPULoops.cpp:228 triggered by user code at :
Condition failed: is_const_one(bounds.num_threads[3]) && is_const_one(bounds.num_blocks[3]):
32, 1

So, clearly, because of the extra gpu_thread loops, Halide thinks it needs to generate a kernel with 4 thread dimensions.