Skip to content

FAQ: Load tile into GPU shared memory "channel per channel". #7946

@mcourteaux

Description

@mcourteaux

While optimizing a kernel that computes a set of convolutions over an image with an arbitrary number of layers/channels, I realized that conceptually you'd be able to load the image into shared memory channel per channel. However, I can't get Halide to generate the right code.

Here is a repro:

// vim: foldmethod=syntax
#include <Halide.h>
using namespace Halide;
// clang-format off
// Repro generator: applies an unweighted 5x5 window sum to each channel of a
// 3-D (w, h, c) image, and tries to stage the input tile into GPU shared
// memory one channel at a time (instead of all channels at once).
class LayerPerLayer : public Generator<LayerPerLayer> {
  public:
    Input<Buffer<float>> input{"input", 3};         // (w, h, c)
    Output<Buffer<float>> output{"output", 3};         // (w, h, c)

    // NOTE(review): cf is declared but never used below; ci indexes channels.
    Var x{"x"}, y{"y"}, cf{"cf"}, ci{"ci"};
    Func clamped_input{"clamped_input"};
    Func conv{"conv"};
    RDom rdom_conv;

    // Defines the pure algorithm (no scheduling): a per-channel 5x5 box sum
    // over an edge-clamped view of the input.
    void generate() {
        int kernel_size = 5;
        int taps_per_side = kernel_size / 2;
        // Reduction domain covering [-2, 2] x [-2, 2] around each output pixel
        // (each RDom dimension is given as {min, extent}).
        rdom_conv = {{{-taps_per_side, int(kernel_size)},
                      {-taps_per_side, int(kernel_size)}}, "rdom_conv"};

        // Clamp out-of-bounds reads to the nearest edge pixel.
        clamped_input(x, y, ci) = Halide::BoundaryConditions::repeat_edge(input)(x, y, ci);

        // Initialize to zero, then accumulate the 5x5 window per channel.
        conv(x, y, ci) = 0.0f;
        conv(x, y, ci) += clamped_input(x + rdom_conv.x, y + rdom_conv.y, ci);

        output(x, y, ci) = conv(x, y, ci);

        Expr w = input.dim(0).extent();
        Expr h = input.dim(1).extent();
        Expr d = input.dim(2).extent();

        // Pin input and output to identical zero-based bounds so bounds
        // inference has no slack to work around.
        input.dim(0).set_bounds(0, w);
        input.dim(1).set_bounds(0, h);
        input.dim(2).set_bounds(0, d);
        output.dim(0).set_bounds(0, w);
        output.dim(1).set_bounds(0, h);
        output.dim(2).set_bounds(0, d);
    }

    // Scheduling. The else-branch below is the problematic schedule that
    // triggers the internal error reported in this issue.
    void schedule() {
      Var xo{"xo"}, xi{"xi"};
      Var yo{"yo"}, yi{"yi"};

      // 32x32 GPU tiles; ci becomes the innermost (serial) loop inside the
      // per-pixel thread, so channels are processed one after another.
      output
        .reorder(ci, x, y)
        .partition(x, Partition::Never)
        .partition(y, Partition::Never)
        .gpu_tile(x, y, xo, yo, xi, yi, 32, 32)
        ;

      conv
        .update()
        .partition(rdom_conv.x, Partition::Never)
        .partition(rdom_conv.y, Partition::Never)
        ;

      if constexpr (false) {
        // Here, we prefetch all channels of the tile into shared memory.
        // This works, but loads all channels at once, requiring a lot of
        // shared memory.
        clamped_input.in(conv)
          .compute_at(output, xo)
          .partition(x, Partition::Never)
          .partition(y, Partition::Never)
          // As the convolution requires more input pixels than output pixels (and
          // threads), let's make a single thread load 8 numbers instead.
          .tile(x, y, xo, yo, xi, yi, 4, 2)
          .gpu_threads(xo, yo)
          .vectorize(xi) // Generate ld.global.v4.f32 to get 128 bit loads.
          .unroll(yi)
          ;
      } else {
        // Let's try to prefetch only one channel of a tile into shared memory.
        // compute_at(output, ci) places this staging Func inside output's
        // gpu_thread loops; its own gpu_threads(xo, yo) then adds a second
        // set of thread dimensions. The resulting kernel has 4 thread dims,
        // which fails the check at OffloadGPULoops.cpp:228
        // ("is_const_one(bounds.num_threads[3]) && ...").
        clamped_input.in(conv)
          .compute_at(output, ci)
          .partition(x, Partition::Never)
          .partition(y, Partition::Never)
          .tile(x, y, xo, yo, xi, yi, 4, 2)
          .gpu_threads(xo, yo)
          .vectorize(xi)
          .unroll(yi)
          ;
      }

      // Dump the conceptual loop nest (see the output pasted below in the
      // issue) so the nesting of the gpu_thread loops is visible.
      output.print_loop_nest();
    }


};

// Registers the generator under the name "layer_per_layer" for GenGen.
HALIDE_REGISTER_GENERATOR(LayerPerLayer, layer_per_layer)

And the convenience Makefile (update HALIDE_DISTRIB):

# Path to the Halide distribution; update this for your machine.
HALIDE_DISTRIB=/home/martijn/3rd/halide/distrib
# GenGen.cpp supplies main() for building a standalone generator executable.
GEN_GEN=$(HALIDE_DISTRIB)/tools/GenGen.cpp

CC=clang++-17

# Build the generator executable from the repro source plus GenGen.
generator: layer_per_layer.cpp
	$(CC) layer_per_layer.cpp $(GEN_GEN) -g -O1 -std=c++17 -fno-rtti -lpthread -ldl -L$(HALIDE_DISTRIB)/lib/ -lHalide  -I$(HALIDE_DISTRIB)/include -o generator

# Run the generator for the CUDA target, emitting the conceptual Stmt
# (text and HTML) into the current directory.
stmt: generator
	LD_LIBRARY_PATH=$(HALIDE_DISTRIB)/lib ./generator -g layer_per_layer -e conceptual_stmt,conceptual_stmt_html -o . target=host-cuda

Then run `make stmt`.

The loop nest it prints is:

produce output:
  gpu_block y.yo<Default_GPU>:
    gpu_block x.xo<Default_GPU>:
      gpu_thread y.yi in [0, 31]<Default_GPU>:
        gpu_thread x.xi in [0, 31]<Default_GPU>:
          for ci:  // layer per layer
            produce clamped_input_in_conv:
              // load the current layer into shared memory
              gpu_thread y.yo in [0, 2]<Default_GPU>:
                gpu_thread x.xo in [0, 1]<Default_GPU>:
                  unrolled y.yi in [0, 1]:
                    vectorized x.xi in [0, 3]:
                      clamped_input_in_conv(...) = ...
            consume clamped_input_in_conv:
              produce conv:
                conv(...) = ...
                for rdom_conv in [-2, 2]:
                  for rdom_conv in [-2, 2]:
                    conv(...) = ...
              consume conv:
                output(...) = ...
Internal Error at /home/martijn/3rd/halide/src/OffloadGPULoops.cpp:228 triggered by user code at :
Condition failed: is_const_one(bounds.num_threads[3]) && is_const_one(bounds.num_blocks[3]):
32, 1

So, clearly, because of the extra gpu_thread loops, Halide thinks it needs to generate a kernel with 4 thread dimensions.

Metadata

Metadata

Assignees

No one assigned

    Type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions