vulkan: Switch MUL_MAT_VEC to 4 K per iteration for F16/32#22887
Conversation
|
Oops, ran out the door too quick and broke something. I'll fix the branch and after dinner. |
0e35f56 to
5c62199
Compare
|
Okay, this should be good for review now. Its not the prettiest thing in the world but +9% and getting us back to beating SYCL seems worth it. |
| return; | ||
| } | ||
| compute_outputs(first_row, p.stride_d - first_row); | ||
| compute_outputs(first_row, min(NUM_ROWS, p.stride_d - first_row)); |
There was a problem hiding this comment.
This should be unnecessary due to the check at line 289.
There was a problem hiding this comment.
Yes, should be, but see commit message - mesa's current unsigned upper bound pass doesn't consider conditionals at all. It thus can't properly unroll the inner loop. This commit currently shows an almost exactly 1% speedup on my hardware.
Against mesa git, this shows a 4.8% performance improvement for tg128 on Qwen3.5-9B:BF16 on Intel BMG. Note that this breaks some tests until the last commit which fixes OOB A reads.
Against mesa git, this shows a 3.3% performance improvement for tg128 on Qwen3.5-9B:BF16 on Intel BMG.
Mesa's UUB logic can't see through conditionals, limiting its ability to understand the bounds on the `num_rows` field in the cleanup run. Making it explicit that `num_rows` is, indeed, always <= `NUM_ROWS` helps mesa make slightly better codegen. Against mesa git, this currently shows a 1% performance improvement in tg128 on Qwen3.5-9B:BF16 on Intel BMG.
5c62199 to
961d347
Compare
|
Addressed feedback. Full diff$ git diff-tree -U2 5c62199e4 961d347bd
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
index ef9ecffed..e67299fde 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
@@ -37,5 +37,5 @@ vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) {
const vec2 a = data_a_packed32[(a_offset + ib)/2];
const vec2 b = data_a_packed32[(a_offset + ib)/2 + 1];
- return vec4(a.x, a.y, b.x, b.y);
+ return vec4(a, b);
}
#endif
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
index 4ab55542c..df0317509 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -43,14 +43,4 @@ vec4 load_b(const uint j, const uint iybs, const uint iqs, const bool lastiter,
}
-vec4 load_b_aligned(const uint j, const uint iybs, const uint iqs, const bool lastiter, out bool OOB_y, out bool OOB_z, out bool OOB_w) {
- OOB_w = lastiter && (iybs + iqs + y_offset*3 >= p.ncols);
- if (!OOB_w) {
- OOB_y = false;
- OOB_z = false;
- return data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4];
- } else
- return load_b(j, iybs, iqs, lastiter, OOB_y, OOB_z, OOB_w);
-}
-
void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
{
@@ -137,5 +127,5 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
#if defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)
-void iter_aligned_nonquant(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
+void iter_aligned_nonquant(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i)
{
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
@@ -144,11 +134,7 @@ void iter_aligned_nonquant(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint
const uint iybs = col; // y block start index
- bool OOB_y;
- bool OOB_z;
- bool OOB_w;
- const vec4 b = load_b_aligned(j, iybs, iqs, lastiter, OOB_y, OOB_z, OOB_w);
+ const vec4 b = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4];
uint ibi = first_row*p.ncols;
- const uint rows_in_bounds = OOB_w ? num_rows - 1 : num_rows;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib = (ibi + col)/QUANT_K; // block index
@@ -160,21 +146,4 @@ void iter_aligned_nonquant(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint
temp[j][n] += dot(v, b);
}
- if (OOB_w) {
- const uint ib = (ibi + col)/QUANT_K; // block index
- vec4 v;
- if (!OOB_z) {
- const vec2 v0 = dequantize(ib, iqs, a_offset);
- const FLOAT_TYPE v1 = dequantize1(ib + 2/QUANT_R, iqs, a_offset);
- v = vec4(v0.x, v0.y, v1, 0);
- } else if (!OOB_y) {
- const vec2 v0 = dequantize(ib, iqs, a_offset);
- v = vec4(v0.x, v0.y, 0, 0);
- } else {
- v = vec4(dequantize1(ib, iqs, a_offset), 0, 0, 0);
- }
-
- // matrix multiplication
- temp[j][rows_in_bounds] += dot(v, b);
- }
}
}
@@ -187,5 +156,5 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
const bool is_aligned_nonquant =
p.batch_stride_b % 4 == 0 && b_offset % 4 == 0 &&
- p.ncols % 2 == 0 && BLOCK_SIZE % 4 == 0 &&
+ p.ncols % 4 == 0 && BLOCK_SIZE % 4 == 0 &&
K_PER_ITER == 4;
@@ -221,9 +190,9 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
// Manually partially unroll the loop
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
- iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER, false);
+ iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
i++;
}
}
- } else
+ } else {
#endif
while (i < unrolled_iters) {
@@ -234,4 +203,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
}
}
+#if K_PER_ITER == 4
+ }
+#endif
unroll_count = 2;
@@ -249,9 +221,9 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
// Manually partially unroll the loop
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
- iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER, false);
+ iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
i++;
}
}
- } else
+ } else {
#endif
while (i < unrolled_iters) {
@@ -262,12 +234,15 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
}
}
+#if K_PER_ITER == 4
+ }
+#endif
#if K_PER_ITER == 4
if (is_aligned_nonquant) {
while (i < num_iters) {
- iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER, true);
+ iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
i++;
}
- } else
+ } else {
#endif
while (i < num_iters) {
@@ -275,4 +250,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
i++;
}
+#if K_PER_ITER == 4
+ }
+#endif
reduce_result(temp, d_offset, first_row, num_rows, tid);I tweaked the last commit to not bother with OOB checks by relying on |
There was a TODO to fix the OOB reads from the A matrix which we do here. It is within performance noise (+<0.1%) in tg128 for Qwen3.5-9B:BF16 on Intel BMG.
961d347 to
d853bfe
Compare
|
Okay, dropped all the pretense of optimization in the slow unaligned path. Full diff$ git diff-tree -U2 961d347bd d853bfe38
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
index df0317509..5a9d0e778 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -70,10 +70,5 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
#endif
uint ibi = first_row*p.ncols;
-#if K_PER_ITER == 8
- const uint rows_in_bounds = num_rows;
-#else
- const uint rows_in_bounds = OOB_w ? num_rows - 1 : num_rows;
-#endif
- [[unroll]] for (uint n = 0; n < rows_in_bounds; ++n) {
+ [[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib = (ibi + col)/QUANT_K; // block index
ibi += p.ncols;
@@ -98,29 +93,23 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
temp[j][n] += rowtmp;
#else
- const vec4 v = dequantize4(ib, iqs, a_offset);
-
- // matrix multiplication
- temp[j][n] += dot(v, b);
-#endif
- }
-#if K_PER_ITER != 8
- if (OOB_w) {
- const uint ib = (ibi + col)/QUANT_K; // block index
- vec4 v;
- if (!OOB_z) {
+ if (!OOB_w) {
+ const vec4 v = dequantize4(ib, iqs, a_offset);
+ temp[j][n] += dot(v, b);
+ } else if (!OOB_z) {
const vec2 v0 = dequantize(ib, iqs, a_offset);
const FLOAT_TYPE v1 = dequantize1(ib + 2/QUANT_R, iqs, a_offset);
- v = vec4(v0.x, v0.y, v1, 0);
+ const vec3 v = vec3(v0.x, v0.y, v1);
+ const vec3 b0 = vec3(b.x, b.y, b.z);
+ temp[j][n] += dot(v, b0);
} else if (!OOB_y) {
const vec2 v0 = dequantize(ib, iqs, a_offset);
- v = vec4(v0.x, v0.y, 0, 0);
+ const vec2 b0 = vec2(b.x, b.y);
+ temp[j][n] += dot(v0, b0);
} else {
- v = vec4(dequantize1(ib, iqs, a_offset), 0, 0, 0);
+ const FLOAT_TYPE v = dequantize1(ib, iqs, a_offset);
+ temp[j][n] = fma(v, b.x, temp[j][n]);
}
-
- // matrix multiplication
- temp[j][rows_in_bounds] += dot(v, b);
- }
#endif
+ }
}
} |
|
not sure if this is of any help, but heres a dockerfile based on arch, pulling the 26.1 packages from the testing repos. might be more bleeding than cutting edge, but maybe its worthwile for testing...I added the AMD and intel drivers by default...if anyone wants the green devil stuff, they need to add it. Havent owned a nvidia card for decades... .devops/vulkan-arch.Dockerfile
compose entry example
as of writing: Will test your latest additons once merged inside that container. |
0cc4m
left a comment
There was a problem hiding this comment.
LGTM. Sorry for the delay.
|
@jeffbolznv Any other concerns left? |
|
Works for me - can confirm the improvement, in my arch container, its slighly faster than sycl now in tg. Nice work as always :) |
* origin/master: hexagon: add support for Q4_1 in MUL_MAT and MUL_MAT_ID (ggml-org#23647) ggml-webgpu: Fix how to dispatch WG to some ops (ggml-org#23750) vulkan: Switch MUL_MAT_VEC to 4 K per iteration for F16/32 (ggml-org#22887) vulkan: use GL_NV_cooperative_matrix_decode_vector for faster matmul (ggml-org#23541) vulkan: add REPEAT op support for f16 to f16. (ggml-org#23298) ci : move ARM jobs to self-hosted + disable kleidiai mac release (ggml-org#23780) vendor : update cpp-httplib to 0.46.0 (ggml-org#23650) pyproject : add conversion folder and update dependencies (ggml-org#23746) CUDA: restrict PDL to CTK >= 12.3 due to MSVC issues (ggml-org#23742) ci : bump cuda release to 13.3 (ggml-org#23749) common : fix env names to all have LLAMA_ARG_ prefix (ggml-org#23778) ci : fix windows ccaches (ggml-org#23777) ci : remove wasm test (ggml-org#23733) vulkan: avoid preferring transfer queue on AMD UMA devices (ggml-org#22455) ci : add ccache to server builds + fix undefined sanitizer build (ggml-org#23763) docs : fix duplicated "the" in granitevision and model-conversion docs (ggml-org#23767) convert: add MiniCPM5 tokenizer support (ggml-org#23384) server : fix the log message when using SSL (ggml-org#23393)
…22887) * vulkan: Switch MUL_MAT_VEC to 4 K per iteration for F16/32 Against mesa git, this shows a 4.8% performance improvement for tg128 on Qwen3.5-9B:BF16 on Intel BMG. Note that this breaks some tests until the last commit which fixes OOB A reads. * vulkan: Use aligned loads in mul_mat_vec when available Against mesa git, this shows a 3.3% performance improvement for tg128 on Qwen3.5-9B:BF16 on Intel BMG. * Make explicit that `num_rows` is <= `NUM_ROWS` in mul_mat_vec Mesa's UUB logic can't see through conditionals, limiting its ability to understand the bounds on the `num_rows` field in the cleanup run. Making it explicit that `num_rows` is, indeed, always <= `NUM_ROWS` helps mesa make slightly better codegen. Against mesa git, this currently shows a 1% performance improvement in tg128 on Qwen3.5-9B:BF16 on Intel BMG. * vulkan: Fix OOB A reads in MUL_MAT_VEC for odd sizes There was a TODO to fix the OOB reads from the A matrix which we do here. It is within performance noise (+<0.1%) in tg128 for Qwen3.5-9B:BF16 on Intel BMG.
…22887) * vulkan: Switch MUL_MAT_VEC to 4 K per iteration for F16/32 Against mesa git, this shows a 4.8% performance improvement for tg128 on Qwen3.5-9B:BF16 on Intel BMG. Note that this breaks some tests until the last commit which fixes OOB A reads. * vulkan: Use aligned loads in mul_mat_vec when available Against mesa git, this shows a 3.3% performance improvement for tg128 on Qwen3.5-9B:BF16 on Intel BMG. * Make explicit that `num_rows` is <= `NUM_ROWS` in mul_mat_vec Mesa's UUB logic can't see through conditionals, limiting its ability to understand the bounds on the `num_rows` field in the cleanup run. Making it explicit that `num_rows` is, indeed, always <= `NUM_ROWS` helps mesa make slightly better codegen. Against mesa git, this currently shows a 1% performance improvement in tg128 on Qwen3.5-9B:BF16 on Intel BMG. * vulkan: Fix OOB A reads in MUL_MAT_VEC for odd sizes There was a TODO to fix the OOB reads from the A matrix which we do here. It is within performance noise (+<0.1%) in tg128 for Qwen3.5-9B:BF16 on Intel BMG.
Against mesa git, this shows a 9% performance improvement for tg128 on Qwen3.5-9B:BF16 on Intel BMG.
A few cleanups to MUL_MAT_VEC including fixing the OOB A read, but the real commit is 1 and 2, which shows a total ~9% performance improvement on Intel Arc B60 on mesa (where we're back to beating SYCL!). I'm curious how other devices deal with this.
I'm not really a huge fan of the code duplication but its not that bad, and more splitting stuff up didn't seem worth it. We could compile a different shader to make this all compile-time but that similarly didn't seem worth all that much.
The last commit is similarly kinda ugly, but required to make this all work (and fixes UB anyway, so that's good).