Skip to content

Commit 21110eb

Browse files
TheTomclaude authored and committed
fix: CPU vec_dot heap allocation for turbo/TQ types (n > 4096 models)
Stack-allocated float tmp[4096] buffers in CPU vec_dot functions crashed on models with intermediate_size > 4096 (e.g. TinyLlama with intermediate_size 5632, Qwen3.5-27B with 18944). Replaced with heap allocation. Affects CPU-only inference fallback path. GPU users unaffected. Reported by @oemc1470 on RX 6600 (gfx1032), where broken HIP forced CPU fallback. Tested: Qwen3.5-27B Config I, CPU-only (-ngl 0), intermediate_size=18944. No crash, no assert. Co-Authored-By: tturney@psyguard.ai Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 716dd77 commit 21110eb

1 file changed

Lines changed: 21 additions & 12 deletions

File tree

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -3379,15 +3379,16 @@ static void ggml_vec_dot_turbo3_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
33793379
GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
33803380

33813381
// Dequantize turbo3 to f32 temp buffer, then dot
3382-
float tmp[4096]; // max head_dim
3383-
GGML_ASSERT(n <= 4096);
3382+
float * tmp = (float *)malloc(n * sizeof(float));
3383+
GGML_ASSERT(tmp != NULL);
33843384
ggml_get_type_traits(GGML_TYPE_TURBO3_0)->to_float(vx, tmp, n);
33853385

33863386
const float * y = (const float *)vy;
33873387
float sum = 0.0f;
33883388
for (int i = 0; i < n; i++) {
33893389
sum += tmp[i] * y[i];
33903390
}
3391+
free(tmp);
33913392
*s = sum;
33923393
}
33933394

@@ -3398,15 +3399,16 @@ static void ggml_vec_dot_turbo2_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
33983399
GGML_ASSERT(nrc == 1);
33993400
GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
34003401

3401-
float tmp[4096];
3402-
GGML_ASSERT(n <= 4096);
3402+
float * tmp = (float *)malloc(n * sizeof(float));
3403+
GGML_ASSERT(tmp != NULL);
34033404
ggml_get_type_traits(GGML_TYPE_TURBO2_0)->to_float(vx, tmp, n);
34043405

34053406
const float * y = (const float *)vy;
34063407
float sum = 0.0f;
34073408
for (int i = 0; i < n; i++) {
34083409
sum += tmp[i] * y[i];
34093410
}
3411+
free(tmp);
34103412
*s = sum;
34113413
}
34123414

@@ -3417,15 +3419,16 @@ static void ggml_vec_dot_turbo4_0_f32(int n, float * GGML_RESTRICT s, size_t bs,
34173419
GGML_ASSERT(nrc == 1);
34183420
GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
34193421

3420-
float tmp[4096];
3421-
GGML_ASSERT(n <= 4096);
3422+
float * tmp = (float *)malloc(n * sizeof(float));
3423+
GGML_ASSERT(tmp != NULL);
34223424
ggml_get_type_traits(GGML_TYPE_TURBO4_0)->to_float(vx, tmp, n);
34233425

34243426
const float * y = (const float *)vy;
34253427
float sum = 0.0f;
34263428
for (int i = 0; i < n; i++) {
34273429
sum += tmp[i] * y[i];
34283430
}
3431+
free(tmp);
34293432
*s = sum;
34303433
}
34313434

@@ -3437,18 +3440,21 @@ static void ggml_vec_dot_tq3_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
34373440
GGML_ASSERT(nrc == 1);
34383441
GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
34393442

3440-
float tmp[4096];
3441-
GGML_ASSERT(n <= 4096);
3443+
float * tmp = (float *)malloc(n * sizeof(float));
3444+
GGML_ASSERT(tmp != NULL);
34423445
ggml_get_type_traits(GGML_TYPE_TQ3_1S)->to_float(vx, tmp, n);
34433446

34443447
// Dequantize q8_0 and dot
3445-
float tmp2[4096];
3448+
float * tmp2 = (float *)malloc(n * sizeof(float));
3449+
GGML_ASSERT(tmp2 != NULL);
34463450
ggml_get_type_traits(GGML_TYPE_Q8_0)->to_float(vy, tmp2, n);
34473451

34483452
float sum = 0.0f;
34493453
for (int i = 0; i < n; i++) {
34503454
sum += tmp[i] * tmp2[i];
34513455
}
3456+
free(tmp);
3457+
free(tmp2);
34523458
*s = sum;
34533459
}
34543460

@@ -3460,17 +3466,20 @@ static void ggml_vec_dot_tq4_1s_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
34603466
GGML_ASSERT(nrc == 1);
34613467
GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc);
34623468

3463-
float tmp[4096];
3464-
GGML_ASSERT(n <= 4096);
3469+
float * tmp = (float *)malloc(n * sizeof(float));
3470+
GGML_ASSERT(tmp != NULL);
34653471
ggml_get_type_traits(GGML_TYPE_TQ4_1S)->to_float(vx, tmp, n);
34663472

3467-
float tmp2[4096];
3473+
float * tmp2 = (float *)malloc(n * sizeof(float));
3474+
GGML_ASSERT(tmp2 != NULL);
34683475
ggml_get_type_traits(GGML_TYPE_Q8_0)->to_float(vy, tmp2, n);
34693476

34703477
float sum = 0.0f;
34713478
for (int i = 0; i < n; i++) {
34723479
sum += tmp[i] * tmp2[i];
34733480
}
3481+
free(tmp);
3482+
free(tmp2);
34743483
*s = sum;
34753484
}
34763485

0 commit comments

Comments
 (0)