
Commit 16e93d5

Marcel committed
ggml: add TQ3_0 (TurboQuant 3-bit) KV cache quantization type
Implements a 3.5 bits/value KV cache quantization type based on the
TurboQuant/PolarQuant/QJL papers from Google Research.

Algorithm:
- Per-block Walsh-Hadamard Transform (WHT32) with fixed sign flips makes
  any input distribution approximately Gaussian (by the CLT)
- 2-bit Max-Lloyd optimal codebook {-1.510, -0.453, +0.453, +1.510},
  tuned for a Gaussian source, achieves near-optimal MSE
- 1-bit QJL residual signs for error correction
- Per-block FP16 scale factor

Block format: 14 bytes / 32 values = 3.5 bits/value (4.6x smaller than F16)
- qs[8]: 2-bit codebook indices (4 per byte)
- qr[4]: QJL residual signs (8 per byte)
- gamma: FP16 per-block scale

Fused MMVQ kernel: since the WHT is orthogonal, dot(q,k) = dot(WHT(q), WHT(k)).
Apply the WHT to Q8_1 query values inside the fused vec_dot kernel (int32
butterfly) and compute the dot product in rotated space. No dequantize+MUL_MAT
fallback needed — speed matches Q4_0.

Results (Qwen3.5-0.8B-Q5_K_M, wikitext-2, Radeon 8060S):
  F16:   PPL = 20.05,          tg128 = 181.8 t/s
  Q4_0:  PPL = 20.14 (+0.4%),  tg128 = 179.1 t/s
  TQ3_0: PPL = 21.21 (+5.8%),  tg128 = 177.9 t/s

References:
- TurboQuant (arXiv:2504.19874) — ICLR 2026
- PolarQuant (arXiv:2502.02617) — AISTATS 2026
- QJL (arXiv:2406.03482)
1 parent 914eb5f commit 16e93d5
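
For orientation before the diffs: a minimal host-side sketch of the encode path the message describes (fixed sign flips, WHT32 rotation, nearest-centroid 2-bit quantization, 1-bit residual sign). The helper names, the sign-pattern argument, and the RMS-based definition of the per-block scale are illustrative assumptions; the committed quantize_row_tq3_0_ref may differ in those details.

#include <cmath>
#include <cstdint>
#include <cstring>

// 2-bit Max-Lloyd codebook for a unit Gaussian (values from the commit message).
static const float kCentroids[4] = { -1.510f, -0.453f, 0.453f, 1.510f };

// In-place orthonormal WHT32: butterfly stages plus 1/sqrt(32) normalization.
static void wht32(float v[32]) {
    for (int step = 1; step < 32; step <<= 1) {
        for (int i = 0; i < 32; i += 2 * step) {
            for (int j = i; j < i + step; ++j) {
                const float a = v[j], b = v[j + step];
                v[j]        = a + b;
                v[j + step] = a - b;
            }
        }
    }
    for (int i = 0; i < 32; ++i) v[i] *= 0.17677669529663688f; // 1/sqrt(32)
}

// Encode one 32-value block. `signs` is the fixed +/-1 flip pattern (the real
// pattern lives in the kernel). The scale definition here is an assumption;
// the struct comment in ggml-common.h describes gamma as a residual norm.
static void tq3_0_encode_block_sketch(const float x[32], const int8_t signs[32],
                                      uint8_t qs[8], uint8_t qr[4], float * gamma) {
    float v[32];
    for (int i = 0; i < 32; ++i) v[i] = x[i] * signs[i]; // fixed sign flips
    wht32(v);                                            // now ~Gaussian by CLT

    float ss = 0.0f;
    for (int i = 0; i < 32; ++i) ss += v[i] * v[i];
    const float scale = sqrtf(ss / 32.0f);               // per-block scale (assumed RMS)
    *gamma = scale;

    memset(qs, 0, 8);
    memset(qr, 0, 4);
    for (int i = 0; i < 32; ++i) {
        const float u = scale > 0.0f ? v[i] / scale : 0.0f;
        int best = 0;                                    // nearest of the 4 centroids
        for (int c = 1; c < 4; ++c) {
            if (fabsf(u - kCentroids[c]) < fabsf(u - kCentroids[best])) best = c;
        }
        qs[i / 4] |= (uint8_t) (best << (2 * (i % 4)));  // 4 indices per byte
        if (u < kCentroids[best]) {                      // 1-bit residual sign
            qr[i / 8] |= (uint8_t) (1 << (i % 8));
        }
    }
}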

20 files changed

Lines changed: 445 additions & 3 deletions

common/arg.cpp

Lines changed: 1 addition & 0 deletions
@@ -387,6 +387,7 @@ const std::vector<ggml_type> kv_cache_types = {
     GGML_TYPE_IQ4_NL,
     GGML_TYPE_Q5_0,
     GGML_TYPE_Q5_1,
+    GGML_TYPE_TQ3_0,
 };
 
 static ggml_type kv_cache_type_from_str(const std::string & s) {

ggml/include/ggml.h

Lines changed: 2 additions & 1 deletion
@@ -428,7 +428,8 @@ extern "C" {
     // GGML_TYPE_IQ4_NL_8_8 = 38,
     GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
     GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
-    GGML_TYPE_COUNT = 41,
+    GGML_TYPE_TQ3_0 = 41, // TurboQuant 3-bit polar + QJL (FP16 per-block scale)
+    GGML_TYPE_COUNT = 42,
 };
 
 // precision

ggml/src/ggml-common.h

Lines changed: 15 additions & 0 deletions
@@ -266,6 +266,21 @@ typedef struct {
 } block_tq2_0;
 static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
 
+// TurboQuant 3-bit quantization (3.5 bpw)
+// Per the TurboQuant paper (Algorithm 2: TurboQuant_prod), ICLR 2026
+// Each block of 32 values is quantized as:
+//  - 2-bit MSE codebook indices (after random rotation Π·x)
+//  - 1-bit QJL residual signs (sign(S·r) where r = x - dequant_mse(quant_mse(x)))
+//  - FP16 residual norm ||r||₂ for QJL scaling
+// Requires per-model rotation matrices Π and S (stored externally)
+#define QK_TQ3_0 32
+typedef struct {
+    uint8_t qs[QK_TQ3_0 / 4]; // 2-bit codebook indices, 32 × 2 bits = 8 bytes
+    uint8_t qr[QK_TQ3_0 / 8]; // QJL residual signs, 32 × 1 bit = 4 bytes
+    ggml_half gamma;          // ||residual||₂ for QJL correction scaling
+} block_tq3_0;
+static_assert(sizeof(block_tq3_0) == QK_TQ3_0/4 + QK_TQ3_0/8 + sizeof(ggml_half), "wrong tq3_0 block size/padding");
+
 //
 // Super-block quantization structures
 //
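
To sanity-check the 14-byte layout above, here is a standalone mirror of the struct with the per-value addressing spelled out. ggml_half is stood in by uint16_t, the index bit ordering matches the CUDA dequantizer further down, and the sign-bit polarity is an assumption.

#include <cstdint>

#define QK_TQ3_0_SKETCH 32

// Stand-in for ggml_half storage (2 bytes holding an FP16 bit pattern).
typedef uint16_t half_bits;

typedef struct {
    uint8_t   qs[QK_TQ3_0_SKETCH / 4]; // 8 bytes: 2-bit indices, 4 per byte
    uint8_t   qr[QK_TQ3_0_SKETCH / 8]; // 4 bytes: 1-bit signs,   8 per byte
    half_bits gamma;                   // 2 bytes: per-block FP16 scale/norm
} block_tq3_0_sketch;

// 8 + 4 + 2 = 14 bytes for 32 values -> 112 bits / 32 = 3.5 bpw.
static_assert(sizeof(block_tq3_0_sketch) == 14, "expected 3.5 bpw layout");

// Value i's codebook index (0..3), using the same shifts as the CUDA kernel.
static inline int tq3_0_index(const block_tq3_0_sketch * b, int i) {
    return (b->qs[i / 4] >> (2 * (i % 4))) & 3;
}

// Value i's QJL residual sign; bit=1 meaning negative is an assumption here.
static inline int tq3_0_sign(const block_tq3_0_sketch * b, int i) {
    return ((b->qr[i / 8] >> (i % 8)) & 1) ? -1 : +1;
}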

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 4 additions & 0 deletions
@@ -390,6 +390,10 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_TQ3_0] = {
+        .from_float = quantize_row_tq3_0,
+        .nrows = 1,
+    },
     [GGML_TYPE_I32] = {
         .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
     },

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 8 additions & 1 deletion
@@ -448,7 +448,11 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
                    op->type != GGML_TYPE_IQ1_S &&
                    op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
-            return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
+            {
+                const auto * traits = ggml_get_type_traits_cpu(src0->type);
+                return traits->vec_dot != NULL &&
+                       (src1->type == GGML_TYPE_F32 || src1->type == traits->vec_dot_type);
+            }
         case GGML_OP_SOFT_MAX_BACK: {
             if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
                 return false;
@@ -466,6 +470,9 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
         case GGML_OP_OUT_PROD:
             return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
                    src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_FLASH_ATTN_EXT:
+            // K type must have vec_dot for CPU flash attention
+            return ggml_get_type_traits_cpu(src1->type)->vec_dot != NULL;
         default:
             return true;
     }
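
These two checks are what keep TQ3_0 off the slow CPU paths: its type traits entry above registers from_float but no vec_dot, so both predicates reject TQ3_0 operands and the scheduler routes those ops to a backend with the fused kernel. A minimal standalone check of that interplay, with placeholder types (not the real ggml structs):

#include <cassert>

typedef void (*vec_dot_fn)(const void *, const void *, float *);

struct traits_sketch {
    bool       has_from_float;
    vec_dot_fn vec_dot;      // NULL for TQ3_0 on CPU
    int        vec_dot_type; // unused when vec_dot is NULL
};

// MUL_MAT: src0 needs a vec_dot, and src1 must be F32 or vec_dot_type.
static bool supports_mul_mat(const traits_sketch & t, int src1_type, int f32) {
    return t.vec_dot != nullptr && (src1_type == f32 || src1_type == t.vec_dot_type);
}

// FLASH_ATTN_EXT: the K tensor's type must have a vec_dot.
static bool supports_flash_attn(const traits_sketch & k_traits) {
    return k_traits.vec_dot != nullptr;
}

int main() {
    const traits_sketch tq3_0 = { /*has_from_float=*/true, /*vec_dot=*/nullptr, 0 };
    assert(!supports_mul_mat(tq3_0, /*src1_type=*/1, /*f32=*/1)); // rejected even for F32 src1
    assert(!supports_flash_attn(tq3_0));
    return 0;
}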

ggml/src/ggml-cpu/ops.cpp

Lines changed: 7 additions & 0 deletions
@@ -678,6 +678,7 @@ void ggml_compute_forward_add(
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -1128,6 +1129,7 @@ void ggml_compute_forward_add1(
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -1257,6 +1259,7 @@ void ggml_compute_forward_acc(
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -4345,6 +4348,7 @@ void ggml_compute_forward_out_prod(
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -4621,6 +4625,7 @@ void ggml_compute_forward_set(
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -4844,6 +4849,7 @@ void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -5569,6 +5575,7 @@ void ggml_compute_forward_clamp(
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
+        case GGML_TYPE_TQ3_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:

ggml/src/ggml-cpu/quants.c

Lines changed: 6 additions & 0 deletions
@@ -108,6 +108,12 @@ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy,
     quantize_row_tq2_0_ref(x, y, k);
 }
 
+void quantize_row_tq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(k % QK_TQ3_0 == 0);
+    block_tq3_0 * GGML_RESTRICT y = vy;
+    quantize_row_tq3_0_ref(x, y, k);
+}
+
 //===================================== Q8_K ==============================================
 
 void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {

ggml/src/ggml-cpu/quants.h

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 
 void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_tq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
 void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

ggml/src/ggml-cuda/common.cuh

Lines changed: 7 additions & 0 deletions
@@ -1029,6 +1029,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
     static constexpr int qi = QI3_S;
 };
 
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_TQ3_0> {
+    static constexpr int qk = QK_TQ3_0;     // 32
+    static constexpr int qr = 1;
+    static constexpr int qi = QK_TQ3_0 / 4; // 8
+};
+
 //////////////////////
 
 struct ggml_cuda_device_info {

ggml/src/ggml-cuda/convert.cu

Lines changed: 54 additions & 0 deletions
@@ -486,6 +486,50 @@ static __global__ void dequantize_block_mxfp4(const void * __restrict__ vx, dst_
     }
 }
 
+// TurboQuant TQ3_0: 2-bit codebook dequantization + inverse WHT
+// Dequantize to rotated space, then apply inverse WHT32 cooperatively
+template<typename dst_t>
+static __global__ void dequantize_block_tq3_0(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const float centroids[4] = { -1.510f, -0.4528f, 0.4528f, 1.510f };
+    const int8_t signs[32] = {
+        +1, -1, +1, +1, -1, -1, +1, -1, +1, +1, -1, +1, -1, +1, -1, -1,
+        +1, -1, -1, +1, +1, -1, +1, -1, -1, +1, +1, +1, -1, -1, +1, -1
+    };
+
+    const int64_t i = blockIdx.x;
+    const block_tq3_0 * x = (const block_tq3_0 *) vx;
+    const int tid = threadIdx.x;
+    if (tid >= 32) return;
+
+    const float d = __half2float(x[i].gamma);
+
+    // Step 1: Each thread dequantizes its value (in rotated space)
+    const int byte_idx = tid / 4;
+    const int bit_shift = 2 * (tid % 4);
+    const int idx = (x[i].qs[byte_idx] >> bit_shift) & 3;
+
+    __shared__ float shmem[32];
+    shmem[tid] = d * centroids[idx];
+    __syncthreads();
+
+    // Step 2: Cooperative inverse WHT (5 butterfly stages)
+    for (int step = 1; step < 32; step <<= 1) {
+        int partner = tid ^ step; // butterfly partner
+        float a = shmem[tid];
+        float b = shmem[partner];
+        __syncthreads();
+        if (tid < partner) {
+            shmem[tid]     = a + b;
+            shmem[partner] = a - b;
+        }
+        __syncthreads();
+    }
+
+    // Step 3: Normalize and undo sign flips
+    const float inv_sqrt32 = 0.17677669529663688f;
+    yy[i * QK_TQ3_0 + tid] = shmem[tid] * inv_sqrt32 * signs[tid];
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block_cuda(const void * vx, dst_t * y,
         const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
@@ -617,6 +661,12 @@ static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t
     dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+template<typename dst_t>
+static void dequantize_row_tq3_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_TQ3_0;
+    dequantize_block_tq3_0<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template <typename src_t, typename dst_t>
 static __global__ void convert_unary(
     const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
@@ -715,6 +765,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_MXFP4:
             return dequantize_row_mxfp4_cuda;
+        case GGML_TYPE_TQ3_0:
+            return dequantize_row_tq3_0_cuda;
         case GGML_TYPE_F32:
            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_BF16:
@@ -766,6 +818,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_MXFP4:
             return dequantize_row_mxfp4_cuda;
+        case GGML_TYPE_TQ3_0:
+            return dequantize_row_tq3_0_cuda;
         case GGML_TYPE_F16:
             return convert_unary_cont_cuda<half>;
         case GGML_TYPE_BF16:
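
The identity the fused MMVQ path leans on, that orthogonal transforms preserve inner products, is easy to verify on the host. The sketch below applies an orthonormal WHT32 (same butterfly ordering as the kernel above, in float rather than int32) to two random vectors and compares dot products before and after. Everything here is illustrative, not part of the commit.

#include <cstdio>
#include <cstdlib>

// Orthonormal WHT32: butterfly stages as in dequantize_block_tq3_0, followed
// by a 1/sqrt(32) normalization so that W^T W = I.
static void wht32(float v[32]) {
    for (int step = 1; step < 32; step <<= 1) {
        for (int i = 0; i < 32; i += 2 * step) {
            for (int j = i; j < i + step; ++j) {
                const float a = v[j], b = v[j + step];
                v[j]        = a + b;
                v[j + step] = a - b;
            }
        }
    }
    for (int i = 0; i < 32; ++i) v[i] *= 0.17677669529663688f; // 1/sqrt(32)
}

int main() {
    float q[32], k[32], qw[32], kw[32];
    for (int i = 0; i < 32; ++i) {
        q[i] = qw[i] = (float) rand() / RAND_MAX - 0.5f;
        k[i] = kw[i] = (float) rand() / RAND_MAX - 0.5f;
    }
    wht32(qw);
    wht32(kw);

    float dot = 0.0f, dot_wht = 0.0f;
    for (int i = 0; i < 32; ++i) {
        dot     += q[i] * k[i];
        dot_wht += qw[i] * kw[i];
    }
    // The two agree to float rounding error, which is why the fused kernel can
    // rotate Q and compute attention scores directly in rotated space.
    printf("dot = %.6f  dot(WHT) = %.6f\n", dot, dot_wht);
    return 0;
}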
