feat: asymmetric K/V support + q8_0 × turbo FA kernel instantiations

TheTom · claude · TheTom · commit 965a6ca57081 · 2026-03-29T14:36:31.000-05:00
Add full asymmetric K/V quantization support for Metal flash attention:

- Pipeline naming uses the k{type}_v{type} format for all FA kernels (335 total), eliminating underscore ambiguity in type names
- 90 turbo × turbo asymmetric instantiations (turbo2/3/4, all combinations)
- 150 q8_0 × turbo asymmetric instantiations (both directions, all head dims)
- Gatekeeper and assertion updated to allow turbo × turbo and q8_0 × turbo pairs
- Zero regression on existing symmetric paths (validated across 4 models, 2 machines)

The q8_0 × turbo kernels fix a silent dispatch failure where mixed q8_0-K + turbo-V configs would produce NaNs (turbo4-V) or fall through to undefined paths (turbo3-V). This enables the asymmetric quality rescue: q8_0-K + turbo-V recovers near-baseline PPL on low-bit models where symmetric turbo-K degrades.

Validated on Metal (M2 Pro + M5 Max):

- phi-4-Q8_0: symmetric turbo3 +4.2%, turbo4 +1.7% (no regression)
- Qwen2.5-7B Q4_K_M: q8_0-K + turbo4-V +1.0%, q8_0-K + turbo3-V +2.0% (rescued)
- Qwen3.5-35B MoE, 27B Dense, Mistral-24B: all healthy (no regression)
- Cross-hardware M2/M5 parity confirmed on all tested configs

Closes #27

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-Authored-By: tturney@psyguard.ai
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -1348,6 +1348,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
             dk,
             dv);
 
+
     snprintf(name, 256, "%s_mask=%d_sinks=%d_bias=%d_scap=%d_kvpad=%d_bcm=%d_ns10=%d_ns20=%d_nsg=%d",
             base,
             has_mask,
@@ -1414,6 +1415,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_v
             dk,
             dv);
 
+
     snprintf(name, 256, "%s_mask=%d_sink=%d_bias=%d_scap=%d_kvpad=%d_ns10=%d_ns20=%d_nsg=%d_nwg=%d",
             base,
             has_mask,
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -1196,14 +1196,21 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                 return false;
             }
             if (op->src[1]->type != op->src[2]->type) {
-                // Allow asymmetric K/V for supported turbo quantization pairs
+                // Allow asymmetric K/V for supported mixed pairs:
+                // - turbo x turbo (any combination)
+                // - q8_0 x turbo (either direction)
                 const bool k_is_turbo = (op->src[1]->type == GGML_TYPE_TURBO2_0 ||
                                          op->src[1]->type == GGML_TYPE_TURBO3_0 ||
                                          op->src[1]->type == GGML_TYPE_TURBO4_0);
                 const bool v_is_turbo = (op->src[2]->type == GGML_TYPE_TURBO2_0 ||
                                          op->src[2]->type == GGML_TYPE_TURBO3_0 ||
                                          op->src[2]->type == GGML_TYPE_TURBO4_0);
-                if (!k_is_turbo || !v_is_turbo) {
+                const bool k_is_q8 = (op->src[1]->type == GGML_TYPE_Q8_0);
+                const bool v_is_q8 = (op->src[2]->type == GGML_TYPE_Q8_0);
+                const bool supported = (k_is_turbo && v_is_turbo) ||
+                                       (k_is_q8 && v_is_turbo) ||
+                                       (k_is_turbo && v_is_q8);
+                if (!supported) {
                     return false;
                 }
             }
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2682,14 +2682,19 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
 
     GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
 
-    // Allow asymmetric K/V quantization for supported turbo pairs
+    // Allow asymmetric K/V quantization for supported mixed pairs
     {
         const ggml_type type_k = op->src[1]->type;
         const ggml_type type_v = op->src[2]->type;
         if (type_k != type_v) {
             const bool k_is_turbo = (type_k == GGML_TYPE_TURBO2_0 || type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0);
             const bool v_is_turbo = (type_v == GGML_TYPE_TURBO2_0 || type_v == GGML_TYPE_TURBO3_0 || type_v == GGML_TYPE_TURBO4_0);
-            GGML_ASSERT(k_is_turbo && v_is_turbo && "asymmetric K/V types only supported for turbo quantization pairs");
+            const bool k_is_q8 = (type_k == GGML_TYPE_Q8_0);
+            const bool v_is_q8 = (type_v == GGML_TYPE_Q8_0);
+            const bool supported = (k_is_turbo && v_is_turbo) ||
+                                   (k_is_q8 && v_is_turbo) ||
+                                   (k_is_turbo && v_is_q8);
+            GGML_ASSERT(supported && "asymmetric K/V types only supported for turbo and q8_0 mixed pairs");
         }
     }
 
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal

Original file line number	Diff line number	Diff line change
`@@ -2682,14 +2682,19 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {`
`2682`	`2682`
`2683`	`2683`	`GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);`
`2684`	`2684`
`2685`		`- // Allow asymmetric K/V quantization for supported turbo pairs`
	`2685`	`+ // Allow asymmetric K/V quantization for supported mixed pairs`
`2686`	`2686`	`{`
`2687`	`2687`	`const ggml_type type_k = op->src[1]->type;`
`2688`	`2688`	`const ggml_type type_v = op->src[2]->type;`
`2689`	`2689`	`if (type_k != type_v) {`
`2690`	`2690`	`const bool k_is_turbo = (type_k == GGML_TYPE_TURBO2_0 \|\| type_k == GGML_TYPE_TURBO3_0 \|\| type_k == GGML_TYPE_TURBO4_0);`
`2691`	`2691`	`const bool v_is_turbo = (type_v == GGML_TYPE_TURBO2_0 \|\| type_v == GGML_TYPE_TURBO3_0 \|\| type_v == GGML_TYPE_TURBO4_0);`
`2692`		`- GGML_ASSERT(k_is_turbo && v_is_turbo && "asymmetric K/V types only supported for turbo quantization pairs");`
	`2692`	`+ const bool k_is_q8 = (type_k == GGML_TYPE_Q8_0);`
	`2693`	`+ const bool v_is_q8 = (type_v == GGML_TYPE_Q8_0);`
	`2694`	`+ const bool supported = (k_is_turbo && v_is_turbo) \|\|`
	`2695`	`+ (k_is_q8 && v_is_turbo) \|\|`
	`2696`	`+ (k_is_turbo && v_is_q8);`
	`2697`	`+ GGML_ASSERT(supported && "asymmetric K/V types only supported for turbo and q8_0 mixed pairs");`
`2693`	`2698`	`}`
`2694`	`2699`	`}`
`2695`	`2700`