ggml-org · michaelw9999 · Mar 21, 2026 · Mar 22, 2026
@@ -1422,6 +1422,20 @@ extern "C" {
             struct ggml_tensor  * b,
             struct ggml_tensor  * ids);
 
+    GGML_API void ggml_mul_mat_add_scale( // attach a scale tensor to a mul_mat result node; the definition stores it in a->src[2]
+            struct ggml_tensor * a, // node to annotate; the caller shown in build_lora_mm passes the ggml_mul_mat result
+            struct ggml_tensor * scale); // read as float by the CPU NVFP4 path: element 0, or element i02 when ne[0] > 1
+
+    GGML_API const struct ggml_tensor * ggml_mul_mat_get_scale( // returns a->src[2] unchanged
+            const struct ggml_tensor * a); // NOTE(review): result is presumably NULL when no scale was attached - confirm before dereferencing
+
+    GGML_API void ggml_mul_mat_id_add_scale( // attach a scale tensor to a mul_mat_id result node; the definition stores it in a->src[3]
+            struct ggml_tensor * a, // node to annotate; the callers shown in build_moe_ffn pass the build_lora_mm_id result
+            struct ggml_tensor * scale); // read as float by the CPU NVFP4 path: element 0, or element cur_a when ne[0] > 1
+
+    GGML_API const struct ggml_tensor * ggml_mul_mat_id_get_scale( // returns a->src[3] unchanged
+            const struct ggml_tensor * a); // NOTE(review): result is presumably NULL when no scale was attached - confirm before dereferencing
+
     // A: m columns, n rows,
     // B: p columns, n rows,
     // result is m columns, p rows

@@ -1222,6 +1222,16 @@ static void ggml_compute_forward_mul_mat_one_chunk(
 
                 for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
                     vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+
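+                    // NVFP4 has an extra per-tensor scale; it is applied to the vec_dot result right here.
+                    // TODO: move this into a type-traits based mechanism so the kernel stays type-agnostic.
+                    // NOTE(review): the scale tensor is dereferenced without a NULL check, but build_lora_mm
+                    // only attaches it when w_s is non-NULL. Also, only tmp[ir0 - iir0] is scaled, which
+                    // presumably assumes num_rows_per_vec_dot == 1 for NVFP4 - confirm.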
+                    if (src0->type == GGML_TYPE_NVFP4) {
+                        float nvfp4_t_s = ((const float *) ggml_mul_mat_get_scale(dst)->data)[0];
+                        if (ggml_mul_mat_get_scale(dst)->ne[0] > 1) {
+                            nvfp4_t_s = ((const float *) ggml_mul_mat_get_scale(dst)->data)[i02];
+                        }
+                        tmp[ir0 - iir0] = tmp[ir0 - iir0] * nvfp4_t_s;
+                    }
                 }
 
                 for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
@@ -1490,6 +1500,14 @@ static void ggml_compute_forward_mul_mat_id_one_chunk(
 
                 for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
                     vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
+
+                    if (src0->type == GGML_TYPE_NVFP4) {
+                        float nvfp4_t_s = ((const float *) ggml_mul_mat_id_get_scale(dst)->data)[0];
+                        if (ggml_mul_mat_id_get_scale(dst)->ne[0] > 1) {
+                            nvfp4_t_s = ((const float *) ggml_mul_mat_id_get_scale(dst)->data)[cur_a];
+                        }
+                        tmp[ir0 - iir0] = tmp[ir0 - iir0] * nvfp4_t_s;
+                    }
                 }
 
                 memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));

@@ -3280,6 +3280,28 @@ struct ggml_tensor * ggml_mul_mat_id(
     return result;
 }
 
+GGML_API void ggml_mul_mat_add_scale( // attach `scale` to node `a` as an extra source
+        struct ggml_tensor * a, // a->op is not checked here
+        struct ggml_tensor * scale) { // stored as-is: no NULL, type or shape validation
+    a->src[2] = scale; // slot 2 is the slot ggml_mul_mat_get_scale reads back
+}
+
+GGML_API const struct ggml_tensor * ggml_mul_mat_get_scale( // read back the tensor stored by ggml_mul_mat_add_scale
+        const struct ggml_tensor * a) {
+    return a->src[2]; // returned unchanged; NOTE(review): presumably NULL if nothing was attached - confirm
+}
+
+GGML_API void ggml_mul_mat_id_add_scale( // attach `scale` to node `a` as an extra source
+        struct ggml_tensor * a, // a->op is not checked here
+        struct ggml_tensor * scale) { // stored as-is: no NULL, type or shape validation
+    a->src[3] = scale; // slot 3 rather than 2; presumably src[2] already holds `ids` for mul_mat_id - confirm
+}
+
+GGML_API const struct ggml_tensor * ggml_mul_mat_id_get_scale( // read back the tensor stored by ggml_mul_mat_id_add_scale
+        const struct ggml_tensor * a) {
+    return a->src[3]; // returned unchanged; NOTE(review): presumably NULL if nothing was attached - confirm
+}
+
 // ggml_out_prod
 
 static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {

@@ -903,6 +903,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
           ggml_tensor * cur,
           ggml_tensor * w_s) const {
     ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
+    if (w_s && w->type == GGML_TYPE_NVFP4) {
+        ggml_mul_mat_add_scale(res, w_s); // Save the NVFP4 per-tensor scale so it can be applied earlier, at the vec_dot
+        w_s = nullptr; // Clear the scale so it doesn't get applied here, to preserve precision
+    }
 
     for (const auto & lora : *loras) {
         llama_adapter_lora_weight * lw = lora.first->get_weight(w);
@@ -1007,7 +1011,13 @@ ggml_tensor * llm_graph_context::build_ffn(
      llm_ffn_op_type   type_op,
    llm_ffn_gate_type   type_gate,
                  int   il) const {
-    ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
+    ggml_tensor * tmp = cur;
+    if (up && up->type == GGML_TYPE_NVFP4) {
+        tmp = build_lora_mm(up, cur, up_s);
+        up_s = nullptr;
+    } else if (up) {
+        tmp = build_lora_mm(up, cur);
+    }
     cb(tmp, "ffn_up", il);
 
     if (up_b) {
@@ -1024,12 +1034,22 @@ ggml_tensor * llm_graph_context::build_ffn(
         switch (type_gate) {
             case LLM_FFN_SEQ:
                 {
-                    cur = build_lora_mm(gate, tmp);
+                    if (gate->type == GGML_TYPE_NVFP4) {
+                        cur = build_lora_mm(gate, tmp, gate_s);
+                        gate_s = nullptr;
+                    } else {
+                        cur = build_lora_mm(gate, tmp);
+                    }
                     cb(cur, "ffn_gate", il);
                 } break;
             case LLM_FFN_PAR:
                 {
-                    cur = build_lora_mm(gate, cur);
+                    if (gate->type == GGML_TYPE_NVFP4) {
+                        cur = build_lora_mm(gate, cur, gate_s);
+                        gate_s = nullptr;
+                    } else {
+                        cur = build_lora_mm(gate, cur);
+                    }
                     cb(cur, "ffn_gate", il);
                 } break;
         }
@@ -1133,7 +1153,12 @@ ggml_tensor * llm_graph_context::build_ffn(
     }
 
     if (down) {
-        cur = build_lora_mm(down, cur);
+        if (down->type == GGML_TYPE_NVFP4) {
+            cur = build_lora_mm(down, cur, down_s);
+            down_s = nullptr;
+        } else {
+            cur = build_lora_mm(down, cur);
+        }
         if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
             // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
@@ -1390,6 +1415,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     } else {
         // separate gate and up path
         up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        if (up_exps->type == GGML_TYPE_NVFP4) {
+            ggml_mul_mat_id_add_scale(up, up_exps_s); // Save the expert tensor scale; NOTE(review): unlike build_lora_mm, up_exps_s is not NULL-checked before being attached
+            up_exps_s = nullptr; // Clear so it doesn't get applied here, to avoid precision loss
+        }
         cb(up, "ffn_moe_up", il);
 
         if (up_exps_b) {
@@ -1408,6 +1437,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
         if (gate_exps) {
             cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+            if (gate_exps->type == GGML_TYPE_NVFP4) {
+                ggml_mul_mat_id_add_scale(cur, gate_exps_s);
+                gate_exps_s = nullptr;
+            }
             cb(cur, "ffn_moe_gate", il);
         } else {
             cur = up;
@@ -1498,6 +1531,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    if (down_exps->type == GGML_TYPE_NVFP4) {
+        ggml_mul_mat_id_add_scale(experts, down_exps_s);
+        down_exps_s = nullptr;
+    }
     cb(experts, "ffn_moe_down", il);
 
     if (down_exps_b) {