Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -1422,6 +1422,20 @@ extern "C" {
struct ggml_tensor * b,
struct ggml_tensor * ids);

// Attach a scale tensor to `a`, the result tensor of a mul_mat.
// The pointer is stored as an extra source of `a` (src[2]); no data is copied.
// The CPU mul_mat kernel reads it back when src0 is GGML_TYPE_NVFP4 and multiplies
// each dot-product result by an F32 value taken from scale->data: element 0, or
// element i02 when scale->ne[0] > 1.
// NOTE(review): that kernel dereferences the stored pointer without a NULL check,
// so an NVFP4 mul_mat evaluated on the CPU appears to require a scale - confirm.
GGML_API void ggml_mul_mat_add_scale(
struct ggml_tensor * a,
struct ggml_tensor * scale);

// Return the scale tensor stored on the mul_mat result `a` by ggml_mul_mat_add_scale()
// (i.e. a->src[2]). The value is returned as-is, without any validation.
// NOTE(review): presumably NULL when no scale was attached - confirm against tensor init.
GGML_API const struct ggml_tensor * ggml_mul_mat_get_scale(
const struct ggml_tensor * a);

// Attach a scale tensor to `a`, the result tensor of a mul_mat_id.
// The pointer is stored as an extra source of `a` (src[3]); no data is copied.
// The CPU mul_mat_id kernel reads it back when src0 is GGML_TYPE_NVFP4 and multiplies
// each dot-product result by an F32 value taken from scale->data: element 0, or
// element cur_a (the kernel's current expert index) when scale->ne[0] > 1.
// NOTE(review): that kernel dereferences the stored pointer without a NULL check,
// so an NVFP4 mul_mat_id evaluated on the CPU appears to require a scale - confirm.
GGML_API void ggml_mul_mat_id_add_scale(
struct ggml_tensor * a,
struct ggml_tensor * scale);

// Return the scale tensor stored on the mul_mat_id result `a` by
// ggml_mul_mat_id_add_scale() (i.e. a->src[3]). The value is returned as-is,
// without any validation.
// NOTE(review): presumably NULL when no scale was attached - confirm against tensor init.
GGML_API const struct ggml_tensor * ggml_mul_mat_id_get_scale(
const struct ggml_tensor * a);

// A: m columns, n rows,
// B: p columns, n rows,
// result is m columns, p rows
Expand Down
18 changes: 18 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1222,6 +1222,16 @@ static void ggml_compute_forward_mul_mat_one_chunk(

for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);

// NVFP4's per tensor scale needs to be applied with the vecdot
// this should be put into a future traits based approach to be cleaner
if (src0->type == GGML_TYPE_NVFP4) {
float nvfp4_t_s = ((const float *) ggml_mul_mat_get_scale(dst)->data)[0];
if (ggml_mul_mat_get_scale(dst)->ne[0] > 1) {
nvfp4_t_s = ((const float *) ggml_mul_mat_get_scale(dst)->data)[i02];
}
tmp[ir0 - iir0] = tmp[ir0 - iir0] * nvfp4_t_s;
}
}

for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
Expand Down Expand Up @@ -1490,6 +1500,14 @@ static void ggml_compute_forward_mul_mat_id_one_chunk(

for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);

if (src0->type == GGML_TYPE_NVFP4) {
float nvfp4_t_s = ((const float *) ggml_mul_mat_id_get_scale(dst)->data)[0];
if (ggml_mul_mat_id_get_scale(dst)->ne[0] > 1) {
nvfp4_t_s = ((const float *) ggml_mul_mat_id_get_scale(dst)->data)[cur_a];
}
tmp[ir0 - iir0] = tmp[ir0 - iir0] * nvfp4_t_s;
}
}

memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
Expand Down
22 changes: 22 additions & 0 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -3280,6 +3280,28 @@ struct ggml_tensor * ggml_mul_mat_id(
return result;
}

// Record `scale` on the mul_mat result `a` by storing the pointer in an extra
// source slot of `a`. Only the pointer is stored; `a` is not otherwise modified
// and its op is not checked here.
GGML_API void ggml_mul_mat_add_scale(
        struct ggml_tensor * a,
        struct ggml_tensor * scale) {
    // source slot used for the scale of a mul_mat result;
    // must match the slot read by ggml_mul_mat_get_scale()
    enum { MUL_MAT_SCALE_SRC = 2 };

    struct ggml_tensor ** slot = &a->src[MUL_MAT_SCALE_SRC];
    *slot = scale;
}

// Fetch the scale tensor recorded on the mul_mat result `a`.
// Returns whatever pointer currently sits in the scale source slot, unvalidated;
// callers that dereference it are responsible for checking it first.
GGML_API const struct ggml_tensor * ggml_mul_mat_get_scale(
        const struct ggml_tensor * a) {
    // source slot written by ggml_mul_mat_add_scale()
    enum { MUL_MAT_SCALE_SRC = 2 };

    const struct ggml_tensor * scale = a->src[MUL_MAT_SCALE_SRC];
    return scale;
}

// Record `scale` on the mul_mat_id result `a` by storing the pointer in an extra
// source slot of `a`. Only the pointer is stored; `a` is not otherwise modified
// and its op is not checked here.
GGML_API void ggml_mul_mat_id_add_scale(
        struct ggml_tensor * a,
        struct ggml_tensor * scale) {
    // source slot used for the scale of a mul_mat_id result;
    // must match the slot read by ggml_mul_mat_id_get_scale()
    enum { MUL_MAT_ID_SCALE_SRC = 3 };

    struct ggml_tensor ** slot = &a->src[MUL_MAT_ID_SCALE_SRC];
    *slot = scale;
}

// Fetch the scale tensor recorded on the mul_mat_id result `a`.
// Returns whatever pointer currently sits in the scale source slot, unvalidated;
// callers that dereference it are responsible for checking it first.
GGML_API const struct ggml_tensor * ggml_mul_mat_id_get_scale(
        const struct ggml_tensor * a) {
    // source slot written by ggml_mul_mat_id_add_scale()
    enum { MUL_MAT_ID_SCALE_SRC = 3 };

    const struct ggml_tensor * scale = a->src[MUL_MAT_ID_SCALE_SRC];
    return scale;
}

// ggml_out_prod

static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
Expand Down
45 changes: 41 additions & 4 deletions src/llama-graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
ggml_tensor * cur,
ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
if (w_s && w->type == GGML_TYPE_NVFP4) {
ggml_mul_mat_add_scale(res, w_s); // Save the NVFP4 per tensor scale so it can be applied earlier to vecdot
w_s = nullptr; // Clear the scale so it doesnt get applied here, to preserve precision
}

for (const auto & lora : *loras) {
llama_adapter_lora_weight * lw = lora.first->get_weight(w);
Expand Down Expand Up @@ -1007,7 +1011,13 @@ ggml_tensor * llm_graph_context::build_ffn(
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
int il) const {
ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
ggml_tensor * tmp = cur;
if (up && up->type == GGML_TYPE_NVFP4) {
tmp = build_lora_mm(up, cur, up_s);
up_s = nullptr;
} else if (up) {
tmp = build_lora_mm(up, cur);
}
cb(tmp, "ffn_up", il);

if (up_b) {
Expand All @@ -1024,12 +1034,22 @@ ggml_tensor * llm_graph_context::build_ffn(
switch (type_gate) {
case LLM_FFN_SEQ:
{
cur = build_lora_mm(gate, tmp);
if (gate->type == GGML_TYPE_NVFP4) {
cur = build_lora_mm(gate, tmp, gate_s);
gate_s = nullptr;
} else {
cur = build_lora_mm(gate, tmp);
}
cb(cur, "ffn_gate", il);
} break;
case LLM_FFN_PAR:
{
cur = build_lora_mm(gate, cur);
if (gate->type == GGML_TYPE_NVFP4) {
cur = build_lora_mm(gate, cur, gate_s);
gate_s = nullptr;
} else {
cur = build_lora_mm(gate, cur);
}
cb(cur, "ffn_gate", il);
} break;
}
Expand Down Expand Up @@ -1133,7 +1153,12 @@ ggml_tensor * llm_graph_context::build_ffn(
}

if (down) {
cur = build_lora_mm(down, cur);
if (down->type == GGML_TYPE_NVFP4) {
cur = build_lora_mm(down, cur, down_s);
down_s = nullptr;
} else {
cur = build_lora_mm(down, cur);
}
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
// GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
Expand Down Expand Up @@ -1390,6 +1415,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
} else {
// separate gate and up path
up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
if (up_exps->type == GGML_TYPE_NVFP4) {
ggml_mul_mat_id_add_scale(up, up_exps_s); // Save the expert tensor scale
up_exps_s = nullptr; // Clear so it doesn't get applied here, to avoid precision loss
}
cb(up, "ffn_moe_up", il);

if (up_exps_b) {
Expand All @@ -1408,6 +1437,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
if (gate_exps->type == GGML_TYPE_NVFP4) {
ggml_mul_mat_id_add_scale(cur, gate_exps_s);
gate_exps_s = nullptr;
}
cb(cur, "ffn_moe_gate", il);
} else {
cur = up;
Expand Down Expand Up @@ -1498,6 +1531,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
}

experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
if (down_exps->type == GGML_TYPE_NVFP4) {
ggml_mul_mat_id_add_scale(experts, down_exps_s);
down_exps_s = nullptr;
}
cb(experts, "ffn_moe_down", il);

if (down_exps_b) {
Expand Down