@@ -135,6 +135,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_100B_A6B:  return "100B.A6B";
         case LLM_TYPE_102B_A12B: return "102B.A12B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
+        case LLM_TYPE_120B_A12B: return "120B.A12B";
         case LLM_TYPE_122B_A10B: return "122B.A10B";
         case LLM_TYPE_196B_A11B: return "196B.A11B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -1861,10 +1862,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,  hparams.n_expert_shared,      false);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,  hparams.expert_weights_norm,  false);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+                ml.get_key(LLM_KV_MOE_LATENT_SIZE,      hparams.moe_latent_size,      false);

                 switch (hparams.n_layer) {
                     case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
                     case 56: type = LLM_TYPE_9B; break;
+                    case 88: type = LLM_TYPE_120B_A12B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
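A note on how the new key behaves (my reading of the loader, not stated in the diff): `get_key` with `required = false` leaves its destination untouched when the key is missing, so GGUF files converted before this change keep `hparams.moe_latent_size` at its default of 0 and the tensor loader below falls back to the full `n_embd`. A minimal sketch of that contract, assuming the field is declared zero-initialized in `llama_hparams`:

    // assumed declaration in llama_hparams (not shown in this diff):
    uint32_t moe_latent_size = 0;

    // required = false: a no-op when the key is absent, so the 0 default survives
    ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false);

    // downstream fallback (see the load_tensors hunk below):
    // moe_n_embd = moe_latent_size > 0 ? moe_latent_size : n_embd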
@@ -5544,6 +5547,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 const int64_t n_ssm_head = hparams.ssm_dt_rank;
                 const int64_t n_group    = hparams.ssm_n_group;
                 const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+                const int64_t moe_n_embd = hparams.moe_latent_size > 0 ? hparams.moe_latent_size : n_embd;

                 // embeddings
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
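The `moe_n_embd` computed here is the width the routed experts operate at. A sketch of the dataflow this implies for the graph build, assuming `ggml_mul_mat` and llama.cpp's `[n_embd, n_tokens]` activation convention; `build_latent_moe` and `run_routed_experts` are hypothetical names for illustration, not the PR's actual graph code:

    #include "ggml.h"

    // hypothetical stand-in for the routed-expert FFN (routing + expert matmuls);
    // not part of this diff
    ggml_tensor * run_routed_experts(ggml_context * ctx0, const llama_layer & layer, ggml_tensor * cur);

    // sketch only: how the optional latent projections would wrap the expert FFN
    static ggml_tensor * build_latent_moe(ggml_context * ctx0, const llama_layer & layer, ggml_tensor * cur) {
        // cur: [n_embd, n_tokens]
        if (layer.ffn_latent_down) {
            // {n_embd, moe_n_embd} weight: [n_embd, n_tokens] -> [moe_n_embd, n_tokens]
            cur = ggml_mul_mat(ctx0, layer.ffn_latent_down, cur);
        }

        // experts run at moe_n_embd width (plain n_embd when no latent size is set)
        cur = run_routed_experts(ctx0, layer, cur);

        if (layer.ffn_latent_up) {
            // {moe_n_embd, n_embd} weight: [moe_n_embd, n_tokens] -> [n_embd, n_tokens]
            cur = ggml_mul_mat(ctx0, layer.ffn_latent_up, cur);
        }
        return cur;
    }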
@@ -5603,8 +5607,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);

                     // MoE branch
-                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
-                    layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+                    layer.ffn_latent_down = create_tensor(tn(LLM_TENSOR_FFN_LATENT_DOWN, "weight", i), {n_embd, moe_n_embd}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_latent_up   = create_tensor(tn(LLM_TENSOR_FFN_LATENT_UP,   "weight", i), {moe_n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+
+                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, moe_n_embd, n_expert}, 0);
+                    layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {moe_n_embd, n_ff_exp, n_expert}, 0);

                     // Shared expert branch
                     layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
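Reading the shapes: `create_tensor` lists the input width first, so `ffn_latent_down` `{n_embd, moe_n_embd}` projects down into the latent space and `ffn_latent_up` `{moe_n_embd, n_embd}` projects back, while both expert tensors now carry `moe_n_embd` where they previously carried `n_embd`. `TENSOR_NOT_REQUIRED` lets checkpoints without a latent size omit the two projections entirely. A self-contained example with hypothetical sizes (illustrative only, not taken from any released model) shows the per-layer weight saving:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_embd     = 8192; // hypothetical model width
        const int64_t moe_latent = 2048; // hypothetical LLM_KV_MOE_LATENT_SIZE
        const int64_t n_ff_exp   = 4096; // hypothetical expert FFN width
        const int64_t n_expert   = 128;  // hypothetical expert count

        // old shapes: up {n_embd, n_ff_exp, n_expert} + down {n_ff_exp, n_embd, n_expert}
        const int64_t dense  = 2 * n_embd * n_ff_exp * n_expert;
        // new shapes: experts at moe_latent width, plus the two shared projections
        const int64_t latent = 2 * moe_latent * n_ff_exp * n_expert + 2 * n_embd * moe_latent;

        printf("per-layer routed-expert weights: %lld -> %lld\n", (long long) dense, (long long) latent);
    }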