Skip to content

[Bug] RuntimeError: Unsloth: Saving LoRA finetune failed since # of LoRAs = 9360 does not match # of saved modules = 144. Please file a bug report! #3701

@aleksanderhan

Description

@aleksanderhan
  1. Did you update? pip install --upgrade unsloth unsloth_zoo - yes
  2. Colab or Kaggle or local / cloud - local script
  3. Number GPUs used, use nvidia-smi
    +-----------------------------------------------------------------------------------------+
    | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
    +-----------------------------------------+------------------------+----------------------+
    | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
    | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
    | | | MIG M. |
    |=========================================+========================+======================|
    | 0 NVIDIA RTX PRO 6000 Blac... Off | 00000000:01:00.0 Off | Off |
    | 37% 47C P0 46W / 300W | 0MiB / 97887MiB | 3% Default |
    | | | N/A |
    +-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
4. Which notebook? Please link!
5. Which Unsloth version, TRL version, transformers version, PyTorch version?

torch.__version__
'2.8.0+cu128'
trl.__version__
'0.23.0'
transformers.__version__
'4.57.1'
unsloth.__version__
'2025.12.1'

  6. Which trainer? SFTTrainer, GRPOTrainer etc
from unsloth import FastLanguageModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config
import re
import torch.nn as nn
import bitsandbytes as bnb

# Hub id of the checkpoint to fine-tune.
model_id = "unsloth/gpt-oss-120b"

# Load the base model and its tokenizer through Unsloth with bfloat16 dtype,
# 4-bit loading enabled and full fine-tuning disabled (adapters are attached
# later in the script).
# NOTE(review): 32786 looks like a transposition of 32768 (2**15) — confirm the
# intended maximum sequence length.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    dtype=torch.bfloat16,  # per the original note, None would auto-detect
    max_seq_length=32786,
    load_in_4bit=True,  # 4-bit quantization to reduce memory
    full_finetuning=False,
)
print(model)  # dump the module tree so target-module names can be inspected


# Compiled regexes, each anchored at the end of a dotted module name, that
# select the layers which should receive LoRA adapters.
PATTERNS = [
    re.compile(expr)
    for expr in (
        r"\.self_attn\.(q_proj|k_proj|v_proj|o_proj)$",  # attention projections
        r"\.mlp\.experts\.gate_up_projs\.\d+$",  # each expert's gate/up projection
        r"\.mlp\.experts\.down_projs\.\d+$",  # each expert's down projection
    )
]

def is_target(name, module):
    """Return True when `module` should receive a LoRA adapter.

    A module qualifies only if it is an `nn.Linear` or a bitsandbytes
    `Linear4bit` layer AND its dotted `name` matches at least one regex in
    the module-level `PATTERNS` list.

    Args:
        name: Dotted module name as yielded by `named_modules()`.
        module: The module object registered under that name.

    Returns:
        bool: True if the layer type and the name both match, else False.
    """
    linear_types = (nn.Linear, bnb.nn.Linear4bit)
    if isinstance(module, linear_types):
        for pattern in PATTERNS:
            if pattern.search(name):
                return True
    return False

# Collect the dotted names of every module that `is_target` accepts.
target_modules = [
    module_name
    for module_name, module in model.named_modules()
    if is_target(module_name, module)
]
print(f"{len(target_modules)} modules will get LoRA")

# Wrap the model with LoRA adapters on exactly the modules selected above.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=target_modules,
    # Adapter shape / scaling
    r=32,  # LoRA rank; any value > 0
    lora_alpha=64,
    use_rslora=True,  # rank-stabilized LoRA
    loftq_config=None,
    # Per the original notes, dropout 0 and bias "none" are the optimized settings.
    lora_dropout=0,
    bias="none",
    # "unsloth" selects Unsloth's own gradient-checkpointing mode.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)


from datasets import load_from_disk
# Load the SFT dataset from a local directory; `tok` below reads its
# "prompt" and "completion" columns.
train = load_from_disk("data/OpenMathReasoning-harmony-deduplicated")

def tok(batch):
    """Tokenize a batch of prompt/completion pairs and build masked labels.

    Each example is tokenized as the concatenation `prompt + completion`
    without padding or truncation. The labels copy the resulting token ids,
    except that the first `len(prompt tokens)` positions are set to -100.

    NOTE(review): the concatenated text is tokenized with the tokenizer's
    default special-token handling, while the prompt alone is tokenized with
    `add_special_tokens=False`. If the tokenizer prepends special tokens, the
    masked prefix would be shorter than the prompt — confirm for this
    tokenizer.

    Args:
        batch: Mapping with parallel "prompt" and "completion" lists of str.

    Returns:
        The tokenizer output for the concatenated texts, with an added
        "labels" entry (one list of ints per example).
    """
    prompts = batch["prompt"]
    completions = batch["completion"]

    full_texts = [prompt + completion for prompt, completion in zip(prompts, completions)]
    enc = tokenizer(full_texts, padding=False, truncation=False)

    def masked_labels(prompt, ids):
        # The prompt's token count decides how many leading positions are masked.
        n_prompt = len(tokenizer(prompt, add_special_tokens=False)["input_ids"])
        return [-100] * n_prompt + ids[n_prompt:]

    enc["labels"] = [
        masked_labels(prompt, ids) for prompt, ids in zip(prompts, enc["input_ids"])
    ]
    return enc

# Tokenize the whole dataset with `tok`, in batches of 1000 examples across
# 24 worker processes. The original columns are removed, so the result holds
# only what `tok` returns.
train_tok = train.map(
    tok,
    batched=True,
    batch_size=1000,
    writer_batch_size=10_000,
    num_proc=24,
    remove_columns=train.column_names,
)

from transformers import TrainingArguments
from trl import SFTConfig, SFTTrainer


# Hyper-parameters for the Hugging Face Trainer run.
training_args = TrainingArguments(
    output_dir="gpt-oss-120b-sft",
    report_to="none",
    # --- schedule / optimisation ---
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 1 x 8 = 8 sequences per optimizer step per device
    learning_rate=5.0e-5,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"min_lr_rate": 0.1},
    optim="paged_adamw_8bit",
    # --- memory / precision ---
    gradient_checkpointing=True,
    bf16=True,
    fp16=False,
    # --- checkpointing / logging ---
    save_strategy="steps",  # "epoch" is the alternative
    save_steps=200,  # pick a cadence that fits the run length
    save_total_limit=2,  # keep disk usage in check
    logging_steps=20,
)


from transformers import Trainer
from transformers.trainer_utils import get_last_checkpoint
from torch.nn.utils.rnn import pad_sequence

def collate(batch):
    """Pad a list of tokenized examples into batch tensors.

    Each example must provide "input_ids", "attention_mask" and "labels" as
    sequences of ints. Every field is converted to a `torch.long` tensor and
    right-padded to the longest sequence in the batch:

    * "input_ids" are padded with the tokenizer's pad token id,
    * "attention_mask" is padded with 0,
    * "labels" are padded with -100.

    If the tokenizer defines no pad token (`pad_token_id is None`), the EOS id
    is used instead, and 0 as a last resort; passing None to `pad_sequence`
    would raise. Padded positions always carry attention_mask 0 and label -100.

    Args:
        batch: List of dicts as produced by the tokenization step.

    Returns:
        dict with "input_ids", "attention_mask" and "labels" tensors, each of
        shape (len(batch), longest_sequence_in_batch).
    """
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    if pad_id is None:
        pad_id = 0

    def pad_field(key, fill):
        # Tensorize one field of every example, then right-pad to equal length.
        rows = [torch.tensor(example[key], dtype=torch.long) for example in batch]
        return pad_sequence(rows, batch_first=True, padding_value=fill)

    return {
        "input_ids": pad_field("input_ids", pad_id),
        "attention_mask": pad_field("attention_mask", 0),
        "labels": pad_field("labels", -100),
    }

import os  # needed only for the output-directory existence check below

# Build the trainer from the LoRA-wrapped model, the tokenized dataset and the
# custom padding collator.
# NOTE(review): recent transformers releases prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version before switching.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_tok,
    data_collator=collate,
)

# get_last_checkpoint() lists the folder it is given, so it fails when the
# output directory has not been created yet (e.g. on the very first run).
# Treat "no directory" the same as "no checkpoint".
if os.path.isdir(training_args.output_dir):
    last_ckpt = get_last_checkpoint(training_args.output_dir)
else:
    last_ckpt = None

# resume_from_checkpoint=None (the default) starts a fresh run; a path resumes
# from that checkpoint.
trainer.train(resume_from_checkpoint=last_ckpt or None)

# Merge the adapters into the base weights and export in mxfp4 format.
model.save_pretrained_merged(training_args.output_dir, tokenizer, save_method="mxfp4")

This is the error I'm getting:

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 47,197 | Num Epochs = 1 | Total steps = 5,900
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 2,147,254,272 of 118,976,410,944 (1.80% trained)
{'train_runtime': 0.2181, 'train_samples_per_second': 216357.618, 'train_steps_per_second': 27046.421, 'train_loss': 0.0, 'epoch': 1.0} 
  0%|                                                                                                          | 0/5900 [00:00<?, ?it/s]
Found HuggingFace hub cache directory: /home/aleks/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00000-of-00014.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Unsloth: Preparing safetensor model files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 81180.08it/s]
Note: tokenizer.model not found (this is OK for non-SentencePiece models)
Unsloth: Merging weights into mxfp4: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:13<00:00,  1.09it/s]
Traceback (most recent call last):
  File "/home/aleks/projects/aimo3/train_sft.py", line 131, in <module>
    model.save_pretrained_merged(training_args.output_dir, tokenizer, save_method="mxfp4")
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/aleks/miniconda3/lib/python3.13/site-packages/unsloth/save.py", line 2688, in unsloth_generic_save_pretrained_merged
    unsloth_generic_save(**arguments)
    ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^
  File "/home/aleks/miniconda3/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
  File "/home/aleks/miniconda3/lib/python3.13/site-packages/unsloth/save.py", line 2636, in unsloth_generic_save
    merge_and_overwrite_lora(
    ~~~~~~~~~~~~~~~~~~~~~~~~^
        get_model_name,
        ^^^^^^^^^^^^^^^
    ...<9 lines>...
        use_temp_file = False,
        ^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/home/aleks/miniconda3/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
  File "/home/aleks/miniconda3/lib/python3.13/site-packages/unsloth_zoo/saving_utils.py", line 1368, in merge_and_overwrite_lora
    raise RuntimeError(
    ...<2 lines>...
    )
RuntimeError: Unsloth: Saving LoRA finetune failed since # of LoRAs = 9360 does not match # of saved modules = 144. Please file a bug report!

Metadata

Metadata

Assignees

Labels

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions