- Did you update?
pip install --upgrade unsloth unsloth_zoo - yes
Colab or Kaggle or local / cloud - local script
- Number of GPUs used, use
nvidia-smi
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA RTX PRO 6000 Blac... Off | 00000000:01:00.0 Off | Off |
| 37% 47C P0 46W / 300W | 0MiB / 97887MiB | 3% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
4. Which notebook? Please link!
5. Which Unsloth version, TRL version, transformers version, PyTorch version?
torch.__version__
'2.8.0+cu128'
trl.__version__
'0.23.0'
transformers.__version__
'4.57.1'
unsloth.__version__
'2025.12.1'
- Which trainer?
SFTTrainer, GRPOTrainer etc
from unsloth import FastLanguageModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config
import re
import torch.nn as nn
import bitsandbytes as bnb
# Load the 4-bit base checkpoint through Unsloth.  LoRA adapters are attached
# later in the script, so full fine-tuning stays disabled here.
model_id = "unsloth/gpt-oss-120b"
load_kwargs = dict(
    model_name=model_id,
    dtype=torch.bfloat16,    # None for auto detection
    # NOTE(review): 32786 looks like a transposition of 32768 (2**15) - confirm.
    max_seq_length=32786,    # Choose any for long context!
    load_in_4bit=True,       # 4 bit quantization to reduce memory
    full_finetuning=False,   # [NEW!] We have full finetuning now!
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)
# Dump the module tree so the names matched by the LoRA patterns can be checked.
print(model)
# Compiled regexes that select, by qualified module name, which layers get a
# LoRA adapter.  Every pattern ends in `$`, so only names ending in the listed
# projection (or expert index) match.
PATTERNS = [
    re.compile(pattern)
    for pattern in (
        r"\.self_attn\.(q_proj|k_proj|v_proj|o_proj)$",
        r"\.mlp\.experts\.gate_up_projs\.\d+$",  # every expert's up proj
        r"\.mlp\.experts\.down_projs\.\d+$",  # every expert's down proj
    )
]
def is_target(name, module):
    """Return True when ``module`` is a linear layer whose name matches PATTERNS.

    Args:
        name: Qualified module name, as yielded by ``model.named_modules()``.
        module: The module object registered under ``name``.

    Returns:
        True only for ``nn.Linear`` / ``bnb.nn.Linear4bit`` instances whose
        name is matched by at least one regex in ``PATTERNS``; False otherwise.
    """
    linear_types = (nn.Linear, bnb.nn.Linear4bit)
    if isinstance(module, linear_types):
        for pattern in PATTERNS:
            if pattern.search(name):
                return True
    return False
# Collect the qualified name of every module accepted by is_target().
# NOTE(review): the traceback in this report counts 9360 LoRA modules against
# 144 saved modules; the per-expert names selected here are presumably the
# source of the surplus - confirm against the merge code.
target_modules = [
    entry[0] for entry in model.named_modules() if is_target(*entry)
]
print(f"{len(target_modules)} modules will get LoRA")

# Wrap the base model with LoRA adapters on exactly those modules.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=target_modules,
    # Adapter shape / scaling.
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    lora_alpha = 64,
    use_rslora = True, # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # Settings documented by Unsloth as the optimized path.
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none", # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
)
from datasets import load_from_disk
# On-disk dataset; its rows are read through the "prompt" and "completion"
# columns by tok() below.
DATASET_DIR = "data/OpenMathReasoning-harmony-deduplicated"
train = load_from_disk(DATASET_DIR)
def tok(batch):
    """Tokenize prompt+completion pairs and mask the prompt out of the loss.

    Args:
        batch: Mapping with parallel ``"prompt"`` and ``"completion"`` lists of
            strings (one batched slice handed over by ``Dataset.map``).

    Returns:
        The tokenizer encoding of each concatenated ``prompt + completion``
        with an added ``"labels"`` entry: ``-100`` for the leading prompt
        tokens and the input ids themselves for the rest.  Every label list
        has the same length as its ``input_ids`` list.
    """
    prompts = batch["prompt"]
    completions = batch["completion"]
    enc = tokenizer(
        [p + c for p, c in zip(prompts, completions)],
        padding=False,
        truncation=False,
    )
    # Tokenize all prompts with a single batched call rather than one
    # tokenizer call per row.
    # NOTE(review): prompts are encoded without special tokens while the full
    # text uses the tokenizer default; this assumes the tokenizer does not
    # prepend a special token to the full text - confirm for this tokenizer.
    prompt_ids = tokenizer(list(prompts), add_special_tokens=False)["input_ids"]

    labels = []
    for p_ids, ids in zip(prompt_ids, enc["input_ids"]):
        # Clamp the cutoff: if the standalone prompt tokenizes to more tokens
        # than the concatenated text, an unclamped cutoff would make the
        # label list longer than input_ids.
        cutoff = min(len(p_ids), len(ids))  # everything up to here gets masked out
        labels.append([-100] * cutoff + ids[cutoff:])
    enc["labels"] = labels
    return enc
# Run tok() over the whole dataset in batches of 1000 rows across 24 worker
# processes, dropping every original column from the result.
train_tok = train.map(
    function=tok,
    remove_columns=train.column_names,
    batched=True,
    batch_size=1000,
    num_proc=24,
    writer_batch_size=10_000,
)
from transformers import TrainingArguments
from trl import SFTConfig, SFTTrainer
# Hyper-parameters for the single-epoch LoRA SFT run.
training_args = TrainingArguments(
    # --- where results go -------------------------------------------------
    output_dir="gpt-oss-120b-sft",
    report_to="none",
    logging_steps=20,
    # --- run length and batching (1 x 8 accumulation = 8 rows per step) -----
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    # --- optimizer and learning-rate schedule -------------------------------
    optim = "paged_adamw_8bit",
    learning_rate=5.0e-5,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"min_lr_rate": 0.1},
    # --- numeric precision --------------------------------------------------
    bf16=True,
    fp16=False,
    # --- checkpointing ------------------------------------------------------
    save_strategy="steps", # or "epoch"
    save_steps=200, # pick a cadence that fits your run length
    save_total_limit=2, # keep disk usage in check
)
from transformers import Trainer
from transformers.trainer_utils import get_last_checkpoint
from torch.nn.utils.rnn import pad_sequence
def collate(batch):
    """Pad a list of tokenized examples into one batch of long tensors.

    Args:
        batch: List of examples, each providing ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` as integer sequences.

    Returns:
        Dict with the same three keys; each value is a batch-first tensor
        padded to the longest example.  ``input_ids`` are padded with
        ``tokenizer.pad_token_id``, ``attention_mask`` with 0 and ``labels``
        with -100.
    """

    def stack(field, pad_value):
        # Convert one field of every example to a LongTensor, then right-pad.
        rows = [torch.tensor(example[field], dtype=torch.long) for example in batch]
        return pad_sequence(rows, batch_first=True, padding_value=pad_value)

    return {
        "input_ids": stack("input_ids", tokenizer.pad_token_id),
        "attention_mask": stack("attention_mask", 0),
        "labels": stack("labels", -100),
    }
# Plain `transformers.Trainer` fed with the pre-tokenized dataset and the
# padding collator defined in this script.
trainer = Trainer(
    model=model,
    # `tokenizer=` is deprecated on Trainer in the transformers release this
    # report was filed against (4.57.1); `processing_class=` is its replacement.
    processing_class=tokenizer,
    args=training_args,
    train_dataset=train_tok,
    data_collator=collate,
)
# Resume from the newest checkpoint found in output_dir; when none is found
# the argument is None, which is the same as calling train() with no arguments.
last_ckpt = get_last_checkpoint(training_args.output_dir)
trainer.train(resume_from_checkpoint=last_ckpt if last_ckpt else None)

# Export the merged model in MXFP4 form into the same output directory.
# NOTE(review): the traceback in this report is raised from this call.
model.save_pretrained_merged(training_args.output_dir, tokenizer, save_method="mxfp4")
This is the error I'm getting:
==((====))== Unsloth - 2x faster free finetuning | Num GPUs used = 1
\\ /| Num examples = 47,197 | Num Epochs = 1 | Total steps = 5,900
O^O/ \_/ \ Batch size per device = 1 | Gradient accumulation steps = 8
\ / Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
"-____-" Trainable parameters = 2,147,254,272 of 118,976,410,944 (1.80% trained)
{'train_runtime': 0.2181, 'train_samples_per_second': 216357.618, 'train_steps_per_second': 27046.421, 'train_loss': 0.0, 'epoch': 1.0}
0%| | 0/5900 [00:00<?, ?it/s]
Found HuggingFace hub cache directory: /home/aleks/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00000-of-00014.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Unsloth: Preparing safetensor model files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 81180.08it/s]
Note: tokenizer.model not found (this is OK for non-SentencePiece models)
Unsloth: Merging weights into mxfp4: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:13<00:00, 1.09it/s]
Traceback (most recent call last):
File "/home/aleks/projects/aimo3/train_sft.py", line 131, in <module>
model.save_pretrained_merged(training_args.output_dir, tokenizer, save_method="mxfp4")
~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/aleks/miniconda3/lib/python3.13/site-packages/unsloth/save.py", line 2688, in unsloth_generic_save_pretrained_merged
unsloth_generic_save(**arguments)
~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^
File "/home/aleks/miniconda3/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
File "/home/aleks/miniconda3/lib/python3.13/site-packages/unsloth/save.py", line 2636, in unsloth_generic_save
merge_and_overwrite_lora(
~~~~~~~~~~~~~~~~~~~~~~~~^
get_model_name,
^^^^^^^^^^^^^^^
...<9 lines>...
use_temp_file = False,
^^^^^^^^^^^^^^^^^^^^^^
)
^
File "/home/aleks/miniconda3/lib/python3.13/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
File "/home/aleks/miniconda3/lib/python3.13/site-packages/unsloth_zoo/saving_utils.py", line 1368, in merge_and_overwrite_lora
raise RuntimeError(
...<2 lines>...
)
RuntimeError: Unsloth: Saving LoRA finetune failed since # of LoRAs = 9360 does not match # of saved modules = 144. Please file a bug report!
pip install --upgrade unsloth unsloth_zoo - yes
Colab or Kaggle or local / cloud - local script
nvidia-smi
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA RTX PRO 6000 Blac... Off | 00000000:01:00.0 Off | Off |
| 37% 47C P0 46W / 300W | 0MiB / 97887MiB | 3% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
4. Which notebook? Please link!
5. Which Unsloth version, TRL version, transformers version, PyTorch version?
SFTTrainer, GRPOTrainer etc
This is the error I'm getting: