Note: Please do not remove the questions. Answer beside them.
# Testing script
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
from unsloth import FastVisionModel
from peft import LoraConfig
import torch
from datasets import Sequence, Image as HFImage
import os
# Author's caveat: if not handled properly, prompt truncation may truncate
# image tokens.
# NOTE(review): the ValueError reported for this run ("tokens: 596,
# features: 298") shows exactly twice as many image tokens as image features.
# With two images per sample, this presumably means only one image's features
# reach the model forward pass -- confirm against the trainer's multi-image
# handling.
def _has_two_images(example):
    """Return True when the sample's "images" field holds exactly two items."""
    return len(example["images"]) == 2


# Load the train split of the prompt-only conversational config and keep only
# the two-image samples.
dataset = load_dataset(
    "trl-internal-testing/zen-multi-image",
    "conversational_prompt_only",
    split="train",
).filter(_has_two_images)
def my_reward_function(prompts, completions, **kwargs) -> list[float]:
    """Dummy reward function: give every sample the same constant reward.

    Args:
        prompts: Batch of prompts; only its length is used.
        completions: Batch of completions; accepted but not inspected.
        **kwargs: Any additional keyword arguments; accepted and ignored.

    Returns:
        A list containing one ``1.0`` per entry in ``prompts``.
    """
    # The body must be indented under the ``def``; without the indent the
    # script fails to parse with an IndentationError.
    return [1.0] * len(prompts)
# Load the locally fine-tuned checkpoint together with its processor.
# Weights are requested in bfloat16 with 4-bit loading and fast inference
# both switched off, and FlashAttention-2 as the attention implementation.
model, processor = FastVisionModel.from_pretrained(
    "./outputs/qwen3.5-2b-invoice-finetuned",
    max_seq_length=55_000,
    dtype=torch.bfloat16,
    load_in_4bit=False,
    trust_remote_code=True,
    device_map="auto",
    attn_implementation="flash_attention_2",
    fast_inference=False,
)

# Wrap the model for PEFT training; no options are overridden, so every
# adapter setting comes from get_peft_model's own defaults.
model = FastVisionModel.get_peft_model(model)
# GRPO training configuration for this run.
training_args = GRPOConfig(
    output_dir="tmp_dir",
    # --- generation ---------------------------------------------------
    # max_new_tokens=self.max_new_tokens,
    temperature=0.9,  # higher than inference: diverse rollouts are needed
    top_p=0.95,
    top_k=20,
    num_generations=4,  # G=4 rollouts per prompt
    # --- optimisation -------------------------------------------------
    num_train_epochs=1,
    learning_rate=1e-5,
    weight_decay=0.01,
    max_grad_norm=0.5,
    warmup_steps=20,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    # --- batch size / gradient accumulation ---------------------------
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # --- GRPO-specific ------------------------------------------------
    beta=0.04,  # KL penalty coefficient
    epsilon=0.2,  # PPO clip ratio
    # --- precision / memory -------------------------------------------
    bf16=True,
    fp16=False,
    use_cache=False,
    # --- dataloader ---------------------------------------------------
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
    # --- checkpointing / logging --------------------------------------
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=5,
    report_to="none",
    seed=4123,
    remove_unused_columns=False,
    use_vllm=False,
    log_completions=True,
    # Prompt-length cap is left disabled in this run:
    # max_prompt_length=45_000,
    max_completion_length=20_000,
    importance_sampling_level="sequence",
    # mask_truncated_completions=False,
    loss_type="vespo",
)
# Assemble the GRPO trainer from the PEFT-wrapped model, its processor, the
# constant dummy reward function and the filtered dataset.
trainer = GRPOTrainer(
    model=model,
    processing_class=processor,
    reward_funcs=my_reward_function,  # define a dummy reward function
    args=training_args,
    train_dataset=dataset,
)

# Snapshot a copy of every named parameter before training. No requires_grad
# filter is applied, so this clones all parameters, not only trainable ones.
# NOTE(review): the snapshot is not read again in the visible script.
previous_trainable_params = {
    name: tensor.clone() for name, tensor in trainer.model.named_parameters()
}

trainer.train()
Traceback (most recent call last):
File "/workspace/ocr/qwen_grpo2.py", line 350, in <module>
main()
File "/workspace/ocr/qwen_grpo2.py", line 346, in main
trainer.train()
File "/workspace/ocr/qwen_grpo2.py", line 290, in train
result =self.trainer.train()
^^^^^^^^^^^^^^^^^^^^
File "/workspace/ocr/unsloth_compiled_cache/UnslothGRPOTrainer.py", line 84, in wrapper
output = f(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/unsloth/models/rl.py", line 144, in _unsloth_train_with_resume_guard
return original_train(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1425, in train
return inner_training_loop(
^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1507, in _inner_training_loop
self._run_epoch(
File "/workspace/stable-venv/lib/python3.12/site-packages/transformers/trainer.py", line 1735, in _run_epoch
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/ocr/unsloth_compiled_cache/UnslothGRPOTrainer.py", line 3086, in training_step
output = super().training_step(model, inputs, num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "<string>", line 34, in _unsloth_training_step
File "/workspace/stable-venv/lib/python3.12/site-packages/trl/extras/profiling.py", line 202, in wrapper
return func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/ocr/unsloth_compiled_cache/UnslothGRPOTrainer.py", line 3115, in _prepare_inputs
generation_batch = self._generate_and_score_completions(generation_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/unsloth/models/rl.py", line 526, in wrapped
return original(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/ocr/unsloth_compiled_cache/UnslothGRPOTrainer.py", line 4111, in _generate_and_score_completions
ref_per_token_logps, _ = self._get_per_token_logps_and_entropies(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/ocr/unsloth_compiled_cache/UnslothGRPOTrainer.py", line 3021, in _get_per_token_logps_and_entropies
logits_chunk = unwrapped_model(
^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/peft/peft_model.py", line 1993, in forward
return self.base_model(
^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/peft/tuners/tuners_utils.py", line 330, in forward
return self.model.forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 876, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1937, in forward
outputs = self.model(
^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/transformers/utils/generic.py", line 876, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1666, in forward
image_mask, _ = self.get_placeholder_mask(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/stable-venv/lib/python3.12/site-packages/transformers/models/qwen3_5/modeling_qwen3_5.py", line 1569, in get_placeholder_mask
torch_compilable_check(
File "/workspace/stable-venv/lib/python3.12/site-packages/transformers/utils/import_utils.py", line 1540, in torch_compilable_check
torch._check_with(error_type, cond, msg_callable)
File "/workspace/stable-venv/lib/python3.12/site-packages/torch/__init__.py", line 1719, in _check_with
raise error_type(message_evaluated)
ValueError: Image features and image tokens do not match, tokens: 596, features: 298
Note: Please do not remove the questions. Answer beside them.
- `pip install --upgrade unsloth unsloth_zoo`: yes
- `Colab` or `Kaggle` or local / cloud: cloud (NVIDIA A40 GPU)
- `nvidia-smi`: 1
- `SFTTrainer`, `GRPOTrainer` etc: GRPOTrainer

Explanation:
error:
🦥 You can also ask via our Reddit page: https://reddit.com/r/unsloth/