Hello, I just started learning about GRPO. After reading Issue 3016 and Issue 240 in the Issues section, my understanding is that to use GRPO for fine-tuning on video inputs, I must modify `unsloth_zoo/vision_utils.py` to implement the core video processing logic. Based on Issue 240, I believe `unsloth_zoo/vision_utils.py` has already implemented this functionality. Therefore, I wrote a demo file to test the fine-tuning process. The code runs normally, but it seems that `vision_utils.py` was never accessed. Does `GRPOTrainer` call this Python file at all? I would be extremely grateful for your guidance! Below is my demo.py (only the main part is included; the data and prompts are omitted).
# Unsloth asks to be imported before trl/transformers/peft so that its
# optimizations and trainer patches are applied; importing trl first
# triggers a warning and may leave the unpatched classes bound.
from unsloth import FastVisionModel, is_bf16_supported

import torch
from trl import GRPOConfig, GRPOTrainer

# Load the base vision-language model and its processor from a local path.
model, tokenizer = FastVisionModel.from_pretrained(
    "/root/autodl-tmp/LLaMA-Factory/Qwen/Qwen2.5-VL-7B-Instruct",
    # 4-bit quantized loading. The original note suggested switching to 8-bit
    # or float16 when VRAM is short, but 4-bit is already the most
    # memory-frugal of the three; use the others only when VRAM allows.
    load_in_4bit=True,
)

# Wrap the model with LoRA adapters on both the vision and language towers,
# covering attention and MLP modules.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
def get_prompt_rft(example):
    """Build the GRPO training record(s) for one dataset item.

    Returns a one-element list holding a dict with:

    * ``prompt``: a chat-style message list -- a system turn with one text
      part, then a user turn with a video part followed by a text part.
    * ``answer``: the JSON serialization of ``new_data``.

    NOTE(review): ``example`` is not read in the visible code, and
    ``SYSTEM_PROMPT``, ``video_path``, ``messages_prompt`` and ``new_data``
    are not defined here -- presumably they come from the omitted
    data/prompt section; confirm they are derived from ``example``.
    """
    system_turn = {
        'role': 'system',
        'content': [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_turn = {
        'role': 'user',
        'content': [
            {"type": "video", "video": video_path},
            {"type": "text", "text": messages_prompt},
        ],
    }
    record = {
        'prompt': [system_turn, user_turn],
        'answer': json.dumps(new_data),
    }
    return [record]
def dataset_gen():
    """Yield training records one at a time for ``Dataset.from_generator``.

    Iterates over the module-level ``ds`` and flattens the list returned by
    ``get_prompt_rft`` for each item into a single stream of records.
    """
    for raw_item in ds:
        yield from get_prompt_rft(raw_item)
# Materialize the generator into a Hugging Face dataset.
dataset_train = Dataset.from_generator(dataset_gen)

output_dir = "./outputs/Qwenvl-Instruct-GRPO"
run_name = "Qwen-vl-GRPO"

# All reward functions used to score the sampled completions.
reward_funcs = [
    format_reward_func,
    levenshtein_reward_func,
    json_reward,
]

training_args = GRPOConfig(
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    # Use bf16 when the hardware supports it, otherwise fall back to fp16.
    bf16=is_bf16_supported(),
    fp16=not is_bf16_supported(),
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_generations=2,
    max_prompt_length=None,
    max_completion_length=512,
    max_steps=250,
    # Previously hard-coded to "outputs" while `output_dir` / `run_name` above
    # went unused, so checkpoints and the final saved model ended up in
    # different directories. Pass the variables through so they agree.
    output_dir=output_dir,
    run_name=run_name,
    report_to="none",
)

trainer = GRPOTrainer(
    model=model,
    reward_funcs=reward_funcs,
    args=training_args,
    train_dataset=dataset_train,
    processing_class=tokenizer,
    reward_processing_classes=[tokenizer] * len(reward_funcs),
)

trainer.train()
trainer.save_model(output_dir)
Hello, I just started learning about GRPO. After reading Issue 3016 and Issue 240 in the Issues section, my understanding is that to use GRPO for fine-tuning on video inputs, I must modify `unsloth_zoo/vision_utils.py` to implement the core video processing logic. Based on Issue 240, I believe `unsloth_zoo/vision_utils.py` has already implemented this functionality. Therefore, I wrote a demo file to test the fine-tuning process. The code runs normally, but it seems that `vision_utils.py` was never accessed. Does `GRPOTrainer` call this Python file at all? I would be extremely grateful for your guidance! Below is my demo.py (only the main part is included; the data and prompts are omitted).
# Unsloth asks to be imported before trl/transformers/peft so that its
# optimizations and trainer patches are applied; importing trl first
# triggers a warning and may leave the unpatched classes bound.
from unsloth import FastVisionModel, is_bf16_supported

import torch
from trl import GRPOConfig, GRPOTrainer

# Load the base vision-language model and its processor from a local path.
model, tokenizer = FastVisionModel.from_pretrained(
    "/root/autodl-tmp/LLaMA-Factory/Qwen/Qwen2.5-VL-7B-Instruct",
    # 4-bit quantized loading. The original note suggested switching to 8-bit
    # or float16 when VRAM is short, but 4-bit is already the most
    # memory-frugal of the three; use the others only when VRAM allows.
    load_in_4bit=True,
)

# Wrap the model with LoRA adapters on both the vision and language towers,
# covering attention and MLP modules.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
def get_prompt_rft(example):
    """Build the GRPO training record(s) for one dataset item.

    Returns a one-element list holding a dict with:

    * ``prompt``: a chat-style message list -- a system turn with one text
      part, then a user turn with a video part followed by a text part.
    * ``answer``: the JSON serialization of ``new_data``.

    NOTE(review): ``example`` is not read in the visible code, and
    ``SYSTEM_PROMPT``, ``video_path``, ``messages_prompt`` and ``new_data``
    are not defined here -- presumably they come from the omitted
    data/prompt section; confirm they are derived from ``example``.
    """
    system_turn = {
        'role': 'system',
        'content': [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_turn = {
        'role': 'user',
        'content': [
            {"type": "video", "video": video_path},
            {"type": "text", "text": messages_prompt},
        ],
    }
    record = {
        'prompt': [system_turn, user_turn],
        'answer': json.dumps(new_data),
    }
    return [record]
def dataset_gen():
    """Yield training records one at a time for ``Dataset.from_generator``.

    Iterates over the module-level ``ds`` and flattens the list returned by
    ``get_prompt_rft`` for each item into a single stream of records.
    """
    for raw_item in ds:
        yield from get_prompt_rft(raw_item)
# Materialize the generator into a Hugging Face dataset.
dataset_train = Dataset.from_generator(dataset_gen)

output_dir = "./outputs/Qwenvl-Instruct-GRPO"
run_name = "Qwen-vl-GRPO"

# All reward functions used to score the sampled completions.
reward_funcs = [
    format_reward_func,
    levenshtein_reward_func,
    json_reward,
]

training_args = GRPOConfig(
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    # Use bf16 when the hardware supports it, otherwise fall back to fp16.
    bf16=is_bf16_supported(),
    fp16=not is_bf16_supported(),
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_generations=2,
    max_prompt_length=None,
    max_completion_length=512,
    max_steps=250,
    # Previously hard-coded to "outputs" while `output_dir` / `run_name` above
    # went unused, so checkpoints and the final saved model ended up in
    # different directories. Pass the variables through so they agree.
    output_dir=output_dir,
    run_name=run_name,
    report_to="none",
)

trainer = GRPOTrainer(
    model=model,
    reward_funcs=reward_funcs,
    args=training_args,
    train_dataset=dataset_train,
    processing_class=tokenizer,
    reward_processing_classes=[tokenizer] * len(reward_funcs),
)

trainer.train()
trainer.save_model(output_dir)