-
Notifications
You must be signed in to change notification settings - Fork 32.5k
Closed
Closed
Copy link
Labels
Description
System Info
Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.
- transformers version: 4.44.2
- Platform: Linux-5.4.0-167-generic-x86_64-with-glibc2.35
- Python version: 3.10.12
- Huggingface_hub version: 0.24.7
- Safetensors version: 0.4.5
- Accelerate version: 0.34.2
- Accelerate config: not found
- PyTorch version (GPU?): 2.1.1+cu121 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using distributed or parallel set-up in script?: NA
- Using GPU in script?: yes
- GPU type: NVIDIA A40
Who can help?
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, SFTConfig
from accelerate import Accelerator
from peft import LoraConfig
import math, os, random
from datetime import datetime
# Select rows to train on
# Dataset-slicing and optimization hyperparameters shared by the functions below.
initial_rows = 50000    # rows used for the main SFT phase
annealing_rows = 10000  # extra rows appended for the annealing phase
eval_rows = 10000 # Only 10000 rows for evaluation
batch_size = 8    # per-device micro-batch size
ga = 4            # gradient-accumulation steps (effective batch = batch_size * ga)
learning_rate=1e-3
def setup_environment():
    """Disable Weights & Biases reporting and return a fresh Accelerator.

    Returns:
        accelerate.Accelerator: the accelerator used to prepare the trainer.
    """
    os.environ['WANDB_DISABLED'] = 'true'
    accelerator = Accelerator()
    return accelerator
def load_model_and_tokenizer():
    """Load the 80M cosmopedia checkpoint (bfloat16) and the SmolLM chat tokenizer.

    Returns:
        tuple: (model, tokenizer) ready for supervised fine-tuning.
    """
    checkpoint = "Trelis/80M-0.0090-cosmopedia"
    # NOTE: the tokenizer comes from a different repo than the model weights;
    # presumably the model was trained with the SmolLM vocabulary — verify.
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M-Instruct")
    model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
    return model, tokenizer
def load_and_preprocess_train_dataset(start_idx, num_rows):
    """Stream a window of WebInstructSub rows mapped to chat-message format.

    Args:
        start_idx: number of leading rows to skip in the streamed split.
        num_rows: number of rows to take after skipping.

    Returns:
        A streaming (iterable) dataset whose rows carry a "messages" field.
    """
    def to_messages(row):
        # Turn one question/answer pair into the chat schema the trainer expects.
        return {
            "messages": [
                {"role": "user", "content": row["question"]},
                {"role": "assistant", "content": row["answer"]},
            ]
        }

    stream = load_dataset("TIGER-Lab/WebInstructSub", split="train", streaming=True)
    window = stream.skip(start_idx).take(num_rows)
    return window.map(to_messages)
def format_instruction_for_trainer(example):
    """Render one example's chat messages into a prompt string via the chat template.

    Args:
        example: a mapping with a "messages" key (list of {"role", "content"} dicts).

    Returns:
        str: the chat-templated text (tokenize=False returns a string, not ids).

    The original version called ``AutoTokenizer.from_pretrained`` on EVERY
    invocation — once per dataset row — which is extremely slow. The tokenizer
    is now loaded once and cached on the function object; the interface is
    unchanged.
    """
    tokenizer = getattr(format_instruction_for_trainer, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M-Instruct")
        format_instruction_for_trainer._tokenizer = tokenizer
    # truncation/padding/max_length are forwarded unchanged; with tokenize=False
    # they only affect behavior if the template path tokenizes internally.
    return tokenizer.apply_chat_template(
        example["messages"],
        truncation=True,
        padding="max_length",
        max_length=2048,
        tokenize=False,
    )
def load_and_preprocess_eval_dataset():
    """Sample `eval_rows` random rows from WebInstructSub and format them as chat messages.

    Returns:
        A non-streaming dataset whose only column is "messages".

    NOTE(review): `random.sample` is unseeded, so the evaluation subset differs
    between runs — confirm that is intended.
    """
    full_split = load_dataset("TIGER-Lab/WebInstructSub", split="train")
    # Pick a uniform random subset of indices across the whole split.
    chosen = random.sample(range(len(full_split)), eval_rows)
    subset = full_split.select(chosen)

    def to_messages(row):
        return {
            "messages": [
                {"role": "user", "content": row["question"]},
                {"role": "assistant", "content": row["answer"]},
            ]
        }

    # Drop the original columns so only "messages" remains.
    return subset.map(to_messages, remove_columns=subset.column_names)
def main():
    """Run streaming SFT on WebInstructSub and save the fine-tuned model.

    Builds the streaming train set and random eval set, derives step counts
    from the row/batch configuration, then trains with TRL's SFTTrainer.
    """
    accelerator = setup_environment()
    model, tokenizer = load_model_and_tokenizer()
    print(model.device)

    # Combined training dataset (streaming)
    total_rows = initial_rows + annealing_rows
    train_dataset = load_and_preprocess_train_dataset(0, total_rows)
    # Evaluation dataset (non-streaming, randomly sampled rows)
    eval_dataset = load_and_preprocess_eval_dataset()

    # Calculate optimizer steps from rows / (micro-batch * grad-accumulation).
    num_epochs = 1
    total_steps = (total_rows * num_epochs) // (batch_size * ga)
    initial_steps = (initial_rows * num_epochs) // (batch_size * ga)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"SFT-{total_rows}rows-lr{learning_rate}-{timestamp}"
    training_args = SFTConfig(
        output_dir=f"./Trelis_local/80M-0.015-cosmopedia-SFT-{run_name}",
        run_name=run_name,
        logging_dir=f"./logs/{run_name}",
        eval_strategy="steps",
        save_strategy="steps",
        report_to="tensorboard",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=20,
        # Log/eval/save every 10% of the run.
        logging_steps=int(total_steps * 0.1),
        eval_steps=int(total_steps * 0.1),
        save_steps=int(total_steps * 0.1),
        learning_rate=learning_rate,
        bf16=True,
        max_steps=total_steps,
        gradient_accumulation_steps=ga,
    )

    # BUG FIX: the original passed `optimizers=(optimizer, lr_scheduler)`, but
    # neither name is defined anywhere in the script, so main() crashed with a
    # NameError before training began. Omitting the kwarg lets SFTTrainer build
    # its default AdamW optimizer and scheduler from `training_args`.
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=2048,
        formatting_func=format_instruction_for_trainer,
    )

    # NOTE(review): Trainer manages its own Accelerator internally; wrapping the
    # whole Trainer in accelerator.prepare() is unusual — confirm it is needed.
    trainer = accelerator.prepare(trainer)

    print(f"Starting instruction fine-tuning on {total_rows} rows of data (streaming)...")
    trainer.train()

    print("Instruction fine-tuning completed. Saving model...")
    trainer.save_model("./finetuned_model_small_messages")


if __name__ == "__main__":
    main()
Expected behavior
Getting this error:
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
I don't expect an error here, and it's unclear what I need to update if I'm to use an appropriate Cache class.
Reactions are currently unavailable