Skip to content

passing past_key_values as a tuple is deprecated, but unclear how to resolve #33489

@RonanKMcGovern

Description

@RonanKMcGovern

System Info

Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.

  • transformers version: 4.44.2
  • Platform: Linux-5.4.0-167-generic-x86_64-with-glibc2.35
  • Python version: 3.10.12
  • Huggingface_hub version: 0.24.7
  • Safetensors version: 0.4.5
  • Accelerate version: 0.34.2
  • Accelerate config: not found
  • PyTorch version (GPU?): 2.1.1+cu121 (True)
  • Tensorflow version (GPU?): not installed (NA)
  • Flax version (CPU?/GPU?/TPU?): not installed (NA)
  • Jax version: not installed
  • JaxLib version: not installed
  • Using distributed or parallel set-up in script?: NA
  • Using GPU in script?: yes
  • GPU type: NVIDIA A40

Who can help?

@ArthurZucker

Information

  • The official example scripts
  • My own modified scripts

Tasks

  • An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
  • My own task or dataset (give details below)

Reproduction

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, SFTConfig
from accelerate import Accelerator
from peft import LoraConfig
import math, os, random
from datetime import datetime

# Select rows to train on
initial_rows = 50000    # rows used for the main SFT phase
annealing_rows = 10000  # additional rows used for the annealing phase
eval_rows = 10000       # Only 10000 rows for evaluation

batch_size = 8  # per-device micro-batch size
ga = 4          # gradient-accumulation steps (effective batch = batch_size * ga)

learning_rate = 1e-3

def setup_environment():
    """Disable Weights & Biases reporting and return the Accelerator for this run."""
    os.environ.update({'WANDB_DISABLED': 'true'})
    return Accelerator()

def load_model_and_tokenizer():
    """Load the 80M base model in bfloat16 plus the SmolLM-360M-Instruct tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` — note the tokenizer comes from a
        different repo than the model weights.
    """
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M-Instruct")
    model = AutoModelForCausalLM.from_pretrained(
        "Trelis/80M-0.0090-cosmopedia",
        torch_dtype=torch.bfloat16,
    )
    return model, tokenizer

def load_and_preprocess_train_dataset(start_idx, num_rows):
    """Stream a window of WebInstructSub rows and chat-format them.

    Args:
        start_idx: number of rows to skip from the start of the stream.
        num_rows: number of rows to take after skipping.

    Returns:
        A streaming (iterable) dataset whose rows carry a ``messages`` list
        of user/assistant turns.
    """
    stream = load_dataset("TIGER-Lab/WebInstructSub", split="train",
                          streaming=True)
    window = stream.skip(start_idx).take(num_rows)

    def to_messages(row):
        # Pair each question/answer row into a two-turn chat transcript.
        user_turn = {"role": "user", "content": row["question"]}
        assistant_turn = {"role": "assistant", "content": row["answer"]}
        return {"messages": [user_turn, assistant_turn]}

    return window.map(to_messages)

# Cached tokenizer for the formatting function below. The original code
# called AutoTokenizer.from_pretrained() on EVERY invocation — i.e. once per
# training example — repeatedly hitting the Hub/disk cache. Loading lazily
# once and reusing it fixes that without changing the function's signature.
_CHAT_TOKENIZER = None


def format_instruction_for_trainer(example):
    """Render one example's ``messages`` into a chat-template string for TRL.

    Args:
        example: a dataset row containing a ``messages`` list of chat turns.

    Returns:
        str: the untokenized chat-template text (``tokenize=False``).

    NOTE(review): with ``tokenize=False`` the ``truncation``/``padding``/
    ``max_length`` arguments presumably have no effect on the returned
    string — confirm against the tokenizer docs; they are kept to preserve
    the original call exactly.
    """
    global _CHAT_TOKENIZER
    if _CHAT_TOKENIZER is None:  # load once, on first use
        _CHAT_TOKENIZER = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M-Instruct")

    return _CHAT_TOKENIZER.apply_chat_template(
        example["messages"],
        truncation=True,
        padding="max_length",
        max_length=2048,
        tokenize=False,
    )

def load_and_preprocess_eval_dataset():
    """Sample ``eval_rows`` random WebInstructSub rows and chat-format them.

    Returns:
        A non-streaming dataset whose only column is ``messages`` (all
        original columns are dropped by the map).

    NOTE(review): ``random.sample`` is unseeded here, so the evaluation
    subset differs between runs.
    """
    full = load_dataset("TIGER-Lab/WebInstructSub", split="train")

    # Pick eval_rows distinct random indices across the whole split.
    picks = random.sample(range(len(full)), eval_rows)
    sampled = full.select(picks)

    def to_messages(row):
        return {
            "messages": [
                {"role": "user", "content": row["question"]},
                {"role": "assistant", "content": row["answer"]},
            ]
        }

    return sampled.map(to_messages, remove_columns=sampled.column_names)

def main():
    """Run the SFT pipeline end to end: load data and model, train, save.

    Fixes relative to the original:
      * ``optimizers=(optimizer, lr_scheduler)`` was passed to ``SFTTrainer``
        but neither name was ever defined, so the script raised ``NameError``
        before training began. The argument is omitted so the trainer builds
        its default optimizer and LR scheduler from ``training_args``.
      * ``accelerator.prepare(trainer)`` was removed: ``Accelerator.prepare``
        accepts models/optimizers/dataloaders, not a Trainer object, and the
        HF Trainer already manages Accelerate internally.
      * Removed the unused local ``initial_steps``.
    """
    accelerator = setup_environment()  # also disables W&B via env var

    model, tokenizer = load_model_and_tokenizer()
    print(model.device)

    # Combined training dataset (streaming).
    total_rows = initial_rows + annealing_rows
    train_dataset = load_and_preprocess_train_dataset(0, total_rows)

    # Evaluation dataset (non-streaming, `eval_rows` random rows).
    eval_dataset = load_and_preprocess_eval_dataset()

    # Optimizer-step budget: rows per epoch divided by the effective batch
    # size (per-device batch * gradient accumulation).
    num_epochs = 1
    total_steps = (total_rows * num_epochs) // (batch_size * ga)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"SFT-{total_rows}rows-lr{learning_rate}-{timestamp}"

    training_args = SFTConfig(
        output_dir=f"./Trelis_local/80M-0.015-cosmopedia-SFT-{run_name}",
        run_name=run_name,
        logging_dir=f"./logs/{run_name}",
        eval_strategy="steps",
        save_strategy="steps",
        report_to="tensorboard",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=20,
        # Log/eval/save every 10% of the run.
        logging_steps=int(total_steps * 0.1),
        eval_steps=int(total_steps * 0.1),
        save_steps=int(total_steps * 0.1),
        learning_rate=learning_rate,
        bf16=True,
        max_steps=total_steps,
        gradient_accumulation_steps=ga,
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=2048,
        formatting_func=format_instruction_for_trainer,
    )

    print(f"Starting instruction fine-tuning on {total_rows} rows of data (streaming)...")
    trainer.train()
    print("Instruction fine-tuning completed. Saving model...")

    trainer.save_model("./finetuned_model_small_messages")


if __name__ == "__main__":
    main()

Expected behavior

Getting this error:

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)

I don't expect an error here, and it's unclear what I need to update if I'm to use an appropriate Cache class.

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions