-
Notifications
You must be signed in to change notification settings - Fork 32.5k
Closed
Closed
Copy link
Labels
Description
System Info
Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.
- transformers version: 4.44.2
- Platform: Linux-5.4.0-167-generic-x86_64-with-glibc2.35
- Python version: 3.10.12
- Huggingface_hub version: 0.24.7
- Safetensors version: 0.4.5
- Accelerate version: 0.34.2
- Accelerate config: not found
- PyTorch version (GPU?): 2.1.1+cu121 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using distributed or parallel set-up in script?: NA
- Using GPU in script?: yes
- GPU type: NVIDIA A40
Who can help?
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, SFTConfig
from accelerate import Accelerator
from peft import LoraConfig
import math, os, random
from datetime import datetime
# Select rows to train on
# Dataset-slicing and optimization hyperparameters shared by the functions below.
initial_rows = 50000    # rows used for the main SFT phase
annealing_rows = 10000  # extra rows appended for the annealing phase
eval_rows = 10000 # Only 10000 rows for evaluation
batch_size = 8    # per-device micro-batch size
ga = 4            # gradient-accumulation steps (effective batch = batch_size * ga)
learning_rate=1e-3
def setup_environment():
    """Disable Weights & Biases reporting and return a fresh Accelerator.

    Returns:
        accelerate.Accelerator: the accelerator used to prepare the trainer.
    """
    os.environ['WANDB_DISABLED'] = 'true'
    accelerator = Accelerator()
    return accelerator
def load_model_and_tokenizer():
    """Load the 80M cosmopedia checkpoint (bfloat16) and the SmolLM chat tokenizer.

    Returns:
        tuple: (model, tokenizer) ready for supervised fine-tuning.
    """
    checkpoint = "Trelis/80M-0.0090-cosmopedia"
    # NOTE: the tokenizer comes from a different repo than the model weights;
    # presumably the model was trained with the SmolLM vocabulary — verify.
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M-Instruct")
    model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
    return model, tokenizer
def load_and_preprocess_train_dataset(start_idx, num_rows):
    """Stream a window of WebInstructSub rows mapped to chat-message format.

    Args:
        start_idx: number of leading rows to skip in the streamed split.
        num_rows: number of rows to take after skipping.

    Returns:
        A streaming (iterable) dataset whose rows carry a "messages" field.
    """
    def to_messages(row):
        # Turn one question/answer pair into the chat schema the trainer expects.
        return {
            "messages": [
                {"role": "user", "content": row["question"]},
                {"role": "assistant", "content": row["answer"]},
            ]
        }

    stream = load_dataset("TIGER-Lab/WebInstructSub", split="train", streaming=True)
    window = stream.skip(start_idx).take(num_rows)
    return window.map(to_messages)
def format_instruction_for_trainer(example):
    """Render one example's chat messages into a prompt string via the chat template.

    Args:
        example: a mapping with a "messages" key (list of {"role", "content"} dicts).

    Returns:
        str: the chat-templated text (tokenize=False returns a string, not ids).

    The original version called ``AutoTokenizer.from_pretrained`` on EVERY
    invocation — once per dataset row — which is extremely slow. The tokenizer
    is now loaded once and cached on the function object; the interface is
    unchanged.
    """
    tokenizer = getattr(format_instruction_for_trainer, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M-Instruct")
        format_instruction_for_trainer._tokenizer = tokenizer
    # truncation/padding/max_length are forwarded unchanged; with tokenize=False
    # they only affect behavior if the template path tokenizes internally.
    return tokenizer.apply_chat_template(
        example["messages"],
        truncation=True,
        padding="max_length",
        max_length=2048,
        tokenize=False,
    )
def load_and_preprocess_eval_dataset():
    """Sample `eval_rows` random rows from WebInstructSub and format them as chat messages.

    Returns:
        A non-streaming dataset whose only column is "messages".

    NOTE(review): `random.sample` is unseeded, so the evaluation subset differs
    between runs — confirm that is intended.
    """
    full_split = load_dataset("TIGER-Lab/WebInstructSub", split="train")
    # Pick a uniform random subset of indices across the whole split.
    chosen = random.sample(range(len(full_split)), eval_rows)
    subset = full_split.select(chosen)

    def to_messages(row):
        return {
            "messages": [
                {"role": "user", "content": row["question"]},
                {"role": "assistant", "content": row["answer"]},
            ]
        }

    # Drop the original columns so only "messages" remains.
    return subset.map(to_messages, remove_columns=subset.column_names)
def main():
    """Run streaming SFT on WebInstructSub and save the fine-tuned model.

    Builds the streaming train set and random eval set, derives step counts
    from the row/batch configuration, then trains with TRL's SFTTrainer.
    """
    accelerator = setup_environment()
    model, tokenizer = load_model_and_tokenizer()
    print(model.device)

    # Combined training dataset (streaming)
    total_rows = initial_rows + annealing_rows
    train_dataset = load_and_preprocess_train_dataset(0, total_rows)
    # Evaluation dataset (non-streaming, randomly sampled rows)
    eval_dataset = load_and_preprocess_eval_dataset()

    # Calculate optimizer steps from rows / (micro-batch * grad-accumulation).
    num_epochs = 1
    total_steps = (total_rows * num_epochs) // (batch_size * ga)
    initial_steps = (initial_rows * num_epochs) // (batch_size * ga)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"SFT-{total_rows}rows-lr{learning_rate}-{timestamp}"
    training_args = SFTConfig(
        output_dir=f"./Trelis_local/80M-0.015-cosmopedia-SFT-{run_name}",
        run_name=run_name,
        logging_dir=f"./logs/{run_name}",
        eval_strategy="steps",
        save_strategy="steps",
        report_to="tensorboard",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=20,
        # Log/eval/save every 10% of the run.
        logging_steps=int(total_steps * 0.1),
        eval_steps=int(total_steps * 0.1),
        save_steps=int(total_steps * 0.1),
        learning_rate=learning_rate,
        bf16=True,
        max_steps=total_steps,
        gradient_accumulation_steps=ga,
    )

    # BUG FIX: the original passed `optimizers=(optimizer, lr_scheduler)`, but
    # neither name is defined anywhere in the script, so main() crashed with a
    # NameError before training began. Omitting the kwarg lets SFTTrainer build
    # its default AdamW optimizer and scheduler from `training_args`.
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=2048,
        formatting_func=format_instruction_for_trainer,
    )

    # NOTE(review): Trainer manages its own Accelerator internally; wrapping the
    # whole Trainer in accelerator.prepare() is unusual — confirm it is needed.
    trainer = accelerator.prepare(trainer)

    print(f"Starting instruction fine-tuning on {total_rows} rows of data (streaming)...")
    trainer.train()

    print("Instruction fine-tuning completed. Saving model...")
    trainer.save_model("./finetuned_model_small_messages")


if __name__ == "__main__":
    main()
Expected behavior
Getting this error:
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
I don't expect an error here, and it's unclear what I need to update if I'm to use an appropriate Cache class.
Reactions are currently unavailable