Skip to content

[0.4.1] ValueError: Attempting to unscale FP16 gradients. #834

@carmocca

Description

@carmocca

🐛 Bug

Traceback (most recent call last):
  File "kk.py", line 43, in <module>
    trainer.fit(model, DataLoader(RandomDataset(32, 64), batch_size=2))
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 552, in fit
    self._run(model)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 922, in _run
    self._dispatch()
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 990, in _dispatch
    self.accelerator.start_training(self)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 92, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 161, in start_training
    self._results = trainer.run_stage()
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1000, in run_stage
    return self._run_train()
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1049, in _run_train
    self.fit_loop.run()
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
    self.advance(*args, **kwargs)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 200, in advance
    epoch_output = self.epoch_loop.run(train_dataloader)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
    self.advance(*args, **kwargs)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 130, in advance
    batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 100, in run
    super().run(batch, batch_idx, dataloader_idx)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
    self.advance(*args, **kwargs)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 147, in advance
    result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 201, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 395, in _optimizer_step
    model_ref.optimizer_step(
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1616, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 206, in step
    self.__optimizer_step(closure=closure, profiler_name=profiler_name, **kwargs)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 128, in __optimizer_step
    trainer.accelerator.optimizer_step(self._optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 292, in optimizer_step
    make_optimizer_step = self.precision_plugin.pre_optimizer_step(
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 57, in pre_optimizer_step
    result = lambda_closure()  # native amp does not support closures
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 235, in _training_step_and_backward_closure
    result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 548, in training_step_and_backward
    self.backward(result, optimizer, opt_idx)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 589, in backward
    result.closure_loss = self.trainer.accelerator.backward(result.closure_loss, optimizer, *args, **kwargs)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 278, in backward
    closure_loss = self.precision_plugin.post_backward(self.lightning_module, closure_loss, optimizer)
  File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 77, in post_backward
    self.scaler.unscale_(optimizer)
  File "/home/carlos/venv/lib/python3.8/site-packages/fairscale/optim/grad_scaler.py", line 62, in unscale_
    super().unscale_(optimizer)
  File "/home/carlos/venv/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 279, in unscale_
    optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, False)
  File "/home/carlos/venv/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 207, in _unscale_grads_
    raise ValueError("Attempting to unscale FP16 gradients.")

Command

python script.py on a GPU machine

To Reproduce

import torch
from fairscale.nn import wrap
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.plugins import DDPFullyShardedPlugin


class RandomDataset(Dataset):
    """Map-style dataset that serves rows of a pre-generated random tensor.

    All samples are drawn once at construction time, so repeated indexing
    is deterministic for the lifetime of the dataset instance.
    """

    def __init__(self, size, length):
        # `data` has shape (length, size); each row is one sample.
        self.len = length
        self.data = torch.randn(length, size)

    def __len__(self):
        # Number of samples generated at construction.
        return self.len

    def __getitem__(self, index):
        # Return the pre-generated row for this index.
        return self.data[index]


class TestFSDPModel(LightningModule):
    """Minimal LightningModule used to reproduce the FSDP fp16 unscale error.

    The model is a tiny MLP; sharding is applied lazily in
    ``configure_sharded_model`` via fairscale's ``wrap``.
    """

    def __init__(self):
        super().__init__()
        # Small feed-forward stack; replaced by its wrapped version later.
        self.layer = torch.nn.Sequential(
            torch.nn.Linear(32, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 2),
        )

    def forward(self, x):
        return self.layer(x)

    def configure_sharded_model(self):
        # Wrap the module with fairscale so it is sharded under the
        # fully-sharded training plugin (presumably ZeRO-3 style — see
        # the issue label).
        self.layer = wrap(self.layer)

    def training_step(self, batch, batch_idx):
        # Trivial scalar loss: sum of the model outputs for the batch.
        return self(batch).sum()

    def configure_optimizers(self):
        # Plain SGD over the (possibly wrapped) layer's parameters.
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


if __name__ == "__main__":
    # Single-GPU run with the fully-sharded plugin and native fp16 AMP;
    # this configuration triggers the reported unscale error.
    dataset = RandomDataset(32, 64)
    trainer = Trainer(gpus=1, plugins=DDPFullyShardedPlugin(), precision=16, max_epochs=1)
    trainer.fit(TestFSDPModel(), DataLoader(dataset, batch_size=2))

Expected behavior

I'm not entirely sure whether this is a problem on our end or in the fairscale release, but this did work with the fairscale 0.4.0 release.

cc @SeanNaren

Environment

pytorch-lightning==1.4.9
torch==1.9.0+cu111
fairscale==0.4.1

Thank you for your help!

Metadata

Metadata

Labels

FSDP / FullyShardedDataParallel (ZeRO-3)

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions