-
Notifications
You must be signed in to change notification settings - Fork 296
[0.4.1] ValueError: Attempting to unscale FP16 gradients. #834
Copy link
Copy link
Closed
Labels
FSDP FullyShardedDataParallel (zero-3)
Description
🐛 Bug
Traceback (most recent call last):
File "kk.py", line 43, in <module>
trainer.fit(model, DataLoader(RandomDataset(32, 64), batch_size=2))
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 552, in fit
self._run(model)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 922, in _run
self._dispatch()
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 990, in _dispatch
self.accelerator.start_training(self)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 92, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 161, in start_training
self._results = trainer.run_stage()
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1000, in run_stage
return self._run_train()
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1049, in _run_train
self.fit_loop.run()
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
self.advance(*args, **kwargs)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 200, in advance
epoch_output = self.epoch_loop.run(train_dataloader)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
self.advance(*args, **kwargs)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 130, in advance
batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 100, in run
super().run(batch, batch_idx, dataloader_idx)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
self.advance(*args, **kwargs)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 147, in advance
result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 201, in _run_optimization
self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 395, in _optimizer_step
model_ref.optimizer_step(
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1616, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 206, in step
self.__optimizer_step(closure=closure, profiler_name=profiler_name, **kwargs)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 128, in __optimizer_step
trainer.accelerator.optimizer_step(self._optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 292, in optimizer_step
make_optimizer_step = self.precision_plugin.pre_optimizer_step(
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 57, in pre_optimizer_step
result = lambda_closure() # native amp does not support closures
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 235, in _training_step_and_backward_closure
result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 548, in training_step_and_backward
self.backward(result, optimizer, opt_idx)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 589, in backward
result.closure_loss = self.trainer.accelerator.backward(result.closure_loss, optimizer, *args, **kwargs)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 278, in backward
closure_loss = self.precision_plugin.post_backward(self.lightning_module, closure_loss, optimizer)
File "/home/carlos/venv/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 77, in post_backward
self.scaler.unscale_(optimizer)
File "/home/carlos/venv/lib/python3.8/site-packages/fairscale/optim/grad_scaler.py", line 62, in unscale_
super().unscale_(optimizer)
File "/home/carlos/venv/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 279, in unscale_
optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, False)
File "/home/carlos/venv/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 207, in _unscale_grads_
raise ValueError("Attempting to unscale FP16 gradients.")

Command
python script.py on a GPU machine
To Reproduce
import torch
from fairscale.nn import wrap
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.plugins import DDPFullyShardedPlugin
class RandomDataset(Dataset):
    """A fixed pool of ``length`` random vectors, each of dimension ``size``.

    All samples are materialized once at construction time; indexing is a
    plain row lookup into the pre-built tensor.
    """

    def __init__(self, size, length):
        # Build every sample up front; `len` mirrors the tensor's first dim.
        self.data = torch.randn(length, size)
        self.len = length

    def __getitem__(self, index):
        # Row lookup — returns a 1-D tensor of shape (size,).
        return self.data[index]

    def __len__(self):
        return self.len
class TestFSDPModel(LightningModule):
    """Minimal LightningModule used to reproduce the FSDP fp16 unscale error.

    A tiny MLP whose single submodule is wrapped by fairscale's ``wrap`` in
    ``configure_sharded_model`` so it is sharded under the
    ``DDPFullyShardedPlugin`` training type plugin.
    """

    def __init__(self):
        super().__init__()
        # Small MLP: 32 -> 32 -> ReLU -> 2.
        self.layer = torch.nn.Sequential(
            torch.nn.Linear(32, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 2),
        )

    def forward(self, x):
        return self.layer(x)

    def configure_sharded_model(self):
        # Re-wrap the submodule for sharding once the FSDP context is active.
        self.layer = wrap(self.layer)

    def training_step(self, batch, batch_idx):
        # Scalar loss: sum over all outputs for the batch.
        out = self(batch)
        return out.sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)
if __name__ == "__main__":
model = TestFSDPModel()
trainer = Trainer(gpus=1, plugins=DDPFullyShardedPlugin(), precision=16, max_epochs=1)
trainer.fit(model, DataLoader(RandomDataset(32, 64), batch_size=2))

Expected behavior
I'm not entirely sure whether this is a problem on our end or in the release. But this did work with the 0.4.0 release.
cc @SeanNaren
Environment
pytorch-lightning==1.4.9
torch==1.9.0+cu111
fairscale==0.4.1

Thank you for your help!
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
FSDP FullyShardedDataParallel (zero-3)