In a distributed setting, len(dataloader) returns:
- len(dataset) // (batch_size * num_GPUs) if the dataset is a map-style dataset, but
- len(dataset) // batch_size if the dataset is a datapipe.
This discrepancy makes it a bit difficult to work with torchvision's training recipes, where we often need the size of the dataloader.
Below is an illustration of this discrepancy — you can run the snippet (even without a GPU) with `torchrun --nproc_per_node 4 script.py`:
# Run this with e.g. `torchrun --nproc_per_node 4 script.py`
import torch.utils.data as data
import torch.distributed as dist
import torchdata
def replace_print():
    """Monkey-patch the builtin ``print`` so that only rank 0 emits output.

    Messages from rank 0 are prefixed with "[GPU 0]"; all other ranks are
    silenced. Must be called after ``dist.init_process_group`` since the
    replacement calls ``dist.get_rank()``.
    """
    import builtins as __builtin__

    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        # Only rank 0 prints; every other rank drops the message entirely.
        if dist.get_rank() == 0:
            # Plain string literal: the original used an f-string with no
            # placeholders (ruff F541); output is identical.
            builtin_print("[GPU 0]", *args, **kwargs)

    __builtin__.print = print
# Setting up DDP - you can ignore this
# (relies on the env vars torchrun sets; "gloo" backend works without a GPU)
dist.init_process_group(backend="gloo")
replace_print()
dist.barrier()
size = 800
# Datapipe path: sharding_filter() splits samples across ranks at iteration
# time, but len(dl) below still reports len(dataset) // batch_size,
# i.e. 800 // 10 — the world size (4 per the torchrun command) is ignored.
dp = torchdata.datapipes.iter.IterableWrapper(range(size)).sharding_filter()
dl = data.DataLoader(dp, batch_size=10, num_workers=4, drop_last=True)
print(f"with dp, {len(dl) = }")
# Gives : 80
# Map-style path: DistributedSampler hands each rank its own shard, so
# len(dl) reports len(dataset) // (batch_size * world_size) = 800 // (10 * 4).
ds = list(range(size))
dl = data.DataLoader(ds, batch_size=10, num_workers=4, drop_last=True, sampler=data.DistributedSampler(ds, shuffle=False))
print(f"with mapstyle, {len(dl) = }")
# Gives: 20