_______________________ TestDistributed.test_rloo[fsdp2] _______________________
self = <tests.distributed.test_distributed.TestDistributed object at 0x7ff53a844ad0>
config = 'fsdp2'
get_config_path = <function get_config_path.<locals>._get_config_path at 0x7ff53a50fd80>
@pytest.mark.parametrize(
"config",
[
"ddp",
pytest.param(
"zero2",
marks=pytest.mark.xfail(
Version(transformers.__version__) == Version("5.1.0"),
reason="Upstream incompatibility: deepspeed and transformers==5.1.0 (see transformers#43780)",
),
),
pytest.param("zero3", marks=pytest.mark.xfail(reason="ZeRO 3 is currently failing, see #4899")),
"fsdp2",
],
)
def test_rloo(self, config, get_config_path):
# fmt: off
> run_command(
[
"accelerate", "launch", "--config_file", get_config_path(config), "trl/scripts/rloo.py",
"--output_dir", self.tmp_dir,
"--model_name_or_path", "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
"--dataset_name", "trl-internal-testing/zen",
"--dataset_config", "conversational_prompt_only",
"--reward_model_name_or_path", "trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
],
os.environ.copy(),
)
tests/distributed/test_distributed.py:242:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
command = ['accelerate', 'launch', '--config_file', PosixPath('/tmp/pytest-of-root/pytest-0/test_rloo_fsdp2_0/data/accelerate_configs/fsdp2.yaml'), 'trl/scripts/rloo.py', '--output_dir', ...]
env = {'ACTIONS_ORCHESTRATION_ID': '34e8d70d-b149-49b6-baf2-06c01e941646.distributed_smoke.__default', 'CI': 'true', 'CI_SLACK_CHANNEL': '***', 'CUDA_VERSION': '12.8.1', ...}
def run_command(command: list[str], env: dict[str, str]) -> None:
result = subprocess.run(command, env=env, cwd=ROOT)
> assert result.returncode == 0
E AssertionError: assert 1 == 0
E + where 1 = CompletedProcess(args=['accelerate', 'launch', '--config_file', PosixPath('/tmp/pytest-of-root/pytest-0/test_rloo_fsdp2_0/data/accelerate_configs/fsdp2.yaml'), 'trl/scripts/rloo.py', '--output_dir', '/tmp/pytest-of-root/pytest-0/test_rloo_fsdp2_0', '--model_name_or_path', 'trl-internal-testing/tiny-Qwen2ForCausalLM-2.5', '--dataset_name', 'trl-internal-testing/zen', '--dataset_config', 'conversational_prompt_only', '--reward_model_name_or_path', 'trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5'], returncode=1).returncode
tests/distributed/test_distributed.py:32: AssertionError
----------------------------- Captured stderr call -----------------------------
/pytorch/aten/src/ATen/native/cuda/TensorCompare.cu:112: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.
[rank1]: Traceback (most recent call last):
[rank1]: File "/fsx/albert/dev/trl/trl/scripts/rloo.py", line 169, in <module>
[rank1]: main(script_args, training_args, model_args, dataset_args)
[rank1]: File "/fsx/albert/dev/trl/trl/scripts/rloo.py", line 141, in main
[rank1]: trainer.train()
[rank1]: File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1424, in train
[rank1]: return inner_training_loop(
[rank1]: File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1506, in _inner_training_loop
[rank1]: self._run_epoch(
[rank1]: File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1734, in _run_epoch
[rank1]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
[rank1]: File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 748, in training_step
[rank1]: output = super().training_step(model, inputs, num_items_in_batch)
[rank1]: File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1900, in training_step
[rank1]: inputs = self._prepare_inputs(inputs)
[rank1]: File "/fsx/albert/dev/trl/trl/extras/profiling.py", line 202, in wrapper
[rank1]: return func(self, *args, **kwargs)
[rank1]: File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 777, in _prepare_inputs
[rank1]: generation_batch = self._generate_and_score_completions(generation_batch)
[rank1]: File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1118, in _generate_and_score_completions
[rank1]: prompt_ids_list, completion_ids_list, completions = self._generate(prompts)
[rank1]: File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1043, in _generate
[rank1]: completion_ids = self._generate_single_turn(prompt_ids, images, multimodal_fields)
[rank1]: File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1016, in _generate_single_turn
[rank1]: prompt_completion_ids = unwrapped_model.generate(
[rank1]: File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/distributed/fsdp/_fully_shard/_fully_shard.py", line 658, in wrapped_method
[rank1]: out = orig_method(*args, **kwargs)
[rank1]: File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
[rank1]: return func(*args, **kwargs)
[rank1]: File "/fsx/albert/dev/transformers/src/transformers/generation/utils.py", line 2555, in generate
[rank1]: result = decoding_method(
[rank1]: File "/fsx/albert/dev/transformers/src/transformers/generation/utils.py", line 2803, in _sample
[rank1]: next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
[rank1]: torch.AcceleratorError: CUDA error: device-side assert triggered
[rank1]: Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
[rank0]: Traceback (most recent call last):
[rank0]: File "/fsx/albert/dev/trl/trl/scripts/rloo.py", line 169, in <module>
[rank0]: main(script_args, training_args, model_args, dataset_args)
[rank0]: File "/fsx/albert/dev/trl/trl/scripts/rloo.py", line 141, in main
[rank0]: trainer.train()
[rank0]: File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1424, in train
[rank0]: return inner_training_loop(
[rank0]: File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1506, in _inner_training_loop
[rank0]: self._run_epoch(
[rank0]: File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1734, in _run_epoch
[rank0]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
[rank0]: File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 748, in training_step
[rank0]: output = super().training_step(model, inputs, num_items_in_batch)
[rank0]: File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1900, in training_step
[rank0]: inputs = self._prepare_inputs(inputs)
[rank0]: File "/fsx/albert/dev/trl/trl/extras/profiling.py", line 202, in wrapper
[rank0]: return func(self, *args, **kwargs)
[rank0]: File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 777, in _prepare_inputs
[rank0]: generation_batch = self._generate_and_score_completions(generation_batch)
[rank0]: File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1118, in _generate_and_score_completions
[rank0]: prompt_ids_list, completion_ids_list, completions = self._generate(prompts)
[rank0]: File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1043, in _generate
[rank0]: completion_ids = self._generate_single_turn(prompt_ids, images, multimodal_fields)
[rank0]: File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1016, in _generate_single_turn
[rank0]: prompt_completion_ids = unwrapped_model.generate(
[rank0]: File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/distributed/fsdp/_fully_shard/_fully_shard.py", line 658, in wrapped_method
[rank0]: out = orig_method(*args, **kwargs)
[rank0]: File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/fsx/albert/dev/transformers/src/transformers/generation/utils.py", line 2555, in generate
[rank0]: result = decoding_method(
[rank0]: File "/fsx/albert/dev/transformers/src/transformers/generation/utils.py", line 2803, in _sample
[rank0]: next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
[rank0]: torch.AcceleratorError: CUDA error: device-side assert triggered
[rank0]: Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
[rank0]:[E327 10:04:44.068014116 ProcessGroupNCCL.cpp:2068] [PG ID 0 PG GUID 0(default_pg) Rank 0] Process group watchdog thread terminated with exception: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Exception raised from c10_cuda_check_implementation at /pytorch/c10/cuda/CUDAException.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7f3f75476eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x111c7 (0x7f3f755091c7 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x50 (0x7f3f763cd640 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0x68 (0x7f3f763dce28 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: c10d::ProcessGroupNCCL::Watchdog::runLoop() + 0x978 (0x7f3f763dff48 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #5: c10d::ProcessGroupNCCL::Watchdog::run() + 0xd2 (0x7f3f763e1ec2 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xe6793 (0x7f3f59ba7793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x8609 (0x7f3fd3f23609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x43 (0x7f3fd405d353 in /lib/x86_64-linux-gnu/libc.so.6)
terminate called after throwing an instance of 'c10::DistBackendError'
what(): [PG ID 0 PG GUID 0(default_pg) Rank 0] Process group watchdog thread terminated with exception: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Exception raised from c10_cuda_check_implementation at /pytorch/c10/cuda/CUDAException.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7f3f75476eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x111c7 (0x7f3f755091c7 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x50 (0x7f3f763cd640 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0x68 (0x7f3f763dce28 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: c10d::ProcessGroupNCCL::Watchdog::runLoop() + 0x978 (0x7f3f763dff48 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #5: c10d::ProcessGroupNCCL::Watchdog::run() + 0xd2 (0x7f3f763e1ec2 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xe6793 (0x7f3f59ba7793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x8609 (0x7f3fd3f23609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x43 (0x7f3fd405d353 in /lib/x86_64-linux-gnu/libc.so.6)
Exception raised from run at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:2074 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7f3f75476eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0xe1c1a1 (0x7f3f763b91a1 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0x9468e6 (0x7f3f75ee38e6 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0xe6793 (0x7f3f59ba7793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #4: <unknown function> + 0x8609 (0x7f3fd3f23609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #5: clone + 0x43 (0x7f3fd405d353 in /lib/x86_64-linux-gnu/libc.so.6)
[rank1]:[E327 10:04:44.068762441 ProcessGroupNCCL.cpp:2068] [PG ID 0 PG GUID 0(default_pg) Rank 1] Process group watchdog thread terminated with exception: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Exception raised from c10_cuda_check_implementation at /pytorch/c10/cuda/CUDAException.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7fabe4408eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x111c7 (0x7fabe449b1c7 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x50 (0x7fabe535f640 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0x68 (0x7fabe536ee28 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: c10d::ProcessGroupNCCL::Watchdog::runLoop() + 0x978 (0x7fabe5371f48 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #5: c10d::ProcessGroupNCCL::Watchdog::run() + 0xd2 (0x7fabe5373ec2 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xe6793 (0x7fabc8b39793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x8609 (0x7fac42eb5609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x43 (0x7fac42fef353 in /lib/x86_64-linux-gnu/libc.so.6)
terminate called after throwing an instance of 'c10::DistBackendError'
what(): [PG ID 0 PG GUID 0(default_pg) Rank 1] Process group watchdog thread terminated with exception: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Exception raised from c10_cuda_check_implementation at /pytorch/c10/cuda/CUDAException.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7fabe4408eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x111c7 (0x7fabe449b1c7 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x50 (0x7fabe535f640 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0x68 (0x7fabe536ee28 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: c10d::ProcessGroupNCCL::Watchdog::runLoop() + 0x978 (0x7fabe5371f48 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #5: c10d::ProcessGroupNCCL::Watchdog::run() + 0xd2 (0x7fabe5373ec2 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xe6793 (0x7fabc8b39793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x8609 (0x7fac42eb5609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x43 (0x7fac42fef353 in /lib/x86_64-linux-gnu/libc.so.6)
Exception raised from run at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:2074 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7fabe4408eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0xe1c1a1 (0x7fabe534b1a1 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0x9468e6 (0x7fabe4e758e6 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0xe6793 (0x7fabc8b39793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #4: <unknown function> + 0x8609 (0x7fac42eb5609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #5: clone + 0x43 (0x7fac42fef353 in /lib/x86_64-linux-gnu/libc.so.6)
W0327 10:04:47.719000 1665985 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1666031 closing signal SIGTERM
E0327 10:04:47.872000 1665985 torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: -6) local_rank: 1 (pid: 1666032) of binary: /fsx/albert/dev/trl/.venv/bin/python3
Traceback (most recent call last):
File "/fsx/albert/dev/trl/.venv/bin/accelerate", line 10, in <module>
sys.exit(main())
File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main
args.func(args)
File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/accelerate/commands/launch.py", line 1392, in launch_command
multi_gpu_launcher(args)
File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/accelerate/commands/launch.py", line 1023, in multi_gpu_launcher
distrib_run.run(args)
File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
elastic_launch(
File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
trl/scripts/rloo.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
--------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2026-03-27_10:04:47
host : ip-26-0-172-142.ec2.internal
rank : 1 (local_rank: 1)
exitcode : -6 (pid: 1666032)
error_file: <N/A>
traceback : Signal 6 (SIGABRT) received by PID 1666032
============================================================
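CI fails for the distributed test `test_rloo[fsdp2]`: https://github.com/huggingface/trl/actions/runs/23628572685/job/68822753991
Both ranks abort during generation: `torch.multinomial` in transformers' `_sample` triggers a CUDA device-side assert because the probability tensor contains `inf`, `nan`, or an element < 0.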
Stacktrace: