Skip to content

CI fails for distributed test_rloo[fsdp2]: CUDA error: device-side assert triggered: probability tensor contains either inf, nan or element < 0 #5386

@albertvillanova

Description

@albertvillanova

CI fails for the distributed test `test_rloo[fsdp2]`; see the failing job: https://github.com/huggingface/trl/actions/runs/23628572685/job/68822753991

torch.AcceleratorError: CUDA error: device-side assert triggered

probability tensor contains either inf, nan or element < 0

/pytorch/aten/src/ATen/native/cuda/TensorCompare.cu:112: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.
FAILED tests/distributed/test_distributed.py::TestDistributed::test_rloo[fsdp2] - AssertionError: assert 1 == 0
 +  where 1 = CompletedProcess(args=['accelerate', 'launch', '--config_file', PosixPath('/tmp/pytest-of-root/pytest-0/test_rloo_fsdp2_0/data/accelerate_configs/fsdp2.yaml'), 'trl/scripts/rloo.py', '--output_dir', '/tmp/pytest-of-root/pytest-0/test_rloo_fsdp2_0', '--model_name_or_path', 'trl-internal-testing/tiny-Qwen2ForCausalLM-2.5', '--dataset_name', 'trl-internal-testing/zen', '--dataset_config', 'conversational_prompt_only', '--reward_model_name_or_path', 'trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5'], returncode=1).returncode

Stacktrace:

_______________________ TestDistributed.test_rloo[fsdp2] _______________________

self = <tests.distributed.test_distributed.TestDistributed object at 0x7ff53a844ad0>
config = 'fsdp2'
get_config_path = <function get_config_path.<locals>._get_config_path at 0x7ff53a50fd80>

    @pytest.mark.parametrize(
        "config",
        [
            "ddp",
            pytest.param(
                "zero2",
                marks=pytest.mark.xfail(
                    Version(transformers.__version__) == Version("5.1.0"),
                    reason="Upstream incompatibility: deepspeed and transformers==5.1.0 (see transformers#43780)",
                ),
            ),
            pytest.param("zero3", marks=pytest.mark.xfail(reason="ZeRO 3 is currently failing, see #4899")),
            "fsdp2",
        ],
    )
    def test_rloo(self, config, get_config_path):
        # fmt: off
>       run_command(
            [
                "accelerate", "launch", "--config_file", get_config_path(config), "trl/scripts/rloo.py",
                "--output_dir", self.tmp_dir,
                "--model_name_or_path", "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
                "--dataset_name", "trl-internal-testing/zen",
                "--dataset_config", "conversational_prompt_only",
                "--reward_model_name_or_path", "trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
            ],
            os.environ.copy(),
        )

tests/distributed/test_distributed.py:242: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

command = ['accelerate', 'launch', '--config_file', PosixPath('/tmp/pytest-of-root/pytest-0/test_rloo_fsdp2_0/data/accelerate_configs/fsdp2.yaml'), 'trl/scripts/rloo.py', '--output_dir', ...]
env = {'ACTIONS_ORCHESTRATION_ID': '34e8d70d-b149-49b6-baf2-06c01e941646.distributed_smoke.__default', 'CI': 'true', 'CI_SLACK_CHANNEL': '***', 'CUDA_VERSION': '12.8.1', ...}

    def run_command(command: list[str], env: dict[str, str]) -> None:
        result = subprocess.run(command, env=env, cwd=ROOT)
>       assert result.returncode == 0
E       AssertionError: assert 1 == 0
E        +  where 1 = CompletedProcess(args=['accelerate', 'launch', '--config_file', PosixPath('/tmp/pytest-of-root/pytest-0/test_rloo_fsdp2_0/data/accelerate_configs/fsdp2.yaml'), 'trl/scripts/rloo.py', '--output_dir', '/tmp/pytest-of-root/pytest-0/test_rloo_fsdp2_0', '--model_name_or_path', 'trl-internal-testing/tiny-Qwen2ForCausalLM-2.5', '--dataset_name', 'trl-internal-testing/zen', '--dataset_config', 'conversational_prompt_only', '--reward_model_name_or_path', 'trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5'], returncode=1).returncode

tests/distributed/test_distributed.py:32: AssertionError
----------------------------- Captured stderr call -----------------------------
/pytorch/aten/src/ATen/native/cuda/TensorCompare.cu:112: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.
[rank1]: Traceback (most recent call last):
[rank1]:   File "/fsx/albert/dev/trl/trl/scripts/rloo.py", line 169, in <module>
[rank1]:     main(script_args, training_args, model_args, dataset_args)
[rank1]:   File "/fsx/albert/dev/trl/trl/scripts/rloo.py", line 141, in main
[rank1]:     trainer.train()
[rank1]:   File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1424, in train
[rank1]:     return inner_training_loop(
[rank1]:   File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1506, in _inner_training_loop
[rank1]:     self._run_epoch(
[rank1]:   File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1734, in _run_epoch
[rank1]:     tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
[rank1]:   File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 748, in training_step
[rank1]:     output = super().training_step(model, inputs, num_items_in_batch)
[rank1]:   File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1900, in training_step
[rank1]:     inputs = self._prepare_inputs(inputs)
[rank1]:   File "/fsx/albert/dev/trl/trl/extras/profiling.py", line 202, in wrapper
[rank1]:     return func(self, *args, **kwargs)
[rank1]:   File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 777, in _prepare_inputs
[rank1]:     generation_batch = self._generate_and_score_completions(generation_batch)
[rank1]:   File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1118, in _generate_and_score_completions
[rank1]:     prompt_ids_list, completion_ids_list, completions = self._generate(prompts)
[rank1]:   File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1043, in _generate
[rank1]:     completion_ids = self._generate_single_turn(prompt_ids, images, multimodal_fields)
[rank1]:   File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1016, in _generate_single_turn
[rank1]:     prompt_completion_ids = unwrapped_model.generate(
[rank1]:   File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/distributed/fsdp/_fully_shard/_fully_shard.py", line 658, in wrapped_method
[rank1]:     out = orig_method(*args, **kwargs)
[rank1]:   File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
[rank1]:     return func(*args, **kwargs)
[rank1]:   File "/fsx/albert/dev/transformers/src/transformers/generation/utils.py", line 2555, in generate
[rank1]:     result = decoding_method(
[rank1]:   File "/fsx/albert/dev/transformers/src/transformers/generation/utils.py", line 2803, in _sample
[rank1]:     next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
[rank1]: torch.AcceleratorError: CUDA error: device-side assert triggered
[rank1]: Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

[rank0]: Traceback (most recent call last):
[rank0]:   File "/fsx/albert/dev/trl/trl/scripts/rloo.py", line 169, in <module>
[rank0]:     main(script_args, training_args, model_args, dataset_args)
[rank0]:   File "/fsx/albert/dev/trl/trl/scripts/rloo.py", line 141, in main
[rank0]:     trainer.train()
[rank0]:   File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1424, in train
[rank0]:     return inner_training_loop(
[rank0]:   File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1506, in _inner_training_loop
[rank0]:     self._run_epoch(
[rank0]:   File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1734, in _run_epoch
[rank0]:     tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
[rank0]:   File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 748, in training_step
[rank0]:     output = super().training_step(model, inputs, num_items_in_batch)
[rank0]:   File "/fsx/albert/dev/transformers/src/transformers/trainer.py", line 1900, in training_step
[rank0]:     inputs = self._prepare_inputs(inputs)
[rank0]:   File "/fsx/albert/dev/trl/trl/extras/profiling.py", line 202, in wrapper
[rank0]:     return func(self, *args, **kwargs)
[rank0]:   File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 777, in _prepare_inputs
[rank0]:     generation_batch = self._generate_and_score_completions(generation_batch)
[rank0]:   File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1118, in _generate_and_score_completions
[rank0]:     prompt_ids_list, completion_ids_list, completions = self._generate(prompts)
[rank0]:   File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1043, in _generate
[rank0]:     completion_ids = self._generate_single_turn(prompt_ids, images, multimodal_fields)
[rank0]:   File "/fsx/albert/dev/trl/trl/trainer/rloo_trainer.py", line 1016, in _generate_single_turn
[rank0]:     prompt_completion_ids = unwrapped_model.generate(
[rank0]:   File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/distributed/fsdp/_fully_shard/_fully_shard.py", line 658, in wrapped_method
[rank0]:     out = orig_method(*args, **kwargs)
[rank0]:   File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
[rank0]:     return func(*args, **kwargs)
[rank0]:   File "/fsx/albert/dev/transformers/src/transformers/generation/utils.py", line 2555, in generate
[rank0]:     result = decoding_method(
[rank0]:   File "/fsx/albert/dev/transformers/src/transformers/generation/utils.py", line 2803, in _sample
[rank0]:     next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
[rank0]: torch.AcceleratorError: CUDA error: device-side assert triggered
[rank0]: Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

[rank0]:[E327 10:04:44.068014116 ProcessGroupNCCL.cpp:2068] [PG ID 0 PG GUID 0(default_pg) Rank 0] Process group watchdog thread terminated with exception: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Exception raised from c10_cuda_check_implementation at /pytorch/c10/cuda/CUDAException.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7f3f75476eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x111c7 (0x7f3f755091c7 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x50 (0x7f3f763cd640 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0x68 (0x7f3f763dce28 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: c10d::ProcessGroupNCCL::Watchdog::runLoop() + 0x978 (0x7f3f763dff48 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #5: c10d::ProcessGroupNCCL::Watchdog::run() + 0xd2 (0x7f3f763e1ec2 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xe6793 (0x7f3f59ba7793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x8609 (0x7f3fd3f23609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x43 (0x7f3fd405d353 in /lib/x86_64-linux-gnu/libc.so.6)

terminate called after throwing an instance of 'c10::DistBackendError'
  what():  [PG ID 0 PG GUID 0(default_pg) Rank 0] Process group watchdog thread terminated with exception: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Exception raised from c10_cuda_check_implementation at /pytorch/c10/cuda/CUDAException.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7f3f75476eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x111c7 (0x7f3f755091c7 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x50 (0x7f3f763cd640 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0x68 (0x7f3f763dce28 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: c10d::ProcessGroupNCCL::Watchdog::runLoop() + 0x978 (0x7f3f763dff48 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #5: c10d::ProcessGroupNCCL::Watchdog::run() + 0xd2 (0x7f3f763e1ec2 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xe6793 (0x7f3f59ba7793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x8609 (0x7f3fd3f23609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x43 (0x7f3fd405d353 in /lib/x86_64-linux-gnu/libc.so.6)

Exception raised from run at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:2074 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7f3f75476eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0xe1c1a1 (0x7f3f763b91a1 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0x9468e6 (0x7f3f75ee38e6 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0xe6793 (0x7f3f59ba7793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #4: <unknown function> + 0x8609 (0x7f3fd3f23609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #5: clone + 0x43 (0x7f3fd405d353 in /lib/x86_64-linux-gnu/libc.so.6)

[rank1]:[E327 10:04:44.068762441 ProcessGroupNCCL.cpp:2068] [PG ID 0 PG GUID 0(default_pg) Rank 1] Process group watchdog thread terminated with exception: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Exception raised from c10_cuda_check_implementation at /pytorch/c10/cuda/CUDAException.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7fabe4408eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x111c7 (0x7fabe449b1c7 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x50 (0x7fabe535f640 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0x68 (0x7fabe536ee28 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: c10d::ProcessGroupNCCL::Watchdog::runLoop() + 0x978 (0x7fabe5371f48 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #5: c10d::ProcessGroupNCCL::Watchdog::run() + 0xd2 (0x7fabe5373ec2 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xe6793 (0x7fabc8b39793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x8609 (0x7fac42eb5609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x43 (0x7fac42fef353 in /lib/x86_64-linux-gnu/libc.so.6)

terminate called after throwing an instance of 'c10::DistBackendError'
  what():  [PG ID 0 PG GUID 0(default_pg) Rank 1] Process group watchdog thread terminated with exception: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Exception raised from c10_cuda_check_implementation at /pytorch/c10/cuda/CUDAException.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7fabe4408eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x111c7 (0x7fabe449b1c7 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x50 (0x7fabe535f640 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0x68 (0x7fabe536ee28 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: c10d::ProcessGroupNCCL::Watchdog::runLoop() + 0x978 (0x7fabe5371f48 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #5: c10d::ProcessGroupNCCL::Watchdog::run() + 0xd2 (0x7fabe5373ec2 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xe6793 (0x7fabc8b39793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x8609 (0x7fac42eb5609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x43 (0x7fac42fef353 in /lib/x86_64-linux-gnu/libc.so.6)

Exception raised from run at /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:2074 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7fabe4408eb0 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0xe1c1a1 (0x7fabe534b1a1 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0x9468e6 (0x7fabe4e758e6 in /fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0xe6793 (0x7fabc8b39793 in /lib/x86_64-linux-gnu/libstdc++.so.6)
frame #4: <unknown function> + 0x8609 (0x7fac42eb5609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #5: clone + 0x43 (0x7fac42fef353 in /lib/x86_64-linux-gnu/libc.so.6)

W0327 10:04:47.719000 1665985 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1666031 closing signal SIGTERM
E0327 10:04:47.872000 1665985 torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: -6) local_rank: 1 (pid: 1666032) of binary: /fsx/albert/dev/trl/.venv/bin/python3
Traceback (most recent call last):
  File "/fsx/albert/dev/trl/.venv/bin/accelerate", line 10, in <module>
    sys.exit(main())
  File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main
    args.func(args)
  File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/accelerate/commands/launch.py", line 1392, in launch_command
    multi_gpu_launcher(args)
  File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/accelerate/commands/launch.py", line 1023, in multi_gpu_launcher
    distrib_run.run(args)
  File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
    elastic_launch(
  File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/fsx/albert/dev/trl/.venv/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
trl/scripts/rloo.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
--------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2026-03-27_10:04:47
  host      : ip-26-0-172-142.ec2.internal
  rank      : 1 (local_rank: 1)
  exitcode  : -6 (pid: 1666032)
  error_file: <N/A>
  traceback : Signal 6 (SIGABRT) received by PID 1666032
============================================================

Metadata

Metadata

Labels

No labels

Type

No fields configured for Bug.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions