@@ -3938,7 +3938,6 @@ def forward(self):
39383938 f"The { BACKEND } backend does not support DistributedDataParallel"
39393939 )
39403940 @skip_if_lt_x_gpu (int (os .environ ["WORLD_SIZE" ]))
3941- @skip_if_rocm
39423941 def test_DistributedDataParallel_non_default_stream (self ):
39433942 stream = torch .cuda .Stream (self .rank )
39443943 rank = self .rank
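
Each hunk in this commit deletes a @skip_if_rocm decorator, re-enabling the test that follows on ROCm builds. For reference, a minimal sketch of how such a guard is commonly implemented, assuming a TEST_WITH_ROCM flag; this is illustrative, not PyTorch's exact code.

    # Hedged sketch of a skip-on-ROCm guard; TEST_WITH_ROCM is an assumed
    # stand-in for the flag PyTorch derives from the build/environment.
    import functools
    import unittest

    TEST_WITH_ROCM = False

    def skip_if_rocm(func):
        """Skip the wrapped test when running on a ROCm build."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if TEST_WITH_ROCM:
                raise unittest.SkipTest("test skipped on ROCm")
            return func(*args, **kwargs)
        return wrapper
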
@@ -3977,7 +3976,6 @@ def test_DistributedDataParallel_non_default_stream(self):
39773976 f"The { BACKEND } backend does not support DDP communication hook on CUDA devices"
39783977 )
39793978 @skip_if_lt_x_gpu (int (os .environ ["WORLD_SIZE" ]))
3980- @skip_if_rocm
39813979 def test_ddp_comm_hook_logging (self ):
39823980 hooks = [
39833981 default .allreduce_hook ,
@@ -4171,7 +4169,6 @@ def _test_ddp_hook_with_optimizer_parity(
41714169 "Issues with async error handling, see https://github.com/pytorch/pytorch/issues/73259"
41724170 )
41734171 @skip_if_lt_x_gpu (2 )
4174- @skip_if_rocm
41754172 @parametrize ("grad_as_bucket_view" , [True , False ])
41764173 @parametrize ("static_graph" , [True , False ])
41774174 @parametrize ("optimize_subset" , [True , False ])
@@ -4199,7 +4196,6 @@ def test_ddp_hook_with_optimizer_parity_adamw(
41994196 "Issues with async error handling, see https://github.com/pytorch/pytorch/issues/73259"
42004197 )
42014198 @skip_if_lt_x_gpu (2 )
4202- @skip_if_rocm
42034199 @parametrize ("optimize_subset" , [True , False ])
42044200 def test_ddp_hook_with_optimizer_parity_adam (self , optimize_subset ):
42054201 adam_lr = 1e-2
@@ -4220,7 +4216,6 @@ def test_ddp_hook_with_optimizer_parity_adam(self, optimize_subset):
42204216 "Issues with async error handling, see https://github.com/pytorch/pytorch/issues/73259"
42214217 )
42224218 @skip_if_lt_x_gpu (2 )
4223- @skip_if_rocm
42244219 @parametrize ("optimize_subset" , [True , False ])
42254220 def test_ddp_hook_with_optimizer_parity_sgd (self , optimize_subset ):
42264221 sgd_lr = 1e-2
@@ -4298,7 +4293,6 @@ def _test_ddp_hook_parity(self, state, hook):
42984293 f"The { BACKEND } backend does not support DDP communication hook on CUDA devices"
42994294 )
43004295 @skip_if_lt_x_gpu (int (os .environ ["WORLD_SIZE" ]))
4301- @skip_if_rocm
43024296 def test_ddp_hook_parity_allreduce (self ):
43034297 self ._test_ddp_hook_parity (state = None , hook = default .allreduce_hook )
43044298
@@ -4307,7 +4301,6 @@ def test_ddp_hook_parity_allreduce(self):
43074301 f"The { BACKEND } backend does not support DDP communication hook on CUDA devices"
43084302 )
43094303 @skip_if_lt_x_gpu (int (os .environ ["WORLD_SIZE" ]))
4310- @skip_if_rocm
43114304 def test_ddp_hook_parity_allreduce_process_group (self ):
43124305 # process_group is passed in to both DDP and comm. hook
43134306 world_size = dist .get_world_size ()
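
The comment in this hunk notes that one process group is handed to both DDP and the communication hook. A hedged sketch of that pattern, where net and rank are assumed to exist; the same group object goes to DDP's process_group argument and to the hook's state argument, so the hook's allreduce runs over that group.

    # Illustrative sketch: register the built-in allreduce hook so it
    # reduces over an explicit process group, not the default world group.
    import torch.distributed as dist
    import torch.nn as nn
    from torch.distributed.algorithms.ddp_comm_hooks import default_hooks as default

    pg = dist.new_group(ranks=list(range(dist.get_world_size())))
    model = nn.parallel.DistributedDataParallel(
        net, device_ids=[rank], process_group=pg  # net, rank assumed defined
    )
    # allreduce_hook takes the process group as its state argument.
    model.register_comm_hook(state=pg, hook=default.allreduce_hook)
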
@@ -4321,7 +4314,6 @@ def test_ddp_hook_parity_allreduce_process_group(self):
43214314 f"The { BACKEND } backend does not support DDP communication hook on CUDA devices"
43224315 )
43234316 @skip_if_lt_x_gpu (int (os .environ ["WORLD_SIZE" ]))
4324- @skip_if_rocm
43254317 def test_ddp_hook_parity_powerSGD (self ):
43264318 for warm_start in [True , False ]:
43274319 powersgd_state = powerSGD .PowerSGDState (
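
The powerSGD test above loops over warm_start settings. A hedged sketch of the registration this exercises, with model assumed to be a DDP instance and the parameter values chosen for illustration: PowerSGD compresses gradients via low-rank approximation, and warm_start reuses the low-rank factors from the previous iteration.

    # Illustrative sketch of registering the PowerSGD communication hook.
    from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook as powerSGD

    state = powerSGD.PowerSGDState(
        process_group=None,          # None -> default world group
        matrix_approximation_rank=1,
        start_powerSGD_iter=2,       # plain allreduce for the first iterations
        warm_start=True,
    )
    model.register_comm_hook(state, powerSGD.powerSGD_hook)  # model assumed
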
@@ -4344,7 +4336,6 @@ def test_ddp_hook_parity_powerSGD(self):
         don't support multiprocessing with spawn start method",
     )
     @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
-    @skip_if_rocm
     def test_ddp_hook_parity_post_localSGD(self):
         # Although we start running local SGD at iteration 10, since we still use the global process group to run it,
         # post-LocalSGD still allreduces gradients globally for the remaining iterations.
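
A hedged sketch of the setup that comment describes, with model assumed to be a DDP instance: before start_localSGD_iter the hook allreduces over process_group; afterwards it averages within subgroup, and passing the global group as the subgroup means gradients are still reduced globally.

    # Illustrative sketch of the post-localSGD communication hook.
    import torch.distributed as dist
    from torch.distributed.algorithms.ddp_comm_hooks import post_localSGD_hook as post_localSGD

    global_pg = dist.new_group(ranks=list(range(dist.get_world_size())))
    state = post_localSGD.PostLocalSGDState(
        process_group=None,      # None -> default group for the allreduce phase
        subgroup=global_pg,      # "local" averaging still spans all ranks here
        start_localSGD_iter=10,
    )
    model.register_comm_hook(state, post_localSGD.post_localSGD_hook)
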
@@ -7191,14 +7182,12 @@ def _test_compute_bucket_assignment_by_size(self, use_logger):
     @require_backend(DistTestCases.backend_feature["gpu"])
     @require_backends_available(DistTestCases.backend_feature["gpu"])
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_compute_bucket_assignment_by_size_sparse_error_without_logger(self):
         self._test_compute_bucket_assignment_by_size(use_logger=False)

     @require_backend(DistTestCases.backend_feature["gpu"])
     @require_backends_available(DistTestCases.backend_feature["gpu"])
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_compute_bucket_assignment_by_size_sparse_error_with_logger(self):
         self._test_compute_bucket_assignment_by_size(use_logger=True)

@@ -7283,14 +7272,12 @@ def _test_verify_model_across_rank(self, use_logger):
     @require_backend(DistTestCases.backend_feature["gpu"])
     @require_backends_available(DistTestCases.backend_feature["gpu"])
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_verify_model_across_rank_with_logger(self):
         self._test_verify_model_across_rank(use_logger=True)

     @require_backend(DistTestCases.backend_feature["gpu"])
     @require_backends_available(DistTestCases.backend_feature["gpu"])
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_verify_model_across_rank_without_logger(self):
         self._test_verify_model_across_rank(use_logger=False)

@@ -7314,7 +7301,6 @@ def _run_test_ddp_model_with_diff_params(self, ctx, net, ddp_group, group_gloo):
     @require_backend(DistTestCases.backend_feature["gpu"])
     @require_backends_available(DistTestCases.backend_feature["gpu"])
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_model_diff_shape_across_ranks(self):
         group_gloo = dist.new_group(
             timeout=timedelta(seconds=60), backend=dist.Backend.GLOO
@@ -7337,7 +7323,6 @@ def test_ddp_model_diff_shape_across_ranks(self):
     @require_backend(DistTestCases.backend_feature["gpu"])
     @require_backends_available(DistTestCases.backend_feature["gpu"])
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_model_diff_num_params_across_ranks(self):
         group_gloo = dist.new_group(
             timeout=timedelta(seconds=60), backend=dist.Backend.GLOO
@@ -7679,7 +7664,6 @@ def _test_monitored_barrier_allreduce_hang(self, wait_all_ranks):
     @with_nccl_blocking_wait
     @require_backend(DistTestCases.backend_feature["gpu"])
     @require_backends_available(DistTestCases.backend_feature["gpu"])
-    @skip_if_rocm
     @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
     def test_monitored_barrier_allreduce_hang(self):
         # tests expected behavior when nonzero rank hangs and we want to
@@ -7689,7 +7673,6 @@ def test_monitored_barrier_allreduce_hang(self):
     @with_nccl_blocking_wait
     @require_backend(DistTestCases.backend_feature["gpu"])
     @require_backends_available(DistTestCases.backend_feature["gpu"])
-    @skip_if_rocm
     @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
     def test_monitored_barrier_allreduce_hang_wait_all_ranks(self):
         # tests expected behavior when nonzero rank hangs and we want to
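
A hedged sketch of what these two monitored-barrier tests exercise, with the choice of rank 1 as the hanging rank an assumption for illustration: a rank that skips the barrier simulates a hang, and wait_all_ranks=True makes the barrier collect every missing rank before raising instead of failing on the first one.

    # Illustrative sketch of dist.monitored_barrier with a simulated hang.
    from datetime import timedelta
    import torch.distributed as dist

    if dist.get_rank() != 1:  # rank 1 "hangs" by never reaching the barrier
        dist.monitored_barrier(timeout=timedelta(seconds=10), wait_all_ranks=True)
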
@@ -8024,7 +8007,6 @@ def test_ddp_inference(self):
80248007 f"The { BACKEND } backend does not support DistributedDataParallel"
80258008 )
80268009 @skip_if_lt_x_gpu (2 )
8027- @skip_if_rocm
80288010 def test_ddp_sync_bn_training_vs_eval (self ):
80298011 rank = self .rank
80308012 torch .cuda .set_device (rank )
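
A hedged sketch of the behavior this last test targets, with model and rank assumed to exist: SyncBatchNorm synchronizes batch statistics across ranks only in training mode, while in eval mode it falls back to the stored running statistics with no communication.

    # Illustrative sketch of SyncBatchNorm under DDP in train vs. eval mode.
    import torch.nn as nn

    model = nn.SyncBatchNorm.convert_sync_batchnorm(model)  # model assumed
    model = nn.parallel.DistributedDataParallel(model.cuda(rank), device_ids=[rank])
    model.train()  # forward passes allreduce mean/var across ranks
    model.eval()   # forward passes use local running stats, no sync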