Fix type annotation for _sym_get_coordinate (#177446)

aorenste · pytorchmergebot · commit b8d53c685bbb · 2026-03-19T02:24:02.000Z
Pull Request resolved: #177446
Approved by: https://github.com/Skylion007
ghstack dependencies: #172795
diff --git a/torch/distributed/device_mesh.py b/torch/distributed/device_mesh.py
@@ -12,6 +12,7 @@
 from torch.distributed import is_available
 from torch.distributed._mesh_layout import _MeshLayout
 from torch.distributed._pycute import IntTuple, is_int, suffix_product
+from torch.types import IntLikeType
 from torch.utils._typing_utils import not_none
 
 
@@ -1220,7 +1221,7 @@ def get_coordinate(self) -> tuple[int, ...] | None:
             """
             return self._coordinate_on_dim
 
-        def _sym_get_coordinate(self, index: int) -> int:
+        def _sym_get_coordinate(self, index: int) -> IntLikeType:
             import torch.distributed.config as config
             from torch._guards import detect_fake_mode
 
diff --git a/torch/distributed/tensor/_collective_utils.py b/torch/distributed/tensor/_collective_utils.py
@@ -24,6 +24,7 @@
     scatter,
     Work,
 )
+from torch.types import IntLikeType
 
 
 logger = logging.getLogger(__name__)
@@ -191,7 +192,9 @@ def pad_tensor(tensor: torch.Tensor, pad_dim: int, pad_size: int) -> torch.Tenso
 
 
 @maybe_run_for_local_tensor
-def unpad_tensor(tensor: torch.Tensor, pad_dim: int, pad_size: int) -> torch.Tensor:
+def unpad_tensor(
+    tensor: torch.Tensor, pad_dim: int, pad_size: IntLikeType
+) -> torch.Tensor:
     from torch.fx.experimental.symbolic_shapes import guard_or_false
 
     if guard_or_false(pad_size == 0):
diff --git a/torch/distributed/tensor/_random.py b/torch/distributed/tensor/_random.py
@@ -11,6 +11,7 @@
 from torch.distributed.device_mesh import _get_device_handle, DeviceMesh
 from torch.distributed.tensor._dtensor_spec import DTensorSpec
 from torch.distributed.tensor.placement_types import _StridedShard, Shard
+from torch.types import IntLikeType
 
 
 logger = getLogger(__name__)
@@ -391,8 +392,8 @@ def _compute_rng_offsets(self, spec: DTensorSpec) -> tuple[int, int]:
         return start_offset_incr, end_offset_incr
 
     def _calc_shard_linear_idx(
-        self, shard_coord: list[int], shard_size: list[int]
-    ) -> int:
+        self, shard_coord: Sequence[IntLikeType], shard_size: Sequence[IntLikeType]
+    ) -> IntLikeType:
         return _calc_shard_linear_idx(shard_coord, shard_size)
 
 
@@ -411,8 +412,8 @@ def _calc_first_shard_size(spec: DTensorSpec) -> list[int]:
 
 
 def _calc_shard_info(
-    mesh_coordinate: Sequence[int], spec: DTensorSpec
-) -> tuple[list[int], list[int]]:
+    mesh_coordinate: Sequence[IntLikeType], spec: DTensorSpec
+) -> tuple[list[IntLikeType], list[IntLikeType]]:
     mesh = spec.mesh
     # note: dim_map does not allow double sharding which is the FSDP(fully_shard)+TP
     # case. Replace the custom logic with dim_map once we support it.
@@ -436,10 +437,12 @@ def _calc_shard_info(
         raise AssertionError
     mesh_size = mesh.shape
     shard_idx_by_dim = []
-    total_num_shards_by_dim = []  # total number of shards on each tensor dim
+    total_num_shards_by_dim: list[
+        IntLikeType
+    ] = []  # total number of shards on each tensor dim
     for mesh_dim in dim_map:
-        shard_idx = 0
-        total_num_shards = 1
+        shard_idx: IntLikeType = 0
+        total_num_shards: IntLikeType = 1
         # the tensor dim is sharded on more than 1 mesh dim
         if isinstance(mesh_dim, list):
             rank_coord = [mesh_coordinate[d] for d in mesh_dim]
@@ -454,10 +457,12 @@ def _calc_shard_info(
     return shard_idx_by_dim, total_num_shards_by_dim
 
 
-def _calc_shard_linear_idx(shard_coord: list[int], shard_size: list[int]) -> int:
+def _calc_shard_linear_idx(
+    shard_coord: Sequence[IntLikeType], shard_size: Sequence[IntLikeType]
+) -> IntLikeType:
     # compute shard linear index
-    shard_linear_idx = 0
-    shard_coord_stride = 1
+    shard_linear_idx: IntLikeType = 0
+    shard_coord_stride: IntLikeType = 1
     for idx, size in zip(reversed(shard_coord), reversed(shard_size)):
         shard_linear_idx += idx * shard_coord_stride
         shard_coord_stride *= size
diff --git a/torch/distributed/tensor/_redistribute.py b/torch/distributed/tensor/_redistribute.py
@@ -31,6 +31,7 @@
     Replicate,
     Shard,
 )
+from torch.types import IntLikeType
 from torch.utils._debug_mode import get_active_debug_mode
 
 
@@ -144,7 +145,7 @@ class _TransformInfo:
     mesh_dim: int
     src_dst_placements: tuple[Placement, Placement]
     # logical_shape on this mesh dimension
-    logical_shape: list[int]
+    logical_shape: Sequence[IntLikeType]
 
     def __post_init__(self):
         if self.mesh_dim < 0:
@@ -1176,8 +1177,8 @@ def get_logical_shape(
         src_state: "DTensorRedistributePlanner.DistState",
         mesh_dim: int,
         full_tensor_shape: tuple[int, ...],
-    ) -> list[int]:
-        new_logical_shape = list(full_tensor_shape)
+    ) -> list[IntLikeType]:
+        new_logical_shape: list[IntLikeType] = list(full_tensor_shape)
         for entry in src_state.tensor_dim_to_mesh_dim:
             tensor_dim = entry.tensor_dim
             mesh_dims = entry.mesh_dims
diff --git a/torch/distributed/tensor/placement_types.py b/torch/distributed/tensor/placement_types.py
@@ -2,6 +2,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 
 import functools
+from collections.abc import Sequence
 from dataclasses import dataclass, field
 from typing import cast, TypeVar
 
@@ -21,6 +22,7 @@
     unpad_tensor,
 )
 from torch.distributed.tensor._ops._mask_buffer import MaskBuffer
+from torch.types import IntLikeType
 
 
 __all__ = ["Placement", "Shard", "Replicate", "Partial"]
@@ -211,7 +213,7 @@ def _custom_chunk(
     @staticmethod
     @maybe_run_for_local_tensor
     def local_shard_size_and_offset(
-        curr_local_size: int,
+        curr_local_size: IntLikeType,
         num_chunks: int,
         rank: _RankTypeT,
     ) -> tuple[_RankTypeT, _RankTypeT]:
@@ -392,7 +394,7 @@ def _reduce_shard_tensor(
     def _maybe_pad_tensor(
         self,
         local_tensor: torch.Tensor,
-        logical_dim_size: int,
+        logical_dim_size: IntLikeType,
         num_chunks: int,
     ) -> torch.Tensor:
         from torch.fx.experimental.symbolic_shapes import guard_or_true
@@ -414,7 +416,7 @@ def _maybe_pad_tensor(
     def _maybe_unpad_tensor(
         self,
         local_tensor: torch.Tensor,
-        logical_dim_size: int,
+        logical_dim_size: IntLikeType,
         num_chunks: int,
     ) -> torch.Tensor:
         from torch.fx.experimental.symbolic_shapes import guard_or_true
@@ -434,7 +436,7 @@ def _to_replicate_tensor(
         local_tensor: torch.Tensor,
         mesh: DeviceMesh,
         mesh_dim: int,
-        current_logical_shape: list[int],
+        current_logical_shape: Sequence[IntLikeType],
     ) -> torch.Tensor:
         """
         This function all_gather all shards and return a tensor that
@@ -462,7 +464,7 @@ def _replicate_to_shard(
         local_tensor: torch.Tensor,
         mesh: DeviceMesh,
         mesh_dim: int,
-        shard_index: int,
+        shard_index: IntLikeType,
     ) -> torch.Tensor:
         """
         transform from replicated tensor to a sharded tensor on
@@ -489,11 +491,11 @@ def _get_shard_pad_size(
 
     @staticmethod
     def _compute_padding_info(
-        current_logical_shape: list[int],
+        current_logical_shape: Sequence[IntLikeType],
         num_chunks: int,
         old_shard_dim: int,
         new_shard_dim: int,
-    ) -> tuple[bool, int, int, bool, int, int]:
+    ) -> tuple[bool, IntLikeType, int, bool, IntLikeType, int]:
         from torch.fx.experimental.symbolic_shapes import guard_or_true
 
         results = []
@@ -508,7 +510,7 @@ def _compute_padding_info(
     @staticmethod
     @maybe_run_for_local_tensor
     def _pad_for_new_shard_dim(
-        current_logical_shape: list[int],
+        current_logical_shape: Sequence[IntLikeType],
         local_tensor: torch.Tensor,
         num_chunks: int,
         old_shard_dim: int,
@@ -543,7 +545,7 @@ def _pad_for_new_shard_dim(
     @staticmethod
     @maybe_run_for_local_tensor
     def _unpad_for_new_shard_dim(
-        current_logical_shape: list[int],
+        current_logical_shape: Sequence[IntLikeType],
         local_tensor: torch.Tensor,
         num_chunks: int,
         old_shard_dim: int,
@@ -582,7 +584,7 @@ def _to_new_shard_dim(
         local_tensor: torch.Tensor,
         mesh: DeviceMesh,
         mesh_dim: int,
-        current_logical_shape: list[int],
+        current_logical_shape: Sequence[IntLikeType],
         new_shard_dim: int,
     ) -> torch.Tensor:
         """
@@ -857,7 +859,7 @@ def _select_split_tensor(
         self,
         tensor: torch.Tensor,
         num_chunks: int,
-        index: int,
+        index: IntLikeType,
         *,
         with_padding: bool = True,
         contiguous: bool = True,
@@ -891,7 +893,7 @@ def _to_replicate_tensor(
         local_tensor: torch.Tensor,
         mesh: DeviceMesh,
         mesh_dim: int,
-        current_logical_shape: list[int],
+        current_logical_shape: Sequence[IntLikeType],
     ) -> torch.Tensor:
         """
         Replay the replicate-to-shard process to understand how to stitch shards back.
@@ -1050,7 +1052,7 @@ def _replicate_to_strided_shard(
         local_tensor: torch.Tensor,
         mesh: DeviceMesh,
         mesh_dim: int,
-        shard_index: int,
+        shard_index: IntLikeType,
     ) -> torch.Tensor:
         """
         Transform from replicated tensor to a strided-sharded tensor on the current rank.
@@ -1097,7 +1099,7 @@ def _local_shard_size_and_offset(
     @maybe_run_for_local_tensor
     def local_shard_size_and_offset(
         self,
-        curr_local_size: int,
+        curr_local_size: IntLikeType,
         num_chunks: int,
         rank: RankType,
         return_first_offset: bool = True,
@@ -1384,7 +1386,9 @@ def __init__(
     @staticmethod
     @maybe_run_for_local_tensor
     def _mask_tensor(
-        tensor: torch.Tensor, local_offset_on_dim: int, local_shard_size: int
+        tensor: torch.Tensor,
+        local_offset_on_dim: IntLikeType,
+        local_shard_size: IntLikeType,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Build the input mask and save it for the current partial placement
         # this is so that the output of embedding op can reuse the same partial