Skip to content

Commit fe43676

Browse files
committed
Update on "Don't truncate leading 1s if they are unbacked"
This prevents us from guarding on leading unbacked SymInts. In the previous attempt at #94521 I got the logic a bit wrong. My idea there was to avoid slicing when the values to be set have low enough dimensionality that they definitely aren't too long. To do this, I need to compute the difference between the data to be set, and the post-slice space for the values. But I incorrectly compared against the *pre-slice* space in the original PR. Another version of this PR which is wrong is to compare against variableIndices.size(); but remember that in advanced indexing with tensors/lists, each of the individual indices specify what coordinates to read out of each dimension! A third incorrect attempt tested `variableIndices[0].dim()`, which is only correct if you don't broadcast one of the later variable indices, and if there are enough variableIndices to cover all dims. This is all quite complicated, so I went for a simpler solution of checking if the leading dim had a hint before testing if it is not equal to one. BTW, there was previously no test for this leading-1-stripping behavior. There is now a test for this, based off the real code that caused the problem. Signed-off-by: Edward Z. Yang <ezyang@meta.com> [ghstack-poisoned]
2 parents eb67c45 + 09f4a2d commit fe43676

27 files changed

Lines changed: 624 additions & 139 deletions

.ci/pytorch/test.sh

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ test_single_dynamo_benchmark() {
284284
# Feel free to remove --device cuda if you ever decide to need to
285285
# test CPU as well in CI
286286
python "benchmarks/dynamo/$suite.py" \
287-
--ci --accuracy --timing --explain --device cuda \
287+
--ci --accuracy --timing --explain \
288288
"$@" "${partition_flags[@]}" \
289289
--output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
290290
python benchmarks/dynamo/check_csv.py \
@@ -297,10 +297,10 @@ test_aot_eager_benchmark() {
297297
local exit_status=0
298298

299299
# Check inference with --float32
300-
test_single_dynamo_benchmark "aot_eager_inference" "$@" --backend aot_eager || exit_status=$?
300+
test_single_dynamo_benchmark "aot_eager_inference" "$@" --backend aot_eager --device cuda || exit_status=$?
301301

302302
# Check training with --amp
303-
test_single_dynamo_benchmark "aot_eager_training" "$@" --backend aot_eager --training --amp || exit_status=$?
303+
test_single_dynamo_benchmark "aot_eager_training" "$@" --backend aot_eager --device cuda --training --amp || exit_status=$?
304304

305305
if [[ $exit_status -ne 0 ]]; then
306306
echo "Some benchmarks failed; scroll up for details"
@@ -311,14 +311,22 @@ test_aot_eager_benchmark() {
311311
test_inductor_benchmark() {
312312
# Usage: test_dynamo_benchmark huggingface 0
313313

314-
# Check inference with --float32
315-
test_single_dynamo_benchmark "inductor_inference" "$@" --inductor
314+
local device="$1"
315+
shift
316316

317-
# Check training with --amp
318-
test_single_dynamo_benchmark "inductor_training" "$@" --inductor --training --amp
317+
if [[ $device == "cpu" ]]; then
318+
# TODO: Add training and dynamic shape test
319+
test_single_dynamo_benchmark "inductor_inference" "$@" --inductor --float32 --device cpu
320+
else
321+
# Check inference with --float32
322+
test_single_dynamo_benchmark "inductor_inference" "$@" --inductor --device cuda
319323

320-
# Check inference with --dynamic-shapes
321-
test_single_dynamo_benchmark "dynamic_inductor-inference" "$@" --inductor --dynamic-shapes
324+
# Check training with --amp
325+
test_single_dynamo_benchmark "inductor_training" "$@" --inductor --training --amp --device cuda
326+
327+
# Check inference with --dynamic-shapes
328+
test_single_dynamo_benchmark "dynamic_inductor-inference" "$@" --inductor --dynamic-shapes --device cuda
329+
fi
322330
}
323331

324332
test_inductor_benchmark_perf() {
@@ -371,7 +379,9 @@ test_aot_eager_all() {
371379
}
372380

373381
test_inductor_huggingface() {
374-
test_inductor_benchmark huggingface ""
382+
local device=$1
383+
shift
384+
test_inductor_benchmark "$device" huggingface ""
375385
}
376386

377387
test_inductor_huggingface_perf() {
@@ -383,7 +393,9 @@ test_inductor_timm_shard() {
383393
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
384394
exit 1
385395
fi
386-
test_inductor_benchmark timm_models "$1"
396+
local device=$1
397+
shift
398+
test_inductor_benchmark "$device" timm_models "$1"
387399
}
388400

389401
test_inductor_timm_perf_shard() {
@@ -395,7 +407,9 @@ test_inductor_timm_perf_shard() {
395407
}
396408

397409
test_inductor_torchbench() {
398-
PYTHONPATH=$(pwd)/torchbench test_inductor_benchmark torchbench ""
410+
local device=$1
411+
shift
412+
PYTHONPATH=$(pwd)/torchbench test_inductor_benchmark "$device" torchbench ""
399413
}
400414

401415
test_inductor_torchbench_perf() {
@@ -917,38 +931,54 @@ elif [[ "${TEST_CONFIG}" == *aot_eager_torchbench* ]]; then
917931
elif [[ "${TEST_CONFIG}" == *inductor_huggingface* ]]; then
918932
install_torchvision
919933
install_filelock
920-
install_triton
934+
if [[ "${TEST_CONFIG}" != *inductor_huggingface_cpu_accuracy* ]]; then
935+
# Cpp backend does not depend on triton
936+
install_triton
937+
fi
921938
install_huggingface
922939
if [[ "${TEST_CONFIG}" == *inductor_huggingface_perf* ]]; then
923940
test_inductor_huggingface_perf
941+
elif [[ "${TEST_CONFIG}" == *inductor_huggingface_cpu_accuracy* ]]; then
942+
test_inductor_huggingface cpu
924943
else
925-
test_inductor_huggingface
944+
test_inductor_huggingface cuda
926945
fi
927946
elif [[ "${TEST_CONFIG}" == *inductor_timm* && $NUM_TEST_SHARDS -gt 1 ]]; then
928947
install_torchvision
929948
install_filelock
930-
install_triton
949+
if [[ "${TEST_CONFIG}" != *inductor_timm_cpu_accuracy* ]]; then
950+
# Cpp backend does not depend on triton
951+
install_triton
952+
fi
931953
install_timm
932954
id=$((SHARD_NUMBER-1))
933955
if [[ "${TEST_CONFIG}" == *inductor_timm_perf* && $NUM_TEST_SHARDS -gt 1 ]]; then
934956
test_inductor_timm_perf_shard $id
957+
elif [[ "${TEST_CONFIG}" == *inductor_timm_cpu_accuracy* && $NUM_TEST_SHARDS -gt 1 ]]; then
958+
test_inductor_timm_shard cpu $id
935959
else
936-
test_inductor_timm_shard $id
960+
test_inductor_timm_shard cuda $id
937961
fi
938962
elif [[ "${TEST_CONFIG}" == *inductor_torchbench* ]]; then
939963
install_torchtext
940964
install_torchvision
941965
install_filelock
942-
install_triton
966+
if [[ "${TEST_CONFIG}" != *inductor_torchbench_cpu_accuracy* ]]; then
967+
# Cpp backend does not depend on triton
968+
install_triton
969+
fi
943970
if [[ "${TEST_CONFIG}" == *inductor_torchbench_perf* ]]; then
944971
checkout_install_torchbench
945972
test_inductor_torchbench_perf
973+
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_accuracy* ]]; then
974+
checkout_install_torchbench
975+
test_inductor_torchbench cpu
946976
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
947977
checkout_install_torchbench hf_Bert hf_Albert timm_efficientdet timm_vision_transformer
948978
test_inductor_torchbench_smoketest_perf
949979
else
950980
checkout_install_torchbench
951-
test_inductor_torchbench
981+
test_inductor_torchbench cuda
952982
fi
953983
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
954984
install_torchvision

.github/workflows/inductor.yml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,26 @@ jobs:
6161
docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-inductor-build-gcp.outputs.docker-image }}
6262
test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-inductor-build-gcp.outputs.test-matrix }}
6363
use-gha: anything-non-empty-to-use-gha
64+
65+
linux-focal-cpu-py3_8-gcc7-inductor-build:
66+
name: linux-focal-cpu-py3.8-gcc7-inductor
67+
uses: ./.github/workflows/_linux-build.yml
68+
with:
69+
build-environment: linux-focal-py3_8-gcc7-build
70+
docker-image-name: pytorch-linux-focal-py3.8-gcc7
71+
test-matrix: |
72+
{ include: [
73+
{ config: "inductor_huggingface_cpu_accuracy", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
74+
{ config: "inductor_timm_cpu_accuracy", shard: 1, num_shards: 2, runner: "linux.4xlarge" },
75+
{ config: "inductor_timm_cpu_accuracy", shard: 2, num_shards: 2, runner: "linux.4xlarge" },
76+
{ config: "inductor_torchbench_cpu_accuracy", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
77+
]}
78+
79+
linux-focal-cpu-py3_8-gcc7-inductor-test:
80+
name: linux-focal-cpu-py3.8-gcc7-inductor
81+
uses: ./.github/workflows/_linux-test.yml
82+
needs: linux-focal-cpu-py3_8-gcc7-inductor-build
83+
with:
84+
build-environment: linux-focal-py3_8-gcc7-build
85+
docker-image: ${{ needs.linux-focal-cpu-py3_8-gcc7-inductor-build.outputs.docker-image }}
86+
test-matrix: ${{ needs.linux-focal-cpu-py3_8-gcc7-inductor-build.outputs.test-matrix }}

aten/src/ATen/native/BucketizationUtils.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,13 @@ inline void searchsorted_pre_check(
134134

135135
TORCH_CHECK(sorter.scalar_type() == ScalarType::Long, "torch.searchsorted(): sorter must be a tensor of long ",
136136
"dtype but got dtype ", sorter.scalar_type());
137+
138+
if (sorter.numel() > 0) {
139+
auto minmax = sorter.aminmax();
140+
int64_t vmin = std::get<0>(minmax).item().toLong();
141+
int64_t vmax = std::get<1>(minmax).item().toLong();
142+
TORCH_CHECK(vmin >= 0 && vmax < sorter.sizes().back(), "torch.searchsorted(): sorter index out of range");
143+
}
137144
}
138145

139146
TORCH_CHECK(input.dim() > 0 || (input.dim() == 0 && input.numel() == 1 && boundaries.dim() == 1),

aten/src/ATen/native/TensorFactories.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include <ATen/ops/empty_like.h>
4747
#include <ATen/ops/empty_like_native.h>
4848
#include <ATen/ops/empty_native.h>
49+
#include <ATen/ops/empty_permuted_native.h>
4950
#include <ATen/ops/empty_strided.h>
5051
#include <ATen/ops/empty_strided_native.h>
5152
#include <ATen/ops/eye.h>
@@ -278,6 +279,45 @@ Tensor empty_names(
278279
return result;
279280
}
280281

282+
Tensor empty_permuted_symint(SymIntArrayRef size, IntArrayRef physical_layout, c10::optional<ScalarType> dtype_opt,
283+
c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt
284+
) {
285+
// size is logical; aka, the output size you'll get from the operation overall
286+
//
287+
// physical_layout follows NCHW/NHWC convention:
288+
// contiguous is [0,1,2,3], channels last is [0,2,3,1]
289+
//
290+
// this means if i is physical index, physical_layout[i] is logical index;
291+
// e.g., to find what is innermost physical dim (3), query NHWC[3] == 1
292+
// (aka it is channels)
293+
int64_t dim = static_cast<int64_t>(size.size());
294+
SymDimVector phys_size(dim);
295+
TORCH_CHECK(physical_layout.size() == dim,
296+
"Number of dimensions in size does not match the "
297+
"length of the physical_layout; i.e. len(size) = ", dim,
298+
" is not equal to len(physical_layout) = ", physical_layout.size());
299+
std::vector<bool> seen_dims(dim);
300+
for (const auto i : c10::irange(dim)) {
301+
TORCH_CHECK(physical_layout[i] >= 0 && physical_layout[i] < dim,
302+
"Dimension out of range (expected to be between 0 and ", dim - 1, ", but got ",
303+
physical_layout[i], " at index ", i, "). NB: negative dims "
304+
"not currently supported; file an issue if you want it.");
305+
TORCH_CHECK(!seen_dims[physical_layout[i]], "Duplicate dim not allowed");
306+
phys_size[i] = size[physical_layout[i]];
307+
seen_dims[physical_layout[i]] = true;
308+
}
309+
// do a contiguous allocation
310+
Tensor phys_tensor = at::empty_symint(phys_size, dtype_opt, layout_opt, device_opt, pin_memory_opt, c10::nullopt);
311+
SymIntArrayRef phys_strides = phys_tensor.sym_strides();
312+
// permute the strides (inverse permutation! This is why this is
313+
// empty_permute*d*, not empty_permute; it's not an empty + permute)
314+
SymDimVector strides(dim);
315+
for (const auto i : c10::irange(dim)) {
316+
strides[physical_layout[i]] = phys_strides[i];
317+
}
318+
return phys_tensor.as_strided_symint(size, strides);
319+
}
320+
281321
Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, c10::optional<ScalarType> dtype_opt,
282322
c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt) {
283323
return at::detail::empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt);

aten/src/ATen/native/native_functions.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2241,6 +2241,11 @@
22412241
SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
22422242
QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized
22432243

2244+
- func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
2245+
dispatch:
2246+
CompositeExplicitAutograd: empty_permuted_symint
2247+
autogen: empty_permuted.out
2248+
22442249
# We do not make new_empty a composite that calls into new_empty_strided, as the strided version
22452250
# is significantly more difficult to implement by different backends
22462251
- func: new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

benchmarks/dynamo/common.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class CI(NamedTuple):
6464
backend: str # aot_eager or inductor
6565
training: bool
6666
dynamic: bool = False
67+
device: str = "cuda"
6768

6869

6970
CI_SKIP = collections.defaultdict(list)
@@ -146,6 +147,35 @@ class CI(NamedTuple):
146147
"gluon_xception65", # accuracy https://github.com/pytorch/pytorch/issues/93847
147148
]
148149

150+
CI_SKIP[CI("inductor", training=False, device="cpu")] = [
151+
# TorchBench
152+
"drq", # Need to update torchbench
153+
"detectron2_fasterrcnn_r_101_c4",
154+
"detectron2_fasterrcnn_r_101_dc5",
155+
"detectron2_fasterrcnn_r_101_fpn",
156+
"detectron2_fasterrcnn_r_50_c4",
157+
"detectron2_fasterrcnn_r_50_dc5",
158+
"detectron2_fasterrcnn_r_50_fpn",
159+
"detectron2_fcos_r_50_fpn",
160+
"detectron2_maskrcnn_r_101_c4",
161+
"detectron2_maskrcnn_r_101_fpn",
162+
"detectron2_maskrcnn_r_50_c4",
163+
"detectron2_maskrcnn_r_50_fpn",
164+
"mobilenet_v2_quantized_qat",
165+
"pyhpc_turbulent_kinetic_energy",
166+
"vision_maskrcnn",
167+
"resnet50_quantized_qat", # Eager model failed to run(Quantize only works on Float Tensor, got Double)
168+
# Huggingface
169+
"AllenaiLongformerBase",
170+
"BartForConditionalGeneration", # OOM
171+
"DebertaV2ForQuestionAnswering", # OOM
172+
"MBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94793
173+
"PLBartForConditionalGeneration", # Accuracy https://github.com/pytorch/pytorch/issues/94794
174+
# TIMM
175+
"cait_m36_384", # Accuracy
176+
"pnasnet5large", # OOM
177+
]
178+
149179
CI_SKIP[CI("inductor", training=True)] = [
150180
*CI_SKIP[CI("inductor", training=False)],
151181
# TorchBench
@@ -1869,9 +1899,11 @@ def run(runner, args, original_dir=None):
18691899
set(CI_SKIP[ci(dynamic=True)]) - set(CI_SKIP[ci(dynamic=False)])
18701900
)
18711901
else:
1872-
args.exclude_exact = CI_SKIP[
1873-
CI(args.backend, training=args.training, dynamic=args.dynamic_shapes)
1874-
]
1902+
ci = functools.partial(
1903+
CI, args.backend, training=args.training, dynamic=args.dynamic_shapes
1904+
)
1905+
for device in args.devices:
1906+
args.exclude_exact.extend(CI_SKIP[ci(device=device)])
18751907
if args.ddp:
18761908
# TODO: we could also hook DDP bench up to --speedup bench, _not_ for mgpu e2e perf,
18771909
# but just to measure impact on singlenode of performing graph-breaks.

test/expect/HasDecompTest.test_has_decomposition.expect

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -719,6 +719,8 @@ aten::embedding_renorm_
719719
aten::empty.memory_format
720720
aten::empty.names
721721
aten::empty.names_out
722+
aten::empty_permuted
723+
aten::empty_permuted.out
722724
aten::empty_quantized
723725
aten::empty_quantized.out
724726
aten::equal

test/functorch/test_aotdispatch.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2485,7 +2485,6 @@ def forward(self, x):
24852485
xfail('nn.functional.interpolate', 'linear'), # Cannot call sizes() on tensor with symbolic sizes/strides
24862486
xfail('nn.functional.interpolate', 'trilinear'), # Cannot call sizes() on tensor with symbolic sizes/st...
24872487
xfail('nn.functional.max_pool1d', ''), # Cannot call sizes() on tensor with symbolic sizes/strides
2488-
xfail('nn.functional.max_pool2d', ''), # aten.max_pool2d_with_indices_backward.default - couldn't find s...
24892488
xfail('nn.functional.max_pool3d', ''), # aten.max_pool3d_with_indices.default - couldn't find symbolic m...
24902489
xfail('nn.functional.max_unpool1d', ''), # aten.max_unpool2d.default - couldn't find symbolic meta funct...
24912490
xfail('nn.functional.max_unpool1d', 'grad'), # aten.max_unpool2d.default - couldn't find symbolic meta ...
@@ -2503,7 +2502,6 @@ def forward(self, x):
25032502
xfail('nn.functional.pixel_unshuffle', ''), # aten.pixel_unshuffle.default - couldn't find symbolic meta...
25042503
xfail('nn.functional.rrelu', ''), # aten.rrelu_with_noise.default - couldn't find symbolic meta function...
25052504
xfail('nn.functional.smooth_l1_loss', ''), # could not find kernel
2506-
xfail('nn.functional.unfold', ''), # Cannot call sizes() on tensor with symbolic sizes/strides
25072505
xfail('norm', 'nuc'), # Cannot call sizes() on tensor with symbolic sizes/strides
25082506
xfail('normal', 'number_mean'), # Cannot call sizes() on tensor with symbolic sizes/strides
25092507
xfail('ormqr', ''), # aten.ormqr.default - couldn't find symbolic meta function/decomposition

0 commit comments

Comments
 (0)