
Commit 2c3f4ba

Update on "Remove structseq_slice"
Summary: Python 2 has reached end-of-life and is no longer supported by PyTorch. This function was already ifdef'ed out in Python 2. Added a comment about when we might be able to remove this entire file.

Test Plan: CI

Differential Revision: [D20842885](https://our.internmc.facebook.com/intern/diff/D20842885)
2 parents: 527e1f9 + 6b5557e

76 files changed: 1660 additions & 870 deletions


aten/src/ATen/Declarations.cwrap

Lines changed: 0 additions & 14 deletions
@@ -334,20 +334,6 @@
     - bool largest
     - bool sorted
 ]]
-[[
-  name: _th_exp
-  cname: exp
-  types:
-    - floating_point
-  backends:
-    - CUDA
-  variants: function
-  return: argument 0
-  arguments:
-    - arg: THTensor* result
-      output: True
-    - THTensor* self
-]]
 [[
   name: _th_erfc
   cname: erfc

aten/src/ATen/core/ivalue_inl.h

Lines changed: 2 additions & 3 deletions
@@ -310,12 +310,11 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
     return value_;
   }
 
+  // This accessor should only be used if we know that the future is
+  // completed() with no error.
   const IValue& constValue() {
     std::unique_lock<std::mutex> lock(mutex_);
     AT_ASSERT(completed());
-    if (error_) {
-      throw *error_;
-    }
     return value_;
   }

aten/src/ATen/native/TensorIterator.cpp

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 #include <ATen/ExpandUtils.h>
 #include <ATen/Parallel.h>
 #include <ATen/native/TypeProperties.h>
+#include <ATen/MemoryOverlap.h>
 
 namespace at {

aten/src/ATen/native/TensorIterator.h

Lines changed: 0 additions & 2 deletions
@@ -6,8 +6,6 @@
 #include <c10/util/TypeCast.h>
 #include <ATen/core/Range.h>
 #include <bitset>
-#include <c10/util/Optional.h>
-#include <ATen/MemoryOverlap.h>
 #include <ATen/NamedTensorUtils.h>
 #include <ATen/Parallel.h>

aten/src/ATen/native/TensorShape.cpp

Lines changed: 90 additions & 1 deletion
@@ -613,6 +613,15 @@ Tensor sum_to_size(const Tensor& self, IntArrayRef size) {
   return sum_to(self, size);
 }
 
+// We currently do not support per-channel quant for unfold, diagonal, expand, permute.
+// TODO: Make this an aten function and replace as_strided_qtensorimpl once that is done.
+Tensor make_qtensor(const Tensor& self, IntArrayRef size, IntArrayRef stride, QuantizerPtr quantizer) {
+  auto result = detail::make_tensor<QTensorImpl>(
+      Storage(self.storage()), self.key_set(), quantizer);
+  setStrided(result, size, stride, self.storage_offset());
+  return result;
+}
+
 Tensor as_strided_tensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional<int64_t> storage_offset_) {
   auto storage_offset = storage_offset_.value_or(self.storage_offset());
   auto result = detail::make_tensor<TensorImpl>(Storage(self.storage()), self.key_set());
@@ -1232,9 +1241,66 @@ inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) {
   return std::make_tuple(sizes, strides);
 }
 
+Tensor squeeze_qtensor(const Tensor& self) {
+  auto quantizer = get_qtensorimpl(self)->quantizer();
+  std::vector<int64_t> sizes;
+  std::vector<int64_t> strides;
+  std::tie(sizes, strides) = inferSqueezeGeometry(self);
+  if (quantizer->qscheme() == QScheme::PER_CHANNEL_AFFINE) {
+    const auto* per_channel_quantizer = static_cast<at::PerChannelAffineQuantizer*>(quantizer.get());
+    auto axis = per_channel_quantizer->axis();
+    int64_t shift = 0;
+    for (int64_t d = 0; d < self.dim(); ++d) {
+      if (self.sizes()[d] == 1) {
+        TORCH_CHECK(axis != d, "Squeeze is only possible on non-axis dimension for Per-Channel Quantized Tensors.");
+        if (d < axis) {
+          shift += 1;
+        }
+      }
+    }
+    axis = axis - shift;
+    quantizer = make_per_channel_affine_quantizer(per_channel_quantizer->scales(),
+                                                  per_channel_quantizer->zero_points(),
+                                                  axis,
+                                                  quantizer->scalar_type());
+  }
+  return make_qtensor(self, sizes, strides, quantizer);
+}
+
+Tensor squeeze_qtensor(const Tensor& self, int64_t dim) {
+  auto quantizer = get_qtensorimpl(self)->quantizer();
+  std::vector<int64_t> sizes;
+  std::vector<int64_t> strides;
+  std::tie(sizes, strides) = inferSqueezeGeometry(self, dim);
+  if (quantizer->qscheme() == QScheme::PER_CHANNEL_AFFINE) {
+    const auto* per_channel_quantizer = static_cast<at::PerChannelAffineQuantizer*>(quantizer.get());
+    auto axis = per_channel_quantizer->axis();
+    TORCH_CHECK(axis != dim, "Squeeze is only possible on non-axis dimension for Per-Channel Quantized Tensors.");
+    if (axis >= dim) {
+      axis -= 1;
+    }
+    quantizer = make_per_channel_affine_quantizer(per_channel_quantizer->scales(),
+                                                  per_channel_quantizer->zero_points(),
+                                                  axis,
+                                                  quantizer->scalar_type());
+  }
+  if (self.dim() == 0 || self.sizes()[dim] != 1) {
+    sizes = self.sizes().vec();
+    strides = self.strides().vec();
+  }
+  auto result = make_qtensor(self, sizes, strides, quantizer);
+  namedinference::propagate_names_except(result, self, {dim});
+  return result;
+}
+
 Tensor squeeze(const Tensor& self) {
   auto g = inferSqueezeGeometry(self);
-  auto result = self.as_strided(std::get<0>(g), std::get<1>(g));
+  at::Tensor result;
+  if (self.is_quantized()) {
+    result = squeeze_qtensor(self);
+  } else {
+    result = self.as_strided(std::get<0>(g), std::get<1>(g));
+  }
   auto maybe_outnames = namedinference::compute_squeeze_outnames(self);
   namedinference::propagate_names_if_nonempty(result, maybe_outnames);
   return result;
@@ -1244,6 +1310,9 @@ Tensor squeeze(const Tensor& self, int64_t dim) {
   int64_t dims = self.dim();
   dim = maybe_wrap_dim(dim, dims);
 
+  if (self.is_quantized()) {
+    return squeeze_qtensor(self, dim);
+  }
   if (dims == 0 || self.sizes()[dim] != 1) {
     return self.as_strided(self.sizes(), self.strides());
   }
@@ -1303,11 +1372,31 @@ static Tensor unsqueeze_sparse(Tensor const &self, int64_t dim /* should already
   }
 }
 
+Tensor unsqueeze_qtensor(const Tensor& self, int64_t dim) {
+  dim = maybe_wrap_dim(dim, self.dim() + 1);
+  auto g = inferUnsqueezeGeometry(self, dim);
+  auto quantizer = get_qtensorimpl(self)->quantizer();
+  if (quantizer->qscheme() == QScheme::PER_CHANNEL_AFFINE) {
+    const auto* per_channel_quantizer = static_cast<at::PerChannelAffineQuantizer*>(quantizer.get());
+    auto axis = per_channel_quantizer->axis();
+    if (axis >= dim) {
+      axis += 1;
+    }
+    quantizer = make_per_channel_affine_quantizer(per_channel_quantizer->scales(),
+                                                  per_channel_quantizer->zero_points(),
+                                                  axis,
+                                                  quantizer->scalar_type());
+  }
+  return make_qtensor(self, std::get<0>(g), std::get<1>(g), quantizer);
+}
+
 Tensor unsqueeze(const Tensor& self, int64_t dim) {
   dim = maybe_wrap_dim(dim, self.dim() + 1);
 
   if (self.is_sparse()) {
     return unsqueeze_sparse(self, dim);
+  } else if (self.is_quantized()) {
+    return unsqueeze_qtensor(self, dim);
   } else {
     auto g = inferUnsqueezeGeometry(self, dim);
     return self.as_strided(std::get<0>(g), std::get<1>(g));
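The per-channel branches above shift the quantizer's channel axis whenever a dimension before it is removed or inserted, and reject squeezing the axis dimension itself via the TORCH_CHECK. A minimal Python sketch of the user-visible effect (not part of the commit; the shapes, scales, and printed values are illustrative assumptions):

```python
import torch

# Per-channel quantized tensor; the quantization (channel) axis is dim 1.
x = torch.rand(1, 3, 4)
scales = torch.tensor([0.1, 0.2, 0.3])
zero_points = torch.tensor([0, 0, 0])
q = torch.quantize_per_channel(x, scales, zero_points, axis=1, dtype=torch.qint8)

s = q.squeeze()      # drops the size-1 dim 0; quantizer axis shifts 1 -> 0
print(s.shape, s.q_per_channel_axis())   # torch.Size([3, 4]) 0

u = s.unsqueeze(0)   # re-inserts a leading dim; quantizer axis shifts 0 -> 1
print(u.shape, u.q_per_channel_axis())   # torch.Size([1, 3, 4]) 1
```

Attempting to squeeze the channel axis itself would hit the "Squeeze is only possible on non-axis dimension" check added above.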

aten/src/ATen/native/UnaryOps.cpp

Lines changed: 4 additions & 1 deletion
@@ -171,6 +171,10 @@ Tensor& ceil_out(Tensor& result, const Tensor& self) {
 Tensor ceil(const Tensor& self) { return unary_op_impl(self, at::ceil_out); }
 Tensor& ceil_(Tensor& self) { return unary_op_impl_(self, at::ceil_out); }
 
+Tensor& exp_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, exp_stub); }
+Tensor exp(const Tensor& self) { return unary_op_impl(self, at::exp_out); }
+Tensor& exp_(Tensor& self) { return unary_op_impl_(self, at::exp_out); }
+
 Tensor& expm1_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, expm1_stub); }
 Tensor expm1(const Tensor& self) { return unary_op_impl(self, at::expm1_out); }
 Tensor& expm1_(Tensor& self) { return unary_op_impl_(self, at::expm1_out); }
@@ -440,7 +444,6 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) {
 
 IMPLEMENT_UNARY_OP_VEC(erfc)
 IMPLEMENT_UNARY_OP_VEC_CUDA(erfinv)
-IMPLEMENT_UNARY_OP_VEC(exp)
 IMPLEMENT_UNARY_OP_VEC_CUDA(lgamma)
 
 DEFINE_DISPATCH(abs_stub);
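With `_th_exp` dropped from Declarations.cwrap and `IMPLEMENT_UNARY_OP_VEC(exp)` removed, `exp` now follows the same out/functional/in-place trio as `expm1`, all dispatching through `exp_stub`. The Python-level surface is unchanged; a small usage sketch (illustrative values only, not from this commit):

```python
import torch

x = torch.tensor([0.0, 1.0, 2.0])

y = torch.exp(x)          # functional form -> at::exp
out = torch.empty_like(x)
torch.exp(x, out=out)     # out= form       -> at::exp_out
x.exp_()                  # in-place form   -> at::exp_

print(torch.allclose(y, out), torch.allclose(y, x))  # True True
```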

aten/src/ATen/native/cpu/BinaryOpsKernel.cpp

Lines changed: 19 additions & 6 deletions
@@ -483,12 +483,25 @@ void min_elementwise_kernel(TensorIterator& iter) {
 }
 
 void smooth_l1_kernel(TensorIterator& iter) {
-  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "smooth_l1_cpu", [&]() {
-    cpu_kernel(iter, [=](scalar_t a, scalar_t b) -> scalar_t {
-      auto z = std::abs(a - b);
-      return z < scalar_t(1.) ? scalar_t(0.5) * z * z : z - scalar_t(0.5);
-    });
-  });
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      kBFloat16, kHalf, iter.dtype(), "smooth_l1_cpu", [&]() {
+        using Vec = Vec256<scalar_t>;
+        const Vec one_vec(static_cast<scalar_t>(1));
+        const Vec point_five_vec(static_cast<scalar_t>(0.5));
+        cpu_kernel_vec(
+            iter,
+            [](scalar_t a, scalar_t b) -> scalar_t {
+              auto z = std::abs(a - b);
+              return z < static_cast<scalar_t>(1)
+                  ? static_cast<scalar_t>(0.5) * z * z
+                  : z - static_cast<scalar_t>(0.5);
+            },
+            [&one_vec, &point_five_vec](Vec a, Vec b) {
+              auto z = (a - b).abs();
+              return Vec::blendv(
+                  point_five_vec * z * z, z - point_five_vec, z >= one_vec);
+            });
+      });
 }
 
 void sigmoid_backward_kernel(TensorIterator& iter) {
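Both the scalar lambda and the new Vec256 path encode the same piecewise definition: 0.5 * z^2 for z < 1 and z - 0.5 otherwise, with z = |a - b|. A quick Python cross-check against the public functional API (a sketch assuming the default threshold of 1; not part of this commit):

```python
import torch
import torch.nn.functional as F

a = torch.randn(1000)
b = torch.randn(1000)

# Same piecewise formula as the kernel above.
z = (a - b).abs()
reference = torch.where(z < 1, 0.5 * z * z, z - 0.5)

print(torch.allclose(F.smooth_l1_loss(a, b, reduction='none'), reference))  # True
```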
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+#include <ATen/native/UnaryOps.h>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/Dispatch.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/TensorIterator.h>
+
+namespace at { namespace native {
+
+// We manually overload abs because std::abs does not work with thrust::complex types and ROCm.
+template<typename scalar_t>
+__host__ __device__ static inline scalar_t abs_wrapper(scalar_t v) {
+  return ::abs(v);
+}
+
+template<typename T>
+__host__ __device__ static inline c10::complex<T> abs_wrapper(c10::complex<T> v) {
+  return std::abs(v);
+}
+
+__host__ __device__ static inline uint8_t abs_wrapper(uint8_t v) {
+  return v;
+}
+
+__host__ __device__ static inline bool abs_wrapper(bool v) {
+  return v;
+}
+
+void abs_kernel_cuda(TensorIterator& iter) {
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, iter.dtype(), "abs_cuda", [&]() {
+    AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "abs_cuda", [&] {
+      gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t {
+        return abs_wrapper(a);
+      });
+    });
+  });
+}
+
+REGISTER_DISPATCH(abs_stub, &abs_kernel_cuda);
+
+}} // namespace at::native
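The `uint8_t` and `bool` overloads above make `abs_wrapper` an identity for unsigned and boolean inputs, which is what lets the dispatch macro cover every dtype including `Bool`. A minimal sketch of the observable behaviour (assumes a CUDA device is available; illustrative only):

```python
import torch

u = torch.tensor([0, 1, 255], dtype=torch.uint8, device='cuda')

# For unsigned integer input, abs is the identity, matching abs_wrapper(uint8_t).
print(torch.equal(torch.abs(u), u))  # True
```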
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/BinaryOps.h>
+
+// NOTE: CUDA on Windows requires that the enclosing function
+// of a __device__ lambda not have internal linkage.
+
+namespace at { namespace native {
+
+void add_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) {
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() {
+    auto alpha = alpha_scalar.to<scalar_t>();
+    gpu_kernel_with_scalars(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
+      return a + alpha * b;
+    });
+  });
+}
+
+static void sub_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) {
+  add_kernel_cuda(iter, -alpha_scalar);
+}
+
+REGISTER_DISPATCH(add_stub, &add_kernel_cuda);
+REGISTER_DISPATCH(sub_stub, &sub_kernel_cuda);
+
+}} // namespace at::native
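This new file keeps only the fused `a + alpha * b` kernel and derives subtraction by negating `alpha`, which is why `add_stub` and `sub_stub` register the same code path. A short usage sketch of the corresponding public ops (assumes a CUDA device; values are illustrative, not from this commit):

```python
import torch

a = torch.randn(4, device='cuda')
b = torch.randn(4, device='cuda')

# add computes a + alpha * b in one fused kernel ...
added = torch.add(a, b, alpha=2.0)
print(torch.allclose(added, a + 2.0 * b))                    # True

# ... and sub dispatches to the same kernel with alpha negated.
subbed = torch.sub(a, b, alpha=2.0)
print(torch.allclose(subbed, torch.add(a, b, alpha=-2.0)))   # True
```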

aten/src/ATen/native/cuda/BinaryArithmeticKernel.cu renamed to aten/src/ATen/native/cuda/BinaryMulDivKernel.cu

Lines changed: 0 additions & 41 deletions
@@ -4,27 +4,12 @@
 #include <ATen/native/cuda/zmath.cuh>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/BinaryOps.h>
-#include <c10/macros/Macros.h>
-
 
 // NOTE: CUDA on Windows requires that the enclosing function
 // of a __device__ lambda not have internal linkage.
 
 namespace at { namespace native {
 
-void add_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) {
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() {
-    auto alpha = alpha_scalar.to<scalar_t>();
-    gpu_kernel_with_scalars(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
-      return a + alpha * b;
-    });
-  });
-}
-
-static void sub_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) {
-  add_kernel_cuda(iter, -alpha_scalar);
-}
-
 void div_kernel_cuda(TensorIterator& iter) {
   if (!isIntegralType(iter.common_dtype(), /*includeBool*/ false) && iter.is_cpu_scalar(2)) {
     // optimization for floating-point types: if the second operand is a CPU
@@ -62,33 +47,7 @@ void mul_kernel_cuda(TensorIterator& iter) {
   }
 }
 
-void remainder_kernel_cuda(TensorIterator& iter) {
-  if (isIntegralType(iter.dtype(), /*includeBool*/ false)) {
-    AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "remainder_cuda", [&]() {
-      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
-        scalar_t r = a % b;
-        if ((r != 0) && ((r < 0) != (b < 0))) {
-          r += b;
-        }
-        return r;
-      });
-    });
-  } else {
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "remainder_cuda", [&]() {
-      gpu_kernel_with_scalars(iter,
-        []GPU_LAMBDA(scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
-          auto mod = ::fmod(a, b);
-          if ((mod != 0) && ((b < 0) != (mod < 0))) mod += b;
-          return mod;
-        });
-    });
-  }
-}
-
-REGISTER_DISPATCH(add_stub, &add_kernel_cuda);
-REGISTER_DISPATCH(sub_stub, &sub_kernel_cuda);
 REGISTER_DISPATCH(div_stub, &div_kernel_cuda);
 REGISTER_DISPATCH(mul_stub, &mul_kernel_cuda);
-REGISTER_DISPATCH(remainder_stub, &remainder_kernel_cuda);
 
 }} // namespace at::native
