Skip to content

Commit edf751c

Browse files
smessmer authored and facebook-github-bot committed
Make empty c10-full (#46092)
Summary: Pull Request resolved: #46092 Make empty c10-full without using hacky-wrapper, i.e. port the kernel to the new style signature. This PR also changes the signature of some helpers called by empty to the new style. ghstack-source-id: 116544203 (Note: this ignores all push blocking failures!) Test Plan: vs prev diff (outdated, before c10::optional fix): https://www.internalfb.com/intern/fblearner/details/224735103/ after c10::optional fix: https://www.internalfb.com/intern/fblearner/details/231391773/ Also, after the c10::optional fix, the instruction counting benchmark shows a 2% regression for calling empty from Python. We decided this is acceptable and decided against landing D24425836 which would fix the regression. Reviewed By: ezyang Differential Revision: D24219944 fbshipit-source-id: e554096e90ce438c75b679131c3151ff8e5c5d50
1 parent 3649a2c commit edf751c

36 files changed

Lines changed: 243 additions & 181 deletions

aten/src/ATen/BatchingRegistrations.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -869,10 +869,13 @@ Tensor new_zeros_batching_rule(
869869
Tensor new_empty_batching_rule(
870870
const Tensor& self,
871871
IntArrayRef size,
872-
const TensorOptions& options) {
872+
c10::optional<ScalarType> dtype,
873+
c10::optional<Layout> layout,
874+
c10::optional<Device> device,
875+
c10::optional<bool> pin_memory) {
873876
auto physical_view = MultiBatchVmapTransform::logicalToPhysical(self);
874877
auto physical_size = physical_view.getPhysicalShape(size);
875-
auto result = physical_view.tensor().new_empty(physical_size, options);
878+
auto result = physical_view.tensor().new_empty(physical_size, TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory));
876879
return physical_view.newLogicalFromPhysical(result);
877880
}
878881

aten/src/ATen/ScalarOps.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,11 @@ Tensor& scalar_fill(Tensor& self, Scalar value) {
2929
return self;
3030
}
3131

32-
Tensor scalar_tensor_static(Scalar s, const TensorOptions& options) {
32+
Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
33+
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
3334
at::tracer::impl::NoTracerDispatchMode tracer_guard;
3435
at::AutoNonVariableTypeMode non_var_type_mode(true);
35-
auto result = at::detail::empty_cpu({}, options);
36+
auto result = at::detail::empty_cpu({}, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
3637
scalar_fill(result, s);
3738
return result;
3839
}

aten/src/ATen/ScalarOps.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ namespace detail {
1212
// but we also want to skip compute_types which in not avoidable
1313
// in TensorIterator for now.
1414
Tensor& scalar_fill(Tensor& self, Scalar value);
15-
TORCH_API Tensor scalar_tensor_static(Scalar s, const TensorOptions& options);
15+
TORCH_API Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
16+
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt,
17+
c10::optional<c10::MemoryFormat> memory_format_opt);
1618
} // namespace detail
1719
} // namespace at
1820

@@ -25,12 +27,12 @@ inline at::Tensor scalar_to_tensor(Scalar s, const Device device = at::kCPU) {
2527
// This is the fast track we have for CPU scalar tensors.
2628
if (device == at::kCPU && !s.isComplex()) {
2729
if (s.isFloatingPoint()) {
28-
return at::detail::scalar_tensor_static(s, at::device(at::kCPU).dtype(at::kDouble));
30+
return at::detail::scalar_tensor_static(s, at::kDouble, c10::nullopt, at::kCPU, c10::nullopt, c10::nullopt);
2931
} else if (s.isBoolean()) {
30-
return at::detail::scalar_tensor_static(s, at::device(at::kCPU).dtype(at::kBool));
32+
return at::detail::scalar_tensor_static(s, at::kBool, c10::nullopt, at::kCPU, c10::nullopt, c10::nullopt);
3133
} else {
3234
AT_ASSERT(s.isIntegral(false));
33-
return at::detail::scalar_tensor_static(s, at::device(at::kCPU).dtype(at::kLong));
35+
return at::detail::scalar_tensor_static(s, at::kLong, c10::nullopt, at::kCPU, c10::nullopt, c10::nullopt);
3436
}
3537
}
3638
if (s.isFloatingPoint()) {

aten/src/ATen/Utils.cpp

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,32 +16,24 @@ int _crash_if_asan(int arg) {
1616

1717
namespace detail {
1818
// empty_cpu is used in ScalarOps.h, which can be referenced by other ATen files. Since we want to decouple direct referencing native symbols and only access native symbols through dispatching, we move its implementation here.
19-
Tensor empty_cpu(
20-
IntArrayRef size,
21-
const TensorOptions& options,
22-
c10::optional<c10::MemoryFormat> optional_memory_format) {
23-
TORCH_CHECK(
24-
!(options.has_memory_format() && optional_memory_format.has_value()),
25-
"Cannot set memory_format both in TensorOptions and explicit argument; please delete "
26-
"the redundant setter.");
27-
const MemoryFormat memory_format =
28-
optional_memory_format.value_or(
29-
options.memory_format_opt().value_or(
30-
MemoryFormat::Contiguous));
19+
Tensor empty_cpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
20+
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
21+
Device device = device_or_default(device_opt);
3122

32-
AT_ASSERT(options.device().type() == DeviceType::CPU);
23+
TORCH_CHECK(device.type() == DeviceType::CPU);
3324
check_size_nonnegative(size);
3425

26+
bool pin_memory = pinned_memory_or_default(pin_memory_opt);
3527
c10::Allocator* allocator;
36-
if (options.pinned_memory()) {
28+
if (pin_memory) {
3729
allocator = detail::getCUDAHooks().getPinnedMemoryAllocator();
3830
} else {
3931
allocator = at::getCPUAllocator();
4032
}
4133

4234
int64_t nelements = prod_intlist(size);
43-
const caffe2::TypeMeta dtype = options.dtype();
44-
const int64_t size_bytes = nelements * dtype.itemsize();
35+
caffe2::TypeMeta dtype = scalarTypeToTypeMeta(dtype_or_default(dtype_opt));
36+
int64_t size_bytes = nelements * dtype.itemsize();
4537
auto storage_impl = c10::make_intrusive<StorageImpl>(
4638
c10::StorageImpl::use_byte_size_t(),
4739
size_bytes,
@@ -56,6 +48,7 @@ Tensor empty_cpu(
5648
tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size);
5749
}
5850

51+
auto memory_format = memory_format_opt.value_or(MemoryFormat::Contiguous);
5952
tensor.unsafeGetTensorImpl()->empty_tensor_restride(memory_format);
6053

6154
return tensor;

aten/src/ATen/Utils.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,10 +136,8 @@ inline void check_size_nonnegative(IntArrayRef size) {
136136

137137
namespace detail {
138138
CAFFE2_API
139-
Tensor empty_cpu(
140-
IntArrayRef size,
141-
const TensorOptions& options = {},
142-
c10::optional<MemoryFormat> memory_format = c10::nullopt);
139+
Tensor empty_cpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
140+
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt);
143141
} // namespace detail
144142

145143
} // at

aten/src/ATen/native/MetaTensor.cpp

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,34 +7,29 @@ namespace native {
77
// Will be promoted to a public API later, but not now
88
Tensor empty_meta(
99
IntArrayRef size,
10-
const TensorOptions& options_,
11-
c10::optional<c10::MemoryFormat> optional_memory_format
10+
c10::optional<ScalarType> dtype,
11+
c10::optional<Layout> layout,
12+
c10::optional<Device> device,
13+
c10::optional<bool> pin_memory,
14+
c10::optional<c10::MemoryFormat> memory_format
1215
) {
13-
TORCH_CHECK(
14-
!(options_.has_memory_format() && optional_memory_format.has_value()),
15-
"Cannot set memory_format both in TensorOptions and explicit argument; please delete "
16-
"the redundant setter.");
17-
TensorOptions options = options_.merge_memory_format(optional_memory_format);
18-
1916
// TODO: deduplicate this logic with empty_cpu
2017

21-
auto dtype = options.dtype();
22-
auto device = options.device();
2318
auto tensor = detail::make_tensor<TensorImpl>(
2419
// NB: We include the computed dispatch key, not because it will actually
2520
// participate in dispatch, but so that tests like is_sparse/is_cuda
2621
// give the correct result (a CUDA meta tensor "is cuda"). If we don't
2722
// like this, remove the computeDispatchKey line
28-
DispatchKeySet{DispatchKey::Meta, options.computeDispatchKey()},
29-
dtype,
23+
DispatchKeySet{DispatchKey::Meta, computeDispatchKey(dtype, layout, device)},
24+
scalarTypeToTypeMeta(dtype_or_default(dtype)),
3025
device
3126
);
3227
if (size.size() != 1 || size[0] != 0) {
3328
tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size);
3429
}
3530

36-
auto memory_format = options.memory_format_opt().value_or(MemoryFormat::Contiguous);
37-
tensor.unsafeGetTensorImpl()->empty_tensor_restride(memory_format);
31+
auto memory_format_ = memory_format.value_or(MemoryFormat::Contiguous);
32+
tensor.unsafeGetTensorImpl()->empty_tensor_restride(memory_format_);
3833

3934
return tensor;
4035
}

aten/src/ATen/native/TensorFactories.cpp

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,9 @@ Tensor polar(const Tensor& abs, const Tensor& angle) {
165165
}
166166

167167
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
168-
Tensor empty_cpu(IntArrayRef size, const TensorOptions& options_, c10::optional<c10::MemoryFormat> optional_memory_format) {
169-
return at::detail::empty_cpu(size, options_, optional_memory_format);
168+
Tensor empty_cpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
169+
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
170+
return at::detail::empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
170171
}
171172

172173
Tensor empty(
@@ -186,9 +187,10 @@ Tensor empty(
186187
return result;
187188
}
188189

189-
Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, const TensorOptions& options) {
190+
Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, c10::optional<ScalarType> dtype_opt,
191+
c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt) {
190192
check_size_nonnegative(size);
191-
auto t = at::native::empty_cpu({0}, options);
193+
auto t = at::native::empty_cpu({0}, dtype_opt, layout_opt, device_opt, pin_memory_opt);
192194
at::native::resize_impl_cpu_(t.unsafeGetTensorImpl(), size, stride);
193195
return t;
194196
}
@@ -336,9 +338,16 @@ Tensor empty_like(
336338
Tensor new_empty(
337339
const Tensor& self,
338340
IntArrayRef size,
339-
const TensorOptions& options
341+
c10::optional<ScalarType> dtype_opt,
342+
c10::optional<Layout> layout_opt,
343+
c10::optional<Device> device_opt,
344+
c10::optional<bool> pin_memory_opt
340345
) {
341-
return at::empty(size, self.options().merge_in(options));
346+
auto dtype = dtype_opt.has_value() ? dtype_opt : optTypeMetaToScalarType(self.options().dtype_opt());
347+
auto layout = layout_opt.has_value() ? layout_opt : self.options().layout_opt();
348+
auto device = device_opt.has_value() ? device_opt : self.options().device_opt();
349+
auto pin_memory = pin_memory_opt.has_value() ? pin_memory_opt : self.options().pinned_memory_opt();
350+
return at::empty(size, dtype, layout, device, pin_memory, c10::nullopt);
342351
}
343352

344353
Tensor new_empty_strided(
@@ -507,7 +516,7 @@ Tensor scalar_tensor(Scalar s, const TensorOptions& options) {
507516
// auto result = at::empty({}, options);
508517
at::tracer::impl::NoTracerDispatchMode tracer_guard;
509518
at::AutoNonVariableTypeMode non_var_type_mode(true);
510-
auto result = empty_cpu({}, options);
519+
auto result = empty_cpu({}, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
511520
at::native::fill_(result, s);
512521
return result;
513522
}
@@ -735,13 +744,14 @@ Tensor range(
735744
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
736745

737746
Tensor tril_indices_cpu(
738-
int64_t row, int64_t col, int64_t offset, const TensorOptions& options) {
739-
check_args(row, col, options);
747+
int64_t row, int64_t col, int64_t offset, c10::optional<ScalarType> dtype_opt,
748+
c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt) {
749+
check_args(row, col, layout_opt);
740750

741751
auto tril_size = get_tril_size(row, col, offset);
742752

743753
// create an empty Tensor with correct size
744-
auto result = at::empty({2, tril_size}, options);
754+
auto result = at::native::empty_cpu({2, tril_size}, dtype_opt, layout_opt, device_opt, pin_memory_opt);
745755

746756
// The following three approaches result in very little performance
747757
// differences. Hence, the 2nd option is taken for simpler code, and to return
@@ -780,13 +790,14 @@ Tensor tril_indices_cpu(
780790
}
781791

782792
Tensor triu_indices_cpu(
783-
int64_t row, int64_t col, int64_t offset, const TensorOptions& options) {
784-
check_args(row, col, options);
793+
int64_t row, int64_t col, int64_t offset, c10::optional<ScalarType> dtype_opt,
794+
c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt) {
795+
check_args(row, col, layout_opt);
785796

786797
auto triu_size = row * col - get_tril_size(row, col, offset - 1);
787798

788799
// create an empty Tensor with correct size
789-
auto result = at::empty({2, triu_size}, options);
800+
auto result = at::native::empty_cpu({2, triu_size}, dtype_opt, layout_opt, device_opt, pin_memory_opt);
790801

791802
AT_DISPATCH_ALL_TYPES(result.scalar_type(), "triu_indices", [&]() -> void {
792803
// fill the Tensor with correct values

aten/src/ATen/native/TensorFactories.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,14 @@ inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) {
5050
}
5151

5252
inline void check_args(
53-
int64_t row, int64_t col, const TensorOptions& options) {
53+
int64_t row, int64_t col, c10::optional<Layout> layout_opt) {
5454
TORCH_CHECK(row >= 0, "row must be non-negative, got", row);
5555
TORCH_CHECK(col >= 0, "col must be non-negative, got", col);
56-
if (options.has_layout()) {
56+
if (layout_opt.has_value()) {
5757
TORCH_CHECK(
58-
options.layout() == at::kStrided,
58+
*layout_opt == at::kStrided,
5959
"only support layout=torch.strided, got",
60-
options.layout())
60+
*layout_opt)
6161
}
6262
}
6363

aten/src/ATen/native/cuda/Indexing.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -882,7 +882,8 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){
882882
//However, out with correct sizes and incorrect strides will have to be copied to from the intermediate we've produced.
883883
bool need_to_copy = out.dim() == 2 && out.sizes()[0] == num_nonzeros_h && out.sizes()[1] == self.dim() && !out.t().is_contiguous();
884884
at::Tensor out_temp = need_to_copy ?
885-
at::native::empty_cuda({self.dim(), num_nonzeros_h}, out.options()) :
885+
at::native::empty_cuda({self.dim(), num_nonzeros_h}, optTypeMetaToScalarType(out.options().dtype_opt()),
886+
out.options().layout_opt(), out.options().device_opt(), out.options().pinned_memory_opt()) :
886887
out.resize_({self.dim(), num_nonzeros_h});
887888
//Scalars are expected to produce output of size (1,0), so we can't write to it
888889
if (self.dim() > 0) {
@@ -931,7 +932,7 @@ Tensor& nonzero_out_cuda(Tensor& out, const Tensor& self){
931932
}
932933

933934
Tensor nonzero_cuda(const Tensor& self){
934-
Tensor out = at::native::empty_cuda({0}, self.options().dtype(kLong));
935+
Tensor out = at::native::empty_cuda({0}, kLong, self.options().layout_opt(), self.options().device_opt(), self.options().pinned_memory_opt());
935936
return nonzero_out_cuda(out, self);
936937
}
937938

aten/src/ATen/native/cuda/MultinomialKernel.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,9 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n
322322
// To exploit greater parallelism for the sampling, generate the
323323
// Uniform random samples in a separate kernel launch, into
324324
// temporarily allocated memory. The device RNG is thread-limited
325-
Tensor sampled = native::empty_cuda({numDist, n_sample}, self_v.options());
325+
Tensor sampled = native::empty_cuda({numDist, n_sample}, optTypeMetaToScalarType(self_v.options().dtype_opt()),
326+
self_v.options().layout_opt(), self_v.options().device_opt(),
327+
self_v.options().pinned_memory_opt());
326328
at::native::uniform_(sampled, 0.0, 1.0, generator);
327329
328330
dim3 block(numCategories < maxThreads ? numCategories : maxThreads);

0 commit comments

Comments (0)