Skip to content

Commit 9699c70

Browse files
nikitaved authored and facebook-github-bot committed
Stable sort for the CPU take 2. (#51790)
Summary: Fixes #38681. A duplicate of #50052 created to become importable to the fb internal tests. Pull Request resolved: #51790 Reviewed By: agolynski Differential Revision: D26279045 Pulled By: glaringlee fbshipit-source-id: 348e171dee9c370a76002b65d0c82c329f57a421
1 parent 5fda3b0 commit 9699c70

15 files changed

Lines changed: 249 additions & 29 deletions

aten/src/ATen/LegacyTHFunctionsCUDA.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,8 @@ std::tuple<Tensor &,Tensor &> _th_mode_out(Tensor & values, Tensor & indices, co
2828
std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdim);
2929
std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending);
3030
std::tuple<Tensor,Tensor> _th_sort(const Tensor & self, int64_t dim, bool descending);
31+
std::tuple<Tensor &,Tensor &> _th_sort_out_stable(Tensor & values, Tensor & indices, const Tensor & self, c10::optional<bool> stable, int64_t dim, bool descending);
32+
std::tuple<Tensor,Tensor> _th_sort_stable(const Tensor & self, c10::optional<bool> stable, int64_t dim, bool descending);
3133
std::tuple<Tensor &,Tensor &> _th_topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted);
3234
std::tuple<Tensor,Tensor> _th_topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted);
3335
Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);

aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -304,10 +304,13 @@ std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdi
304304
}
305305
return std::tuple<Tensor, Tensor>(values, indices);
306306
}
307-
std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending) {
307+
std::tuple<Tensor &,Tensor &> _th_sort_out_stable(Tensor & values, Tensor & indices, const Tensor & self, c10::optional<bool> stable, int64_t dim, bool descending) {
308308
// DeviceGuard omitted
309309
auto dispatch_scalar_type = infer_scalar_type(self);
310310

311+
TORCH_INTERNAL_ASSERT(stable.has_value(), "sort_out(): c10::optional<bool> for stable has to have value.");
312+
TORCH_CHECK(!stable.value(), "stable=True is not implemented on CUDA yet.");
313+
311314
switch (dispatch_scalar_type) {
312315
case ScalarType::Byte: {
313316
auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CUDA, dispatch_scalar_type);
@@ -370,8 +373,15 @@ std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, co
370373
}
371374
return std::tuple<Tensor &, Tensor &>(values, indices);
372375
}
373-
std::tuple<Tensor,Tensor> _th_sort(const Tensor & self, int64_t dim, bool descending) {
376+
std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending) {
377+
return _th_sort_out_stable(values, indices, self, /*stable=*/false, dim, descending);
378+
}
379+
std::tuple<Tensor,Tensor> _th_sort_stable(const Tensor & self, c10::optional<bool> stable, int64_t dim, bool descending) {
374380
// DeviceGuard omitted
381+
382+
TORCH_INTERNAL_ASSERT(stable.has_value(), "sort_out(): c10::optional<bool> for stable has to have value.");
383+
TORCH_CHECK(!stable.value(), "stable=True is not implemented on CUDA yet.");
384+
375385
auto dispatch_scalar_type = infer_scalar_type(self);
376386
auto values_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
377387
auto values = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(values_));
@@ -423,6 +433,9 @@ std::tuple<Tensor,Tensor> _th_sort(const Tensor & self, int64_t dim, bool descen
423433
}
424434
return std::tuple<Tensor, Tensor>(values, indices);
425435
}
436+
std::tuple<Tensor,Tensor> _th_sort(const Tensor & self, int64_t dim, bool descending) {
437+
return _th_sort_stable(self, /*stable=*/false, dim, descending);
438+
}
426439
std::tuple<Tensor &,Tensor &> _th_topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) {
427440
// DeviceGuard omitted
428441
auto dispatch_scalar_type = infer_scalar_type(self);

aten/src/ATen/native/CompositeRandomAccessorCommon.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -122,6 +122,9 @@ class CompositeRandomAccessor {
122122
using difference_type = typename std::iterator_traits<KeyAccessor>::difference_type;
123123
using iterator_category = std::random_access_iterator_tag;
124124

125+
C10_HOST_DEVICE
126+
CompositeRandomAccessor() = default;
127+
125128
C10_HOST_DEVICE
126129
CompositeRandomAccessor(KeyAccessor keys, ValueAccessor values)
127130
: keys(keys), values(values)

aten/src/ATen/native/NamedTensor.cpp

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -359,9 +359,15 @@ Tensor scatter_add(const Tensor& self, Dimname dim, const Tensor& index, const T
359359
Tensor& scatter_add_(Tensor& self, Dimname dim, const Tensor& index, const Tensor& source) {
360360
reportNYIDimnameOverload("scatter_add");
361361
}
362+
std::tuple<Tensor&, Tensor&> sort_out(Tensor& values, Tensor& indices, const Tensor& self, c10::optional<bool> stable, Dimname dim, bool keepdim) {
363+
reportNYIDimnameOverload("sort");
364+
}
362365
std::tuple<Tensor&, Tensor&> sort_out(Tensor& values, Tensor& indices, const Tensor& self, Dimname dim, bool keepdim) {
363366
reportNYIDimnameOverload("sort");
364367
}
368+
std::tuple<Tensor, Tensor> sort(const Tensor& self, c10::optional<bool> stable, Dimname dim, bool keepdim) {
369+
reportNYIDimnameOverload("sort");
370+
}
365371
std::tuple<Tensor, Tensor> sort(const Tensor& self, Dimname dim, bool keepdim) {
366372
reportNYIDimnameOverload("sort");
367373
}

aten/src/ATen/native/Sorting.cpp

Lines changed: 24 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -682,10 +682,11 @@ Tensor nanmedian_cpu(const Tensor& self) {
682682
return median_impl(self, /*ignore_nan=*/true);
683683
}
684684

685-
std::tuple<Tensor&, Tensor&> sort_out_cpu(
685+
std::tuple<Tensor&, Tensor&> sort_out_cpu_stable(
686686
Tensor& values,
687687
Tensor& indices,
688688
const Tensor& self,
689+
c10::optional<bool> stable,
689690
int64_t dim,
690691
bool descending) {
691692
values.resize_(self.sizes()).copy_(self);
@@ -697,18 +698,37 @@ std::tuple<Tensor&, Tensor&> sort_out_cpu(
697698
return std::forward_as_tuple(values, indices);
698699
}
699700

700-
sort_stub(kCPU, values, indices, dim, descending);
701+
TORCH_INTERNAL_ASSERT(stable.has_value(), "sort_out(): c10::optional<bool> for stable has to have value.");
702+
sort_stub(kCPU, values, indices, dim, descending, stable.value());
701703

702704
return std::forward_as_tuple(values, indices);
703705
}
704706

705-
std::tuple<Tensor, Tensor> sort_cpu(
707+
std::tuple<Tensor&, Tensor&> sort_out_cpu(
708+
Tensor& values,
709+
Tensor& indices,
710+
const Tensor& self,
711+
int64_t dim,
712+
bool descending) {
713+
return sort_out_cpu_stable(values, indices, self, /*stable=*/false, dim, descending);
714+
}
715+
716+
std::tuple<Tensor, Tensor> sort_cpu_stable(
706717
const Tensor& self,
718+
c10::optional<bool> stable,
707719
int64_t dim,
708720
bool descending) {
721+
TORCH_CHECK(!self.is_complex(), "sort(): input tensor must be of non-complex type");
709722
Tensor values = at::empty({0}, self.options());
710723
Tensor indices = at::empty({0}, self.options().dtype(kLong));
711-
return sort_out_cpu(values, indices, self, dim, descending);
724+
return sort_out_cpu_stable(values, indices, self, stable, dim, descending);
725+
}
726+
727+
std::tuple<Tensor, Tensor> sort_cpu(
728+
const Tensor& self,
729+
int64_t dim,
730+
bool descending) {
731+
return sort_cpu_stable(self, /*stable=*/false, dim, descending);
712732
}
713733

714734
Tensor& msort_out(Tensor& values, const Tensor& self) {

aten/src/ATen/native/Sorting.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
namespace at { namespace native {
77

8-
using sort_fn = void(*)(Tensor& values, Tensor& indices, int64_t dim, bool descending);
8+
using sort_fn = void(*)(Tensor& values, Tensor& indices, int64_t dim, bool descending, bool stable);
99
using topk_fn = void(*)(Tensor&, Tensor&, const Tensor&, int64_t, int64_t, bool, bool);
1010

1111
DECLARE_DISPATCH(sort_fn, sort_stub);

aten/src/ATen/native/cpu/SortingKernel.cpp

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,8 @@ static void sort_kernel(
9696
Tensor& values,
9797
Tensor& indices,
9898
int64_t dim,
99-
bool descending) {
99+
bool descending,
100+
bool stable) {
100101
dim = maybe_wrap_dim(dim, values.dim());
101102
_fill_indices(indices, dim);
102103
_dim_apply(
@@ -116,12 +117,24 @@ static void sort_kernel(
116117
>(values_accessor, indices_accessor);
117118

118119
if (descending) {
119-
std::sort(composite_accessor, composite_accessor + dim_size,
120-
KeyValueCompDesc<scalar_t>());
120+
if (stable) {
121+
std::stable_sort(composite_accessor, composite_accessor + dim_size,
122+
KeyValueCompDesc<scalar_t>());
123+
}
124+
else {
125+
std::sort(composite_accessor, composite_accessor + dim_size,
126+
KeyValueCompDesc<scalar_t>());
127+
}
121128
}
122129
else {
123-
std::sort(composite_accessor, composite_accessor + dim_size,
124-
KeyValueCompAsc<scalar_t>());
130+
if (stable) {
131+
std::stable_sort(composite_accessor, composite_accessor + dim_size,
132+
KeyValueCompAsc<scalar_t>());
133+
}
134+
else {
135+
std::sort(composite_accessor, composite_accessor + dim_size,
136+
KeyValueCompAsc<scalar_t>());
137+
}
125138
}
126139
}
127140
);

aten/src/ATen/native/native_functions.yaml

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6391,19 +6391,38 @@
63916391
CPU: sort_out_cpu
63926392
CUDA: legacy::cuda::_th_sort_out
63936393

6394+
- func: sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
6395+
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
6396+
dispatch:
6397+
CPU: sort_out_cpu_stable
6398+
CUDA: legacy::cuda::_th_sort_out_stable
6399+
63946400
- func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
63956401
variants: method, function
63966402
dispatch:
63976403
CPU: sort_cpu
63986404
CUDA: legacy::cuda::_th_sort
63996405
QuantizedCPU: sort_quantized_cpu
64006406

6407+
- func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
6408+
variants: method, function
6409+
dispatch:
6410+
CPU: sort_cpu_stable
6411+
CUDA: legacy::cuda::_th_sort_stable
6412+
QuantizedCPU: sort_quantized_cpu_stable
6413+
64016414
- func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
64026415
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
64036416

6417+
- func: sort.dimname_values_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
6418+
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
6419+
64046420
- func: sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)
64056421
variants: method, function
64066422

6423+
- func: sort.dimname_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)
6424+
variants: method, function
6425+
64076426
- func: msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
64086427
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
64096428
dispatch:

aten/src/ATen/native/quantized/TensorCompare.cpp

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,19 +20,27 @@ Tensor min_quantized_cpu(const Tensor& self) {
2020

2121
// TODO: move to TensorMath.cpp
2222

23-
std::tuple<Tensor, Tensor> sort_quantized_cpu(
23+
std::tuple<Tensor, Tensor> sort_quantized_cpu_stable(
2424
const Tensor& self,
25+
c10::optional<bool> stable,
2526
int64_t dim,
2627
bool descending) {
2728
Tensor sort_int;
2829
Tensor sort_indicies;
2930
std::tie(sort_int, sort_indicies) =
30-
at::sort(self.int_repr(), dim, descending);
31+
at::sort(self.int_repr(), stable, dim, descending);
3132
return std::forward_as_tuple(
3233
at::_make_per_tensor_quantized_tensor(
3334
sort_int, self.q_scale(), self.q_zero_point()),
3435
sort_indicies);
3536
}
3637

38+
std::tuple<Tensor, Tensor> sort_quantized_cpu(
39+
const Tensor& self,
40+
int64_t dim,
41+
bool descending) {
42+
return sort_quantized_cpu_stable(self, /*stable=*/false, dim, descending);
43+
}
44+
3745
} // namespace native
3846
} // namespace at

test/test_sort_and_select.py

Lines changed: 79 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
(TestCase, run_tests, make_tensor)
1010
from torch.testing._internal.common_device_type import \
1111
(instantiate_device_type_tests, dtypes, onlyOnCPUAndCUDA,
12-
skipCUDAIfRocm, onlyCUDA, dtypesIfCUDA)
12+
skipCUDAIfRocm, onlyCUDA, dtypesIfCUDA, onlyCPU)
1313

1414
# TODO: remove this
1515
SIZE = 100
@@ -113,6 +113,84 @@ def test_sort(self, device):
113113
self.assertIsOrdered('descending', x, res2val, res2ind,
114114
'random with NaNs')
115115

116+
@onlyCUDA
117+
@dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128})
118+
def test_stable_sort_fails_on_CUDA(self, device, dtype):
119+
x = torch.tensor([1, 0, 1, 0], dtype=dtype, device=device)
120+
with self.assertRaisesRegex(RuntimeError, "stable=True is not implemented on CUDA yet."):
121+
x.sort(stable=True)
122+
123+
@onlyCPU
124+
@dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128})
125+
def test_stable_sort(self, device, dtype):
126+
for ncopies in (100, 1000, 10000):
127+
x = torch.tensor([0, 1] * ncopies, dtype=dtype, device=device)
128+
_, idx = x.sort(stable=True)
129+
self.assertEqual(
130+
idx[:ncopies],
131+
torch.arange(start=0, end=2 * ncopies, step=2, device=device)
132+
)
133+
self.assertEqual(
134+
idx[ncopies:],
135+
torch.arange(start=1, end=2 * ncopies, step=2, device=device)
136+
)
137+
138+
@onlyCPU
139+
@dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128})
140+
def test_stable_sort_against_numpy(self, device, dtype):
141+
if dtype in torch.testing.floating_types_and(torch.float16):
142+
inf = float('inf')
143+
neg_inf = -float('inf')
144+
nan = float('nan')
145+
else:
146+
if dtype != torch.bool:
147+
# no torch.iinfo support for torch.bool
148+
inf = torch.iinfo(dtype).max
149+
neg_inf = torch.iinfo(dtype).min
150+
else:
151+
inf = True
152+
neg_inf = ~inf
153+
# no nan for integral types, we use inf instead for simplicity
154+
nan = inf
155+
156+
def generate_samples():
157+
from itertools import chain, combinations
158+
159+
def repeated_index_fill(t, dim, idxs, vals):
160+
res = t
161+
for idx, val in zip(idxs, vals):
162+
res = res.index_fill(dim, idx, val)
163+
return res
164+
165+
for sizes in [(1, 10), (10, 1), (10, 10), (10, 10, 10)]:
166+
size = min(*sizes)
167+
x = (torch.randn(*sizes, device=device) * size).to(dtype)
168+
yield (x, 0)
169+
170+
# Generate tensors which are being filled at random locations
171+
# with values from the non-empty subsets of the set (inf, neg_inf, nan)
172+
# for each dimension.
173+
n_fill_vals = 3 # cardinality of (inf, neg_inf, nan)
174+
for dim in range(len(sizes)):
175+
idxs = (torch.randint(high=size, size=(size // 10,)) for i in range(n_fill_vals))
176+
vals = (inf, neg_inf, nan)
177+
subsets = chain.from_iterable(combinations(list(zip(idxs, vals)), r)
178+
for r in range(1, n_fill_vals + 1))
179+
for subset in subsets:
180+
idxs_subset, vals_subset = zip(*subset)
181+
yield (repeated_index_fill(x, dim, idxs_subset, vals_subset), dim)
182+
183+
for sizes in [(100,), (1000,), (10000,)]:
184+
size = sizes[0]
185+
# binary strings
186+
yield (torch.tensor([0, 1] * size, dtype=dtype, device=device), 0)
187+
188+
for sample, dim in generate_samples():
189+
_, idx_torch = sample.sort(dim=dim, stable=True)
190+
sample_numpy = sample.numpy()
191+
idx_numpy = np.argsort(sample_numpy, axis=dim, kind='stable')
192+
self.assertEqual(idx_torch, idx_numpy)
193+
116194
@dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False)))
117195
def test_msort(self, device, dtype):
118196
def test(shape):

0 commit comments

Comments (0)