Commit 3657a7b

Update on "[quant][graphmode] quantization support for aten::add"
Summary: This is only for Tensor-Tensor add; we'll need to support Tensor-Scalar add in a different way, since that does not need an observer.

Test Plan: python test/test_jit.py

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D20519607](https://our.internmc.facebook.com/intern/diff/D20519607)

[ghstack-poisoned]
2 parents 6055d6a + 90c0cac commit 3657a7b

11 files changed

Lines changed: 763 additions & 447 deletions

aten/src/ATen/cuda/detail/OffsetCalculator.cuh

Lines changed: 30 additions & 5 deletions
@@ -17,10 +17,15 @@ constexpr int MAX_DIMS = 25;
 
 template <int NARGS, typename index_t = uint32_t>
 struct OffsetCalculator {
-  // The offset for each argument (in bytes). Wrapper around fixed-size array.
-  using offset_type = at::detail::Array<index_t, NARGS>;
+  // The offset for each argument. Wrapper around fixed-size array.
+  // On CUDA, zero sized array is not allowed, so when we are handling nullary
+  // operators, we need to create a size 1 offset to avoid compiler failure.
+  // This size 1 offset is just a placeholder, and we will not use it.
+  using offset_type = at::detail::Array<index_t, std::max<int>(NARGS, 1)>;
 
-  OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides) : dims(dims) {
+  // if element_sizes is nullptr, then the strides will be in bytes, otherwise
+  // the strides will be in # of elements.
+  OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes=nullptr) : dims(dims) {
     TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims");
     for (int i = 0; i < MAX_DIMS; ++i) {
       if (i < dims) {
@@ -29,7 +34,8 @@ struct OffsetCalculator {
         sizes_[i] = IntDivider<index_t>(1);
       }
       for (int arg = 0; arg < NARGS; arg++) {
-        strides_[i][arg] = i < dims ? strides[arg][i] : 0;
+        int64_t element_size = (element_sizes == nullptr ? 1LL : element_sizes[arg]);
+        strides_[i][arg] = i < dims ? strides[arg][i] / element_size : 0;
       }
     }
   }
@@ -60,5 +66,24 @@ struct OffsetCalculator {
 
   int dims;
   IntDivider<index_t> sizes_[MAX_DIMS];
-  index_t strides_[MAX_DIMS][NARGS];
+  index_t strides_[MAX_DIMS][std::max<int>(NARGS, 1)];
+};
+
+template <int NARGS, typename index_t = uint32_t>
+struct TrivialOffsetCalculator {
+  // The offset for each argument. Wrapper around fixed-size array.
+  // The offsets are in # of elements, not in bytes.
+  // On CUDA, zero sized array is not allowed, so when we are handling nullary
+  // operators, we need to create a size 1 offset to avoid compiler failure.
+  // This size 1 offset is just a placeholder, and we will not use it.
+  using offset_type = at::detail::Array<index_t, std::max<int>(NARGS, 1)>;
+
+  C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
+    offset_type offsets;
+    #pragma unroll
+    for (int arg = 0; arg < NARGS; arg++) {
+      offsets[arg] = linear_idx;
+    }
+    return offsets;
+  }
 };
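For orientation, here is a minimal standalone analogue of what the reworked calculator computes (a hypothetical illustration, not the ATen code): a linear index is decomposed dimension by dimension into one offset per argument, using strides expressed in elements, which is what the new element_sizes parameter makes possible. TrivialOffsetCalculator is the degenerate contiguous case where every offset is simply the linear index.

// simple_offset_calculator.cpp -- hypothetical standalone sketch, not the ATen class.
#include <array>
#include <cstdint>
#include <cstdio>

template <int NARGS>
struct SimpleOffsetCalculator {
  int dims;
  std::array<uint32_t, 8> sizes;                       // shape, innermost dimension first
  std::array<std::array<uint32_t, NARGS>, 8> strides;  // per-argument strides, in elements

  std::array<uint32_t, NARGS> get(uint32_t linear_idx) const {
    std::array<uint32_t, NARGS> offsets{};
    for (int dim = 0; dim < dims; ++dim) {
      uint32_t idx_along_dim = linear_idx % sizes[dim];
      linear_idx /= sizes[dim];
      for (int arg = 0; arg < NARGS; ++arg) {
        offsets[arg] += idx_along_dim * strides[dim][arg];
      }
    }
    return offsets;
  }
};

int main() {
  // One argument, a contiguous 2x3 tensor: element strides are {1, 3}.
  SimpleOffsetCalculator<1> calc;
  calc.dims = 2;
  calc.sizes = {3, 2, 1, 1, 1, 1, 1, 1};
  calc.strides[0][0] = 1;
  calc.strides[1][0] = 3;
  for (uint32_t i = 0; i < 6; ++i) {
    // For a contiguous layout this prints offset == i, which is exactly what
    // TrivialOffsetCalculator::get returns without doing any arithmetic.
    std::printf("linear %u -> element offset %u\n", i, calc.get(i)[0]);
  }
  return 0;
}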

aten/src/ATen/native/TensorConversions.cpp

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ static inline Tensor to_impl(const Tensor& self, const TensorOptions& options, b
   if (self.is_non_overlapping_and_dense()) {
     // Copy all strides
     auto r = at::empty_strided(self.sizes(), self.strides(), options.memory_format(c10::nullopt));
-    r.copy_(self);
+    r.copy_(self, non_blocking);
     return r;
   } else {
     memory_format = self.suggest_memory_format();
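An illustrative (hypothetical) use of the behavior this one-line fix restores, written with the C++ frontend: on the is_non_overlapping_and_dense() fast path, Tensor::to previously dropped the non_blocking flag, so a pinned-memory host-to-device copy was always synchronizing.

#include <torch/torch.h>

int main() {
  // A pinned CPU tensor can be copied to the GPU asynchronously.
  auto cpu = torch::randn({1024, 1024},
                          torch::dtype(torch::kFloat).pinned_memory(true));
  // With the fix, non_blocking is forwarded to r.copy_(self, non_blocking) on the
  // dense fast path instead of silently falling back to a blocking copy.
  auto gpu = cpu.to(torch::kCUDA, torch::kFloat, /*non_blocking=*/true);
  return 0;
}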

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 65 additions & 15 deletions
@@ -66,6 +66,27 @@ static constexpr int launch_bound2 = 4;
 
 namespace at { namespace native {
 
+template<int N>
+static OffsetCalculator<N> make_input_offset_calculator(const TensorIterator& iter) {
+  // array size can not be 0, this happens when N == 0
+  constexpr int array_size = std::max<int>(N, 1);
+  TORCH_INTERNAL_ASSERT(N == iter.ntensors() - 1);
+  std::array<const int64_t*, array_size> strides;
+  int64_t element_sizes[array_size];
+  for (int i = 0; i < N; i++) {
+    strides[i] = iter.strides(i + 1).data();
+    element_sizes[i] = iter.element_size(i + 1);
+  }
+  return OffsetCalculator<N>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
+}
+
+static OffsetCalculator<1> make_output_offset_calculator(const TensorIterator& iter) {
+  std::array<const int64_t*, 1> strides;
+  strides[0] = iter.strides(0).data();
+  int64_t element_size = iter.element_size(0);
+  return OffsetCalculator<1>(iter.ndim(), iter.shape().data(), strides.data(), &element_size);
+}
+
 // NOTE: @zasdfgbnm is currently working on rewriting the gpu loops.
 // Some of the old codes has been moved to namespace legacy, and
 // new codes will be put into namespace modern. These two namespaces
@@ -175,32 +196,37 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) {
 template<int vec_size, typename func_t, typename array_t>
 C10_LAUNCH_BOUNDS_1(num_threads)
 __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
+  using traits = function_traits<func_t>;
   int remaining = N - block_work_size * blockIdx.x;
 
   if (remaining < block_work_size) { // if this block handles the reminder, just do a naive unrolled loop
-    elementwise_kernel_helper(f, typename memory::policies::unroll<array_t>(data, remaining));
+    auto input_calc = TrivialOffsetCalculator<traits::arity>();
+    auto output_calc = TrivialOffsetCalculator<1>();
+    auto policy = memory::policies::unroll<array_t, decltype(input_calc), decltype(output_calc)>(data, remaining, input_calc, output_calc);
+    elementwise_kernel_helper(f, policy);
   } else { // if this block has a full `block_work_size` data to handle, use vectorized memory access
-    elementwise_kernel_helper(f, typename memory::policies::template vectorized<vec_size, array_t>(data));
+    elementwise_kernel_helper(f, memory::policies::vectorized<vec_size, array_t>(data));
   }
 }
 
-template<typename func_t, typename array_t>
+template<typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
 C10_LAUNCH_BOUNDS_1(num_threads)
-__global__ void unrolled_elementwise_kernel(int N, func_t f, array_t data) {
+__global__ void unrolled_elementwise_kernel(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) {
   int remaining = N - block_work_size * blockIdx.x;
-  elementwise_kernel_helper(f, typename memory::policies::unroll<array_t>(data, remaining));
+  elementwise_kernel_helper(f, memory::policies::unroll<array_t, inp_calc_t, out_calc_t>(data, remaining, ic, oc));
 }
 
-// TODO (@zasdfgbnm): this function assume trivial 1d and no dynamic casting
+// this function assume trivial 1d and no dynamic casting
 template<typename func_t, typename array_t>
-static void launch_kernel(int64_t N, const func_t& f, array_t data) {
-  TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
-  if (N == 0) {
-    return;
-  }
+static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t data) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
+  using traits = function_traits<func_t>;
   int64_t grid = (N + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();
   int vec_size = memory::can_vectorize_up_to<func_t>(data);
+  auto input_calc = TrivialOffsetCalculator<traits::arity>();
+  auto output_calc = TrivialOffsetCalculator<1>();
+
   switch (vec_size) {
     case 4:
       vectorized_elementwise_kernel<4, func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data);
@@ -209,14 +235,23 @@ static void launch_kernel(int64_t N, const func_t& f, array_t data) {
       vectorized_elementwise_kernel<2, func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data);
       break;
     case 1:
-      unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data);
+      unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data, input_calc, output_calc);
      break;
    default:
      TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size");
  }
  AT_CUDA_CHECK(cudaGetLastError());
 }
 
+template<typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
+static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
+  int64_t grid = (N + block_work_size - 1) / block_work_size;
+  auto stream = at::cuda::getCurrentCUDAStream();
+  unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data, ic, oc);
+  AT_CUDA_CHECK(cudaGetLastError());
+}
+
 } // namespace modern
 
 
@@ -234,12 +269,29 @@ void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
     data[i] = (char*)iter.data_ptr(i);
   }
 
+  int64_t numel = iter.numel();
+
+  bool contiguous = iter.is_contiguous();
+  bool dynamic_casting = needs_dynamic_casting<func_t>::check(iter);
+
+  if (contiguous && !dynamic_casting) {
+    modern::launch_vectorized_kernel(numel, f, data);
+    return;
+  }
+
+  if (!dynamic_casting) {
+    // !contiguous
+    auto input_offset_calculator = make_input_offset_calculator<traits::arity>(iter);
+    auto output_offset_calculator = make_output_offset_calculator(iter);
+    modern::launch_unrolled_kernel(numel, f, data, input_offset_calculator, output_offset_calculator);
+    return;
+  }
+
   at::detail::Array<ScalarType, ntensors> dtypes;
   for (int i = 0; i < ntensors; i++) {
     dtypes[i] = iter.tensor(i).scalar_type();
   }
 
-  int64_t numel = iter.numel();
   if (iter.is_trivial_1d()) {
     auto inner_strides = iter.get_inner_strides();
     at::detail::Array<int, ntensors> strides;
@@ -253,8 +305,6 @@ void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
       arg0_t result = legacy::invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx);
       c10::cast_and_store<arg0_t>(dtypes[0], out, result);
     });
-  } else if (iter.has_contiguous_first_dim()) {
-    modern::launch_kernel(numel, f, data);
   } else {
     legacy::launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) {
       arg0_t* out = (arg0_t*)(data[0] + strides[0] * idx);
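Taken together, the hunks above change how gpu_kernel_impl selects a loop. A condensed, hypothetical sketch of the new decision order follows (the names mirror the diff, but this is not the ATen source):

#include <cstdio>

// Which of the three loop flavors gpu_kernel_impl now reaches, in the order the
// checks appear in the diff above. `contiguous` stands for iter.is_contiguous()
// and `dynamic_casting` for needs_dynamic_casting<func_t>::check(iter).
enum class LoopPath { Vectorized, Unrolled, Legacy };

inline LoopPath pick_loop_path(bool contiguous, bool dynamic_casting) {
  if (contiguous && !dynamic_casting) {
    return LoopPath::Vectorized;  // modern::launch_vectorized_kernel
  }
  if (!dynamic_casting) {
    return LoopPath::Unrolled;    // modern::launch_unrolled_kernel + OffsetCalculators
  }
  return LoopPath::Legacy;        // legacy::launch_kernel with cast_and_store
}

int main() {
  std::printf("contiguous, no cast -> %d (vectorized)\n", (int)pick_loop_path(true, false));
  std::printf("strided, no cast    -> %d (unrolled)\n", (int)pick_loop_path(false, false));
  std::printf("needs dynamic cast  -> %d (legacy)\n", (int)pick_loop_path(false, true));
  return 0;
}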

aten/src/ATen/native/cuda/DistributionTemplates.h

Lines changed: 12 additions & 2 deletions
@@ -1,9 +1,9 @@
 #pragma once
 
 #include <ATen/Dispatch.h>
-#include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
 #include <c10/util/Half.h>
+#include <ATen/cuda/detail/OffsetCalculator.cuh>
 
 #include <curand.h>
 #include <curand_kernel.h>
@@ -77,6 +77,16 @@ __global__ void distribution_elementwise_grid_stride_kernel(int numel,
   }
 }
 
+template<int N>
+static OffsetCalculator<N> make_offset_calculator(const at::TensorIterator& iter) {
+  AT_ASSERT(N == iter.ntensors());
+  std::array<const int64_t*, N> strides;
+  for (int i = 0; i < N; i++) {
+    strides[i] = iter.strides(i).data();
+  }
+  return OffsetCalculator<N>(iter.ndim(), iter.shape().data(), strides.data());
+}
+
 /**
  * distribution_nullary_kernel is analogous to gpu_kernel in
  * ATen/native/cuda/Loops.cuh. Like gpu_kernel, it uses
@@ -144,7 +154,7 @@ void distribution_nullary_kernel(at::TensorIterator& iter,
       }
     );
   } else {
-    auto offset_calc = at::native::legacy::make_offset_calculator<1>(iter);
+    auto offset_calc = make_offset_calculator<1>(iter);
     distribution_elementwise_grid_stride_kernel<accscalar_t, unroll_factor><<<grid, block, 0, stream>>>(
       numel,
       rng_engine_inputs,
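For context, here is a hedged sketch of how an offset calculator built this way is consumed by the strided branch of a grid-stride distribution kernel (simplified and hypothetical; the real kernel writes a transformed random value rather than a constant). Because no element_sizes are passed to this OffsetCalculator, the offsets it returns are byte offsets:

// Hypothetical device-side consumer of an OffsetCalculator built as above.
// offset_calc.get(li) yields one byte offset per tensor for linear index li.
template <typename scalar_t, typename offset_calc_t>
__global__ void fill_strided_output(int numel, scalar_t value, char* out_data,
                                    offset_calc_t offset_calc) {
  for (int li = blockIdx.x * blockDim.x + threadIdx.x; li < numel;
       li += gridDim.x * blockDim.x) {
    auto offsets = offset_calc.get(li);   // offsets[0] is in bytes here
    scalar_t* out = reinterpret_cast<scalar_t*>(out_data + offsets[0]);
    *out = value;                         // the real kernel writes transform(rand)
  }
}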

aten/src/ATen/native/cuda/MemoryAccess.cuh

Lines changed: 33 additions & 14 deletions
@@ -5,6 +5,7 @@
 #include <c10/util/Exception.h>
 #include <c10/macros/Macros.h>
 #include <ATen/detail/FunctionTraits.h>
+#include <ATen/cuda/detail/OffsetCalculator.cuh>
 
 // References:
 // https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access/
@@ -44,8 +45,11 @@ struct static_unroll<func, end, end> {
   static inline C10_HOST_DEVICE void with_args(Args... args) {}
 };
 
+// helper structs to be used with static_unroll to load arguments
+// one by one
+
 template<int arg_index>
-struct load_with_policy {
+struct vectorized_load_helper {
   template <typename args_t, typename policy_t>
   static __device__ void apply(policy_t &self, args_t *args, int idx) {
     using arg_t = std::tuple_element_t<arg_index, args_t>;
@@ -57,6 +61,18 @@ struct load_with_policy {
   }
 };
 
+template<int arg_index>
+struct unroll_load_helper {
+  template <typename args_t, typename policy_t, typename offset_t>
+  static __device__ void apply(policy_t &self, args_t *args, offset_t offset, int j) {
+    using arg_t = std::tuple_element_t<arg_index, args_t>;
+    // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we
+    // need a +1 offset to get the input
+    auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + offset[arg_index];
+    std::get<arg_index>(args[j]) = *ptr;
+  }
+};
+
 } // namespace detail
 
 // aligned vector generates vectorized load/store on CUDA
@@ -69,37 +85,37 @@ namespace policies {
 
 // Assumption:
 // all tensors are contiguous, that is: stride == sizeof(type) for all tensors
-template<typename data_t>
+template<typename data_t, typename inp_calc_t, typename out_calc_t>
 struct unroll {
 
   data_t data;
   int remaining;
+  inp_calc_t input_offset_calculator;
+  out_calc_t output_offset_calculator;
 
-  __device__ unroll(data_t data, int remaining): data(data), remaining(remaining) {}
+  __device__ unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc):
+    data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {}
 
   __device__ inline bool check_inbounds(int thread_work_elem) {
     return ((threadIdx.x + thread_work_elem*num_threads) < remaining);
   }
 
-  template<typename accessor_t, typename scalar_t>
-  __device__ inline void load_single_arg(accessor_t to, scalar_t *from) {
+  template<typename args_t>
+  __device__ inline void load(args_t *args, int idx) {
+    constexpr int arity = std::tuple_size<args_t>::value;
     int thread_idx = threadIdx.x;
     #pragma unroll
     for (int i = 0; i < thread_work_size; i++) {
       if (thread_idx >= remaining) {
         return;
       }
-      to(i) = from[thread_idx];
+      int linear_idx = thread_idx + block_work_size * idx;
+      auto offset = input_offset_calculator.get(linear_idx);
+      detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, i);
       thread_idx += num_threads;
     }
   }
 
-  template<typename args_t>
-  __device__ inline void load(args_t *args, int idx) {
-    constexpr int arity = std::tuple_size<args_t>::value;
-    detail::static_unroll<detail::load_with_policy, arity>::with_args(*this, args, idx);
-  }
-
   template<typename scalar_t>
   __device__ inline void store(scalar_t *from, int idx) {
     int thread_idx = threadIdx.x;
@@ -109,7 +125,10 @@ struct unroll {
       if (thread_idx >= remaining) {
        return;
      }
-      to[thread_idx] = from[i];
+      int linear_idx = thread_idx + block_work_size * idx;
+      int offset = output_offset_calculator.get(linear_idx)[0];
+      scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + offset;
+      *to = from[i];
      thread_idx += num_threads;
    }
  }
@@ -153,7 +172,7 @@ struct vectorized {
   template<typename args_t>
   __device__ inline void load(args_t *args, int idx) {
     constexpr int arity = std::tuple_size<args_t>::value;
-    detail::static_unroll<detail::load_with_policy, arity>::with_args(*this, args, idx);
+    detail::static_unroll<detail::vectorized_load_helper, arity>::with_args(*this, args, idx);
   }
 
   template<typename scalar_t>
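To see how the reworked unroll policy is consumed, here is a hedged sketch that mirrors elementwise_kernel_helper from Loops.cuh (simplified; it assumes the ATen internals function_traits, thread_work_size and c10::guts::apply): each block loads its arguments through policy.load, which now routes every element through input_offset_calculator, applies f, and stores the results through policy.store via output_offset_calculator.

// Sketch of the kernel body that drives the policies above (not the verbatim
// ATen helper). The policy object decides how a linear index becomes an address:
// unroll goes through its offset calculators, vectorized assumes contiguity.
template <typename func_t, typename policy_t>
__device__ inline void apply_with_policy(func_t f, policy_t policy) {
  using traits = function_traits<func_t>;
  using return_t = typename traits::result_type;
  using args_t = typename traits::ArgsTuple;

  int idx = blockIdx.x;
  return_t results[thread_work_size];
  args_t args[thread_work_size];

  policy.load(args, idx);        // gather inputs (per-element offsets for unroll)
  #pragma unroll
  for (int i = 0; i < thread_work_size; i++) {
    if (policy.check_inbounds(i)) {
      results[i] = c10::guts::apply(f, args[i]);
    }
  }
  policy.store(results, idx);    // scatter outputs through the output calculator
}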
