@@ -202,6 +202,8 @@ DEFINE_DISPATCH(silu_stub);
 DEFINE_DISPATCH(silu_backward_stub);
 DEFINE_DISPATCH(mish_stub);
 DEFINE_DISPATCH(mish_backward_stub);
+DEFINE_DISPATCH(prelu_cpu_stub);
+DEFINE_DISPATCH(prelu_backward_cpu_stub);
 
 TORCH_IMPL_FUNC(elu_out) (
   const Tensor& self, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, const Tensor& result
@@ -595,253 +597,119 @@ TORCH_IMPL_FUNC(threshold_backward_out)(const Tensor& grad, const Tensor& self,
   threshold_stub(device_type(), *this, threshold, 0);
 }
 
-// -----------------------------------
-// prelu forward
-// -----------------------------------
-template <typename scalar_t>
-void inline prelu_cpu_kernel_share_weights(
-  Tensor& result,
-  const Tensor& input,
-  const Tensor& weight) {
-
-  int64_t input_numel = input.numel();
-  auto result_data = result.data_ptr<scalar_t>();
-  auto input_data = input.data_ptr<scalar_t>();
-  auto weight_val = weight.data_ptr<scalar_t>()[0];
-
-  at::parallel_for(0, input_numel, 1000, [&](int64_t start, int64_t end) {
-    for (const auto i : c10::irange(start, end)) {
-      scalar_t input_data_val = input_data[i];
-      // to allow for compiler optimization, here splitting into two lines:
-      scalar_t r = (input_data_val > 0) ? scalar_t(1) : weight_val;
-      result_data[i] = r * input_data_val;
-    }
-  });
-}
-
-template <typename scalar_t>
-void inline prelu_cpu_kernel_multi_weights(
-  Tensor& result,
-  const Tensor& input,
-  const Tensor& weight,
-  int64_t input_dim0_size,
-  int64_t channel_size,
-  int64_t input_stride0,
-  int64_t input_stride1) {
-
-  scalar_t* result_data = result.data_ptr<scalar_t>();
-  scalar_t* input_data = input.data_ptr<scalar_t>();
-  scalar_t* weight_data = weight.data_ptr<scalar_t>();
-
-  auto loop = [&](int64_t start, int64_t end) {
-    for (const auto i : c10::irange(start, end)) {
-      int64_t offset = i * channel_size * input_stride1;
-      scalar_t* n_input_data = input_data + offset;
-      scalar_t* n_result_data = result_data + offset;
-      for (const auto j : c10::irange(channel_size)) {
-        for (const auto k : c10::irange(input_stride1)) {
-          // to allow for compiler optimization, here splitting into two lines:
-          scalar_t w = (n_input_data[k] > 0) ? scalar_t(1) : weight_data[j];
-          n_result_data[k] = w * n_input_data[k];
-        }
-        n_input_data += input_stride1;
-        n_result_data += input_stride1;
-      }
-    }
-  };
-  if (input.numel() > 1000) {
-    at::parallel_for(0, input_dim0_size, 0, loop);
-  } else {
-    loop(0, input_dim0_size);
-  }
-}
-
 Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) {
-  auto input = self.contiguous();
-  auto weight = weight_.contiguous();
-
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(weight.is_contiguous());
+  int64_t weight_num = weight_.numel();
+  Tensor result = at::empty_like(self, self.suggest_memory_format());
 
-  int64_t weight_num = weight.numel();
-  Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  auto strides = input.strides();
-
-  // case1: shared weight for all channels
-  if (weight_num == 1) {
-    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_cpu", [&] {
-      prelu_cpu_kernel_share_weights<scalar_t>(result, input, weight);
-    });
-  }
-  else { // case2: multiple weights, one for each channel
-    int64_t input_ndim = input.dim();
+  if (weight_num != 1) {
+    int64_t input_ndim = self.dim();
     TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor.");
 
     int64_t channel_size = 1; // channel_size default to 1
-    int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1;
-
     if (input_ndim > 1) {
-      channel_size = input.size(1); // channel is the 2nd dim of input
-      input_dim0_size = input.size(0);
-      input_stride0 = strides[0];
-      input_stride1 = strides[1];
+      channel_size = self.size(1); // channel is the 2nd dim of input
     }
     TORCH_CHECK(channel_size == weight_num,
       "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
       " and channel size = ", channel_size, ".");
-
-    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_cpu", [&] {
-      prelu_cpu_kernel_multi_weights<scalar_t>(
-        result,
-        input,
-        weight,
-        input_dim0_size,
-        channel_size,
-        input_stride0,
-        input_stride1);
-    });
   }
-  return result;
-}
 
-// -----------------------------------
-// prelu backward
-// -----------------------------------
-template <typename scalar_t>
-void inline prelu_cpu_backward_kernel_share_weights(
-  const Tensor& input,
-  const Tensor& weight,
-  const Tensor& grad_out,
-  Tensor& input_grad,
-  Tensor& weight_grad) {
-
-  int64_t input_numel = input.numel();
-  auto input_data = input.data_ptr<scalar_t>();
-  auto weight_val = weight.data_ptr<scalar_t>()[0];
-  auto grad_out_data = grad_out.data_ptr<scalar_t>();
-  auto input_grad_data = input_grad.data_ptr<scalar_t>();
-  auto weight_grad_data = weight_grad.data_ptr<scalar_t>();
-
-  scalar_t sum = at::parallel_reduce(0, input_numel, 1000, scalar_t(0),
-      [&](int64_t start, int64_t end, scalar_t ident) -> scalar_t {
-    scalar_t partial_sum = ident;
-    for (const auto i : c10::irange(start, end)) {
-      scalar_t input_data_val = input_data[i];
-      scalar_t grad_out_data_val = grad_out_data[i];
-      // to allow for compiler optimization, here splitting into two lines:
-      scalar_t w = (input_data_val > 0) ? scalar_t(1) : weight_val;
-      input_grad_data[i] = w * grad_out_data_val;
-      // to allow for compiler optimization, here splitting into two lines:
-      scalar_t mask = (input_data_val > 0) ? scalar_t(0) : scalar_t(1);
-      partial_sum += mask * input_data_val * grad_out_data_val;
-    }
-    return partial_sum;
-  }, std::plus<scalar_t>());
-  weight_grad_data[0] = sum;
-}
-
-template <typename scalar_t>
-void inline prelu_cpu_backward_kernel_multi_weights(
-  const Tensor& input,
-  const Tensor& weight,
-  const Tensor& grad_out,
-  Tensor& input_grad,
-  Tensor& weight_grad_collector,
-  int64_t input_dim0_size,
-  int64_t channel_size,
-  int64_t input_stride0,
-  int64_t input_stride1) {
-
-  auto input_data = input.data_ptr<scalar_t>();
-  auto weight_data = weight.data_ptr<scalar_t>();
-  auto grad_out_data = grad_out.data_ptr<scalar_t>();
-  auto input_grad_data = input_grad.data_ptr<scalar_t>();
-  auto weight_grad_collector_data = weight_grad_collector.data_ptr<scalar_t>();
-
-  auto loop = [&](int64_t start, int64_t end) {
-    for (const auto i : c10::irange(start, end)) {
-      for (const auto j : c10::irange(channel_size)) {
-        for (const auto k : c10::irange(input_stride1)) {
-          int64_t pos = i * input_stride0 + j * input_stride1 + k;
-          scalar_t weight_data_val = weight_data[j];
-          scalar_t input_data_val = input_data[pos];
-          scalar_t grad_out_data_val = grad_out_data[pos];
-          // to allow for compiler optimization, here splitting into two lines:
-          scalar_t w = (input_data_val > 0) ? scalar_t(1) : weight_data_val;
-          input_grad_data[pos] = w * grad_out_data_val;
-          // to allow for compiler optimization, here splitting into two lines:
-          scalar_t mask = (input_data_val > 0) ? scalar_t(0) : scalar_t(1);
-          weight_grad_collector_data[pos] = mask * input_data_val * grad_out_data_val;
-        }
-      }
+  const int64_t ndim = self.dim();
+  // Helper to convert 1d tensors or scalar tensor to an nd tensor that broadcasts with input
+  // All elements go into the channel dimension
+  DimVector sizes(ndim, 1), strides(ndim, 0);
+  auto as_nd = [&](const Tensor& t) {
+    TORCH_INTERNAL_ASSERT(t.defined() && (t.dim() == 1 || t.dim() == 0));
+    if (ndim >= 2) {
+      sizes[1] = t.dim() == 1 ? t.sizes()[0] : 1;
+      strides[1] = t.dim() == 1 ? t.strides()[0] : 0;
+      return t.as_strided(sizes, strides);
     }
+    return t.as_strided(sizes, strides);
   };
-  if (input.numel() > 1000) {
-    at::parallel_for(0, input_dim0_size, 0, loop);
+  Tensor w;
+  if (self.scalar_type() == ScalarType::BFloat16) {
+    auto w_bf16 = at::empty(weight_.sizes(), weight_.options().dtype(ScalarType::BFloat16));
+    w_bf16.copy_(weight_);
+    w = weight_.defined() ? as_nd(w_bf16) :
+        at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU);
   } else {
-    loop(0, input_dim0_size);
+    w = weight_.defined() ? as_nd(weight_) :
+        at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU);
   }
+
+  auto iter = TensorIteratorConfig()
+    .add_output(result)
+    .add_input(self)
+    .add_input(w)
+    .build();
+  prelu_cpu_stub(iter.device_type(), iter);
+  return result;
 }
 
 std::tuple<Tensor, Tensor> prelu_backward_cpu(const Tensor& grad_out_, const Tensor& self, const Tensor& weight_) {
-  auto input = self.contiguous();
-  auto grad_out = grad_out_.contiguous();
-  auto weight = weight_.contiguous();
-
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(grad_out.is_contiguous());
-  TORCH_CHECK(weight.is_contiguous());
+  int64_t weight_num = weight_.numel();
 
-  int64_t weight_num = weight.numel();
-  auto strides = input.strides();
-  auto dims = input.dim();
+  Tensor input_grad = at::empty_like(self, self.suggest_memory_format());
+  Tensor weight_grad = at::empty_like(weight_, at::MemoryFormat::Contiguous);
+  Tensor weight_grad_collector = at::empty_like(self, at::MemoryFormat::Contiguous);
 
-  Tensor input_grad = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  Tensor weight_grad = at::empty_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  Tensor weight_grad_collector = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-
-  // case1: shared parameter for all channels
-  if (weight_num == 1) {
-    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_backward_cpu", [&] {
-      prelu_cpu_backward_kernel_share_weights<scalar_t>(input, weight, grad_out, input_grad, weight_grad);
-    });
-  }
-  else { // case2: multiple parameters, one for each channel
-    int64_t input_ndim = input.dim();
+  if (weight_num != 1) {
+    int64_t input_ndim = self.dim();
     TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor.");
 
     int64_t channel_size = 1; // channel_size default to 1
-    int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1;
-
     if (input_ndim > 1) {
-      channel_size = input.size(1); // channel is the 2nd dim of input
-      input_dim0_size = input.size(0);
-      input_stride0 = strides[0];
-      input_stride1 = strides[1];
+      channel_size = self.size(1); // channel is the 2nd dim of input
     }
     TORCH_CHECK(channel_size == weight_num,
       "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
       " and channel size = ", channel_size, ".");
+  }
 
-    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_backward_cpu", [&] {
-      prelu_cpu_backward_kernel_multi_weights<scalar_t>(
-        input,
-        weight,
-        grad_out,
-        input_grad,
-        weight_grad_collector,
-        input_dim0_size,
-        channel_size,
-        input_stride0,
-        input_stride1);
-    });
+  const int64_t ndim = self.dim();
+  // Helper to convert 1d tensor or scalar tensor to an nd tensor that broadcasts with input
+  // All elements go into the channel dimension
+  DimVector sizes(ndim, 1), strides(ndim, 0);
+  auto as_nd = [&](const Tensor& t) {
+    TORCH_INTERNAL_ASSERT(t.defined() && (t.dim() == 1 || t.dim() == 0));
+    if (ndim >= 2) {
+      sizes[1] = t.dim() == 1 ? t.sizes()[0] : 1;
+      strides[1] = t.dim() == 1 ? t.strides()[0] : 0;
+      return t.as_strided(sizes, strides);
+    }
+    return t.as_strided(sizes, strides);
+  };
+  Tensor w;
+  if (self.scalar_type() == ScalarType::BFloat16) {
+    auto w_bf16 = at::empty(weight_.sizes(), weight_.options().dtype(ScalarType::BFloat16));
+    w_bf16.copy_(weight_);
+    w = weight_.defined() ? as_nd(w_bf16) :
+        at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU);
+  } else {
+    w = weight_.defined() ? as_nd(weight_) :
+        at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU);
+  }
+
+  auto iter = TensorIteratorConfig()
+    .add_output(input_grad)
+    .add_output(weight_grad_collector)
+    .add_input(self)
+    .add_input(grad_out_)
+    .add_input(w)
+    .build();
+
+  prelu_backward_cpu_stub(iter.device_type(), iter);
+
+  if (weight_num == 1) {
+    weight_grad.fill_(weight_grad_collector.sum());
+  } else {
     // update weight_grad
     std::vector<int64_t> reduce_dims;
+    int64_t input_ndim = self.dim();
     reduce_dims.push_back(0);
-    if (dims > 2) {
-      for (const auto i : c10::irange(2, dims)) {
-        reduce_dims.push_back(i);
-      }
+    if (input_ndim > 2) {
+      for (int64_t i = 2; i < input_ndim; i++) reduce_dims.push_back(i);
     }
     weight_grad = weight_grad_collector.sum(reduce_dims);
   }
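
For context: the new implementation hinges on the `as_nd` helper above, which views the 1-d per-channel weight as an n-d tensor whose only non-trivial size and stride sit in the channel dimension, so TensorIterator can broadcast it against the input without materializing an expanded copy. Below is a minimal standalone sketch of that broadcasting trick; it assumes only the public libtorch API, and `torch::where` stands in for the stub kernels purely for illustration (it is not what `prelu_cpu_stub` dispatches to).

// Minimal sketch, assuming libtorch; illustrative only, not the PR's kernel path.
#include <torch/torch.h>
#include <iostream>

int main() {
  auto input = torch::randn({2, 3, 4, 4});   // NCHW input, 3 channels
  auto weight = torch::rand({3});            // one PReLU slope per channel

  // Mirror as_nd(): view the 1-d weight as [1, 3, 1, 1] with zero strides
  // everywhere except the channel dimension, so it broadcasts with `input`.
  std::vector<int64_t> sizes(input.dim(), 1), strides(input.dim(), 0);
  sizes[1] = weight.size(0);
  strides[1] = weight.stride(0);
  auto w = weight.as_strided(sizes, strides);

  // Elementwise PReLU with the broadcast weight: x > 0 ? x : w * x.
  auto out = torch::where(input > 0, input, w * input);
  std::cout << out.sizes() << std::endl;     // [2, 3, 4, 4]
  return 0;
}

Broadcasting through strides like this lets a single elementwise loop serve both the shared-weight case (weight_num == 1) and the per-channel case, which is why the old share_weights/multi_weights kernel pair could be deleted.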