pytorch
diff --git a/‎.circleci/docker/common/install_conda.sh‎
Lines changed: 1 addition & 1 deletion b/‎.circleci/docker/common/install_conda.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/ci_commit_pins/vision.txt‎
Lines changed: 1 addition & 1 deletion b/‎.github/ci_commit_pins/vision.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/_win-test.yml‎
Lines changed: 20 additions & 6 deletions b/‎.github/workflows/_win-test.yml‎
Lines changed: 20 additions & 6 deletions
diff --git a/‎README.md‎
Lines changed: 2 additions & 1 deletion b/‎README.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎aten/src/ATen/core/function_schema.cpp‎
Lines changed: 1 addition & 5 deletions b/‎aten/src/ATen/core/function_schema.cpp‎
Lines changed: 1 addition & 5 deletions
diff --git a/‎aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h‎
Lines changed: 5 additions & 4 deletions b/‎aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎aten/src/ATen/native/TensorProperties.cpp‎
Lines changed: 0 additions & 16 deletions b/‎aten/src/ATen/native/TensorProperties.cpp‎
Lines changed: 0 additions & 16 deletions
diff --git a/‎aten/src/ATen/native/cpu/ReduceUtils.h‎
Lines changed: 160 additions & 0 deletions b/‎aten/src/ATen/native/cpu/ReduceUtils.h‎
Lines changed: 160 additions & 0 deletions
@@ -75,7 +75,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
   }
 
   # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
-  CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools six"
+  CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
   if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then
     # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
     # TODO: Stop using `-c malfet`
 
@@ -1 +1 @@
-b094075cbc8834d63a9fa8ae08bcad3d72a43321
+135a0f9ea9841b6324b4fe8974e2543cbb95709a
@@ -67,13 +67,27 @@ jobs:
         shell: powershell
         continue-on-error: true
         run: |
-          # https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.management/stop-process
-          # This needs to be run before checking out PyTorch to avoid locking the working directory
-          try {
-              Get-Process -Name "python" -ErrorAction Stop | Stop-Process -Force
+          # This needs to be run before checking out PyTorch to avoid locking the working directory.
+          # Below is the list of commands that could lock $GITHUB_WORKSPACE gathered from sysinternals
+          # handle tool
+          $processes = "python", "ninja", "cl", "nvcc", "cmd"
+          Foreach ($process In $processes) {
+            Try {
+              # https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.management/stop-process
+              Get-Process -Name $process -ErrorAction Stop | Stop-Process -Force
+            }
+            Catch {
+              Write-Output "No leftover $process process, continuing"
+            }
           }
-          catch {
-              Write-Output "No leftover process, continuing"
+
+          Try {
+            # Print all the processes for debugging
+            Wmic Path Win32_Process Get Caption,Processid,Commandline | Format-List
+          }
+          Catch {
+            # Better to write out whatever exception thrown to help debugging any potential issue
+            Write-Output $_
           }
 
       - name: Setup SSH (Click me for login details)
 
@@ -184,7 +184,8 @@ Other potentially useful environment variables may be found in `setup.py`.
 **Common**
 
 ```bash
-conda install astunparse numpy ninja pyyaml setuptools cmake typing_extensions six requests dataclasses
+conda install cmake ninja
+pip install -r requirements.txt
 ```
 
 **On Linux**
 
@@ -19,9 +19,6 @@ const std::vector<Argument>& FunctionSchema::getCorrectList(SchemaArgType type)
 }
 
 FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const {
-  auto alwaysCloneWithRealTypes = [&](const Argument& a) {
-    return a.cloneWithType(a.real_type());
-  };
   auto cloneWithRealTypes = [&](const Argument& a) {
     if (with_symint) {
       return a.cloneWithType(a.real_type());
@@ -42,8 +39,7 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const {
   };
   std::vector<Argument> new_arguments, new_returns;
   std::transform(arguments().begin(), arguments().end(), std::back_inserter(new_arguments), cloneWithRealTypes);
-  // NB: SymInt returns are always SymInt
-  std::transform(returns().begin(), returns().end(), std::back_inserter(new_returns), alwaysCloneWithRealTypes);
+  std::transform(returns().begin(), returns().end(), std::back_inserter(new_returns), cloneWithRealTypes);
   return FunctionSchema(
     name(),
     overload_name(),
 
@@ -74,11 +74,12 @@ class HIPStreamMasqueradingAsCUDA {
     return unwrap().pack3();
   }
 
-  static HIPStreamMasqueradingAsCUDA unpack3(int64_t stream_id,
-                                             int64_t device_index,
-                                             int64_t device_type) {
+  static HIPStreamMasqueradingAsCUDA unpack3(StreamId stream_id,
+                                             DeviceIndex device_index,
+                                             DeviceType device_type) {
     // NB: constructor manages CUDA->HIP translation for us
-    return HIPStreamMasqueradingAsCUDA(Stream::unpack3(stream_id, device_index, device_type));
+    return HIPStreamMasqueradingAsCUDA(Stream::unpack3(
+        stream_id, device_index, device_type));
   }
 
   static std::tuple<int, int> priority_range() { return HIPStream::priority_range(); }
 
@@ -49,22 +49,6 @@ int64_t stride(const Tensor& self, int64_t dim) {
   return self.stride(dim);
 }
 
-c10::SymInt sym_size(const Tensor& self, int64_t dim) {
-  return self.sym_size(dim);
-}
-
-c10::SymInt sym_stride(const Tensor& self, int64_t dim) {
-  return self.sym_stride(dim);
-}
-
-c10::SymInt sym_numel(const Tensor& self) {
-  return self.sym_numel();
-}
-
-c10::SymInt sym_storage_offset(const Tensor& self) {
-  return self.sym_storage_offset();
-}
-
 int64_t size(const Tensor& self, Dimname dim) {
   size_t pos_dim = dimname_to_position(self, dim);
   return self.sizes()[pos_dim];
 
@@ -0,0 +1,160 @@
+#pragma once
+
+#include <ATen/Parallel.h>
+#include <ATen/NumericUtils.h>
+#include <ATen/cpu/vec/vec.h>
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/native/ReductionType.h>
+#include <c10/util/irange.h>
+
+namespace at::native {
+inline namespace CPU_CAPABILITY {
+
+using namespace vec;
+
+#define AT_DISPATCH_REDUCTION_TYPES(op, ...)                                   \
+  [&] {                                                                        \
+    switch (op) {                                                              \
+      case SUM: {                                                              \
+        static constexpr ReductionType reduce = SUM;                           \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case MEAN: {                                                             \
+        static constexpr ReductionType reduce = MEAN;                          \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case MIN: {                                                              \
+        static constexpr ReductionType reduce = MIN;                           \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case MAX: {                                                              \
+        static constexpr ReductionType reduce = MAX;                           \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case PROD: {                                                             \
+        static constexpr ReductionType reduce = PROD;                          \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+    }                                                                          \
+  }()
+
+template <typename scalar_t, ReductionType reduce>
+inline vec_scalar_t<scalar_t> init_value() {
+  using acc_t = vec_scalar_t<scalar_t>;
+  acc_t val;
+  if (reduce == ReductionType::SUM ||
+      reduce == ReductionType::MEAN) {
+    val = static_cast<acc_t>(0);
+  } else if (reduce == ReductionType::PROD) {
+    val = static_cast<acc_t>(1);
+  } else if (reduce == ReductionType::MAX) {
+    val = -std::numeric_limits<acc_t>::infinity();
+  } else {
+    TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
+    val = std::numeric_limits<acc_t>::infinity();
+  }
+  return val;
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline vec_scalar_t<scalar_t> init_value(const c10::optional<Scalar>& initial) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  if (initial.has_value()) {
+    return initial.value().to<acc_t>();
+  } else {
+    return init_value<scalar_t, reduce>();
+  }
+}
+
+template <typename scalar_t>
+inline void init(scalar_t* out, int64_t size, const vec_scalar_t<scalar_t>& val) {
+  using Vec = Vectorized<vec_scalar_t<scalar_t>>;
+  map<scalar_t>(
+      [val](Vec x) { return Vec(val); },
+      out,
+      out,
+      size);
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline void init(scalar_t* out, int64_t size, const c10::optional<Scalar>& initial) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  acc_t val = init_value<scalar_t, reduce>(initial);
+  init(out, size, val);
+}
+
+// overload with `include_self`, used by scatter_reduce
+template <typename scalar_t, ReductionType reduce>
+inline void init(scalar_t* out, int64_t size, bool include_self = false) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  if (!include_self) {
+    acc_t val = init_value<scalar_t, reduce>();
+    init(out, size, val);
+  }
+}
+
+template <typename scalar_t>
+inline scalar_t _max(const scalar_t& x, const scalar_t& y) {
+  return at::_isnan(y) ? y : std::max(x, y);
+}
+
+template <typename scalar_t>
+inline Vectorized<scalar_t> _max(const Vectorized<scalar_t>& x, const Vectorized<scalar_t>& y) {
+  // vec::maximum propagates NaN
+  return vec::maximum(x, y);
+}
+
+template <typename scalar_t>
+inline scalar_t _min(const scalar_t& x, const scalar_t& y) {
+  return at::_isnan(y) ? y : std::min(x, y);
+}
+
+template <typename scalar_t>
+inline Vectorized<scalar_t> _min(const Vectorized<scalar_t>& x, const Vectorized<scalar_t>& y) {
+  // vec::minimum propagates NaN
+  return vec::minimum(x, y);
+}
+
+// for Max and Min, propagate NaN:
+template <typename T, ReductionType reduce>
+inline T update(const T& x, const T& y) {
+  if (reduce == ReductionType::SUM ||
+      reduce == ReductionType::MEAN) {
+    return x + y;
+  } else if (reduce == ReductionType::PROD) {
+    return x * y;
+  } else if (reduce == ReductionType::MAX) {
+    return _max(x, y);
+  } else {
+    TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
+    return _min(x, y);
+  }
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline void update(scalar_t* out, scalar_t* data, int64_t K) {
+  using Vec = vec::Vectorized<vec_scalar_t<scalar_t>>;
+  map2<scalar_t>(
+      [](Vec x, Vec y) { return update<Vec, reduce>(x, y); },
+      out,
+      out,
+      data,
+      K);
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline void write(scalar_t* out, int64_t count, int64_t K) {
+  using Vec = vec::Vectorized<vec_scalar_t<scalar_t>>;
+  if (reduce == ReductionType::MEAN) {
+    if (count > 0) {
+      vec::map<scalar_t>(
+          [count](Vec x) { return x / Vec(count); },
+          out,
+          out,
+          K);
+    }
+  }
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace at::native
Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then`
`75`	`75`	`}`
`76`	`76`
`77`	`77`	`# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README`
`78`		`- CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools six"`
	`78`	`+ CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"`
`79`	`79`	`if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then`
`80`	`80`	`# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source`
`81`	`81`	# TODO: Stop using `-c malfet`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-b094075cbc8834d63a9fa8ae08bcad3d72a43321`
	`1`	`+135a0f9ea9841b6324b4fe8974e2543cbb95709a`