
Commit 9fdd374

Update on "[Dynamo] modify IPEX backend (copy of #92067)"
This is a copy of #92067 to resolve the merge conflicts with the next PR in the stack. Go ahead and land #92067, then I can delete this one.

cc mlazos soumith voznesenskym yanboliang penguinwu anijain2305 EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx desertfire

[ghstack-poisoned]
2 parents: 938f6b2 + 2231e57

81 files changed

Lines changed: 1638 additions & 524 deletions

Note: this is a large commit; only a subset of the 81 changed file diffs is shown below.

.circleci/docker/common/install_conda.sh

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
   }

   # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
-  CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools six"
+  CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
   if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then
     # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
     # TODO: Stop using `-c malfet`

.github/ci_commit_pins/vision.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-b094075cbc8834d63a9fa8ae08bcad3d72a43321
+135a0f9ea9841b6324b4fe8974e2543cbb95709a

.github/workflows/_win-test.yml

Lines changed: 20 additions & 6 deletions
@@ -67,13 +67,27 @@ jobs:
        shell: powershell
        continue-on-error: true
        run: |
-          # https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.management/stop-process
-          # This needs to be run before checking out PyTorch to avoid locking the working directory
-          try {
-            Get-Process -Name "python" -ErrorAction Stop | Stop-Process -Force
+          # This needs to be run before checking out PyTorch to avoid locking the working directory.
+          # Below is the list of commands that could lock $GITHUB_WORKSPACE gathered from sysinternals
+          # handle tool
+          $processes = "python", "ninja", "cl", "nvcc", "cmd"
+          Foreach ($process In $processes) {
+            Try {
+              # https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.management/stop-process
+              Get-Process -Name $process -ErrorAction Stop | Stop-Process -Force
+            }
+            Catch {
+              Write-Output "No leftover $process process, continuing"
+            }
           }
-          catch {
-            Write-Output "No leftover process, continuing"
+
+          Try {
+            # Print all the processes for debugging
+            Wmic Path Win32_Process Get Caption,Processid,Commandline | Format-List
+          }
+          Catch {
+            # Better to write out whatever exception thrown to help debugging any potential issue
+            Write-Output $_
           }

      - name: Setup SSH (Click me for login details)

README.md

Lines changed: 2 additions & 1 deletion
@@ -184,7 +184,8 @@ Other potentially useful environment variables may be found in `setup.py`.
 **Common**

 ```bash
-conda install astunparse numpy ninja pyyaml setuptools cmake typing_extensions six requests dataclasses
+conda install cmake ninja
+pip install -r requirements.txt
 ```

 **On Linux**

aten/src/ATen/core/function_schema.cpp

Lines changed: 1 addition & 5 deletions
@@ -19,9 +19,6 @@ const std::vector<Argument>& FunctionSchema::getCorrectList(SchemaArgType type)
 }

 FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const {
-  auto alwaysCloneWithRealTypes = [&](const Argument& a) {
-    return a.cloneWithType(a.real_type());
-  };
   auto cloneWithRealTypes = [&](const Argument& a) {
     if (with_symint) {
       return a.cloneWithType(a.real_type());
@@ -42,8 +39,7 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const {
   };
   std::vector<Argument> new_arguments, new_returns;
   std::transform(arguments().begin(), arguments().end(), std::back_inserter(new_arguments), cloneWithRealTypes);
-  // NB: SymInt returns are always SymInt
-  std::transform(returns().begin(), returns().end(), std::back_inserter(new_returns), alwaysCloneWithRealTypes);
+  std::transform(returns().begin(), returns().end(), std::back_inserter(new_returns), cloneWithRealTypes);
   return FunctionSchema(
     name(),
     overload_name(),
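The returns path previously went through alwaysCloneWithRealTypes, so SymInt returns kept their real (SymInt) type regardless of with_symint; after this change the same conditional lambda handles both arguments and returns. Below is a hypothetical, self-contained C++ miniature of that unified logic; Argument, cloneAll, and the string-typed fields are illustrative stand-ins rather than the actual FunctionSchema API, and the non-symint branch (elided from the hunk) is reduced to an identity clone.

#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for FunctionSchema's Argument: a displayed type plus
// the "real" (symbolic-aware) type.
struct Argument {
  std::string type;
  std::string real_type;
  Argument cloneWithType(const std::string& t) const { return {t, real_type}; }
};

// One lambda now covers both arguments and returns: real types are taken only
// when with_symint is true. (The previous code sent returns through a separate
// unconditional path: "SymInt returns are always SymInt".)
std::vector<Argument> cloneAll(const std::vector<Argument>& in, bool with_symint) {
  auto cloneWithRealTypes = [&](const Argument& a) {
    return with_symint ? a.cloneWithType(a.real_type)
                       : a;  // the real non-symint branch is elided in the hunk
  };
  std::vector<Argument> out;
  for (const auto& a : in) out.push_back(cloneWithRealTypes(a));
  return out;
}

int main() {
  std::vector<Argument> returns = {{"int", "SymInt"}};
  std::cout << cloneAll(returns, /*with_symint=*/false)[0].type << "\n";  // int
  std::cout << cloneAll(returns, /*with_symint=*/true)[0].type << "\n";   // SymInt
}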

aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h

Lines changed: 5 additions & 4 deletions
@@ -74,11 +74,12 @@ class HIPStreamMasqueradingAsCUDA {
     return unwrap().pack3();
   }

-  static HIPStreamMasqueradingAsCUDA unpack3(int64_t stream_id,
-                                             int64_t device_index,
-                                             int64_t device_type) {
+  static HIPStreamMasqueradingAsCUDA unpack3(StreamId stream_id,
+                                             DeviceIndex device_index,
+                                             DeviceType device_type) {
     // NB: constructor manages CUDA->HIP translation for us
-    return HIPStreamMasqueradingAsCUDA(Stream::unpack3(stream_id, device_index, device_type));
+    return HIPStreamMasqueradingAsCUDA(Stream::unpack3(
+        stream_id, device_index, device_type));
   }

   static std::tuple<int, int> priority_range() { return HIPStream::priority_range(); }
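For context on the tighter signature: with three identical int64_t parameters, a call to unpack3 with swapped arguments compiles silently, whereas the dedicated StreamId/DeviceIndex/DeviceType aliases make at least any misplaced enum argument a compile error. A minimal standalone sketch of the idea, using hypothetical stand-in types rather than the actual c10 definitions:

#include <cstdint>

// Hypothetical stand-ins for the c10 aliases used in the new signature.
using StreamId = int64_t;
using DeviceIndex = int8_t;
enum class DeviceType : int8_t { CPU, CUDA, HIP };

struct Stream {
  StreamId id;
  DeviceIndex index;
  DeviceType type;

  // Typed parameters document intent, and DeviceType being an enum class means
  // passing it where an integer is expected (or vice versa) is a compile error,
  // unlike the old unpack3(int64_t, int64_t, int64_t).
  static Stream unpack3(StreamId id, DeviceIndex index, DeviceType type) {
    return Stream{id, index, type};
  }
};

int main() {
  Stream s = Stream::unpack3(/*stream_id=*/0, /*device_index=*/1, DeviceType::HIP);
  // Stream::unpack3(0, DeviceType::HIP, 1);  // would not compile: no implicit
  //                                          // conversions to/from enum class
  return static_cast<int>(s.type);
}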

aten/src/ATen/native/TensorProperties.cpp

Lines changed: 0 additions & 16 deletions
@@ -49,22 +49,6 @@ int64_t stride(const Tensor& self, int64_t dim) {
   return self.stride(dim);
 }

-c10::SymInt sym_size(const Tensor& self, int64_t dim) {
-  return self.sym_size(dim);
-}
-
-c10::SymInt sym_stride(const Tensor& self, int64_t dim) {
-  return self.sym_stride(dim);
-}
-
-c10::SymInt sym_numel(const Tensor& self) {
-  return self.sym_numel();
-}
-
-c10::SymInt sym_storage_offset(const Tensor& self) {
-  return self.sym_storage_offset();
-}
-
 int64_t size(const Tensor& self, Dimname dim) {
   size_t pos_dim = dimname_to_position(self, dim);
   return self.sizes()[pos_dim];
Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
+#pragma once
+
+#include <ATen/Parallel.h>
+#include <ATen/NumericUtils.h>
+#include <ATen/cpu/vec/vec.h>
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/native/ReductionType.h>
+#include <c10/util/irange.h>
+
+namespace at::native {
+inline namespace CPU_CAPABILITY {
+
+using namespace vec;
+
+#define AT_DISPATCH_REDUCTION_TYPES(op, ...)            \
+  [&] {                                                 \
+    switch (op) {                                       \
+      case SUM: {                                       \
+        static constexpr ReductionType reduce = SUM;    \
+        return __VA_ARGS__();                           \
+      }                                                 \
+      case MEAN: {                                      \
+        static constexpr ReductionType reduce = MEAN;   \
+        return __VA_ARGS__();                           \
+      }                                                 \
+      case MIN: {                                       \
+        static constexpr ReductionType reduce = MIN;    \
+        return __VA_ARGS__();                           \
+      }                                                 \
+      case MAX: {                                       \
+        static constexpr ReductionType reduce = MAX;    \
+        return __VA_ARGS__();                           \
+      }                                                 \
+      case PROD: {                                      \
+        static constexpr ReductionType reduce = PROD;   \
+        return __VA_ARGS__();                           \
+      }                                                 \
+    }                                                   \
+  }()
+
+template <typename scalar_t, ReductionType reduce>
+inline vec_scalar_t<scalar_t> init_value() {
+  using acc_t = vec_scalar_t<scalar_t>;
+  acc_t val;
+  if (reduce == ReductionType::SUM ||
+      reduce == ReductionType::MEAN) {
+    val = static_cast<acc_t>(0);
+  } else if (reduce == ReductionType::PROD) {
+    val = static_cast<acc_t>(1);
+  } else if (reduce == ReductionType::MAX) {
+    val = -std::numeric_limits<acc_t>::infinity();
+  } else {
+    TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
+    val = std::numeric_limits<acc_t>::infinity();
+  }
+  return val;
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline vec_scalar_t<scalar_t> init_value(const c10::optional<Scalar>& initial) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  if (initial.has_value()) {
+    return initial.value().to<acc_t>();
+  } else {
+    return init_value<scalar_t, reduce>();
+  }
+}
+
+template <typename scalar_t>
+inline void init(scalar_t* out, int64_t size, const vec_scalar_t<scalar_t>& val) {
+  using Vec = Vectorized<vec_scalar_t<scalar_t>>;
+  map<scalar_t>(
+      [val](Vec x) { return Vec(val); },
+      out,
+      out,
+      size);
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline void init(scalar_t* out, int64_t size, const c10::optional<Scalar>& initial) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  acc_t val = init_value<scalar_t, reduce>(initial);
+  init(out, size, val);
+}
+
+// overload with `include_self`, used by scatter_reduce
+template <typename scalar_t, ReductionType reduce>
+inline void init(scalar_t* out, int64_t size, bool include_self = false) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  if (!include_self) {
+    acc_t val = init_value<scalar_t, reduce>();
+    init(out, size, val);
+  }
+}
+
+template <typename scalar_t>
+inline scalar_t _max(const scalar_t& x, const scalar_t& y) {
+  return at::_isnan(y) ? y : std::max(x, y);
+}
+
+template <typename scalar_t>
+inline Vectorized<scalar_t> _max(const Vectorized<scalar_t>& x, const Vectorized<scalar_t>& y) {
+  // vec::maximum propagates NaN
+  return vec::maximum(x, y);
+}
+
+template <typename scalar_t>
+inline scalar_t _min(const scalar_t& x, const scalar_t& y) {
+  return at::_isnan(y) ? y : std::min(x, y);
+}
+
+template <typename scalar_t>
+inline Vectorized<scalar_t> _min(const Vectorized<scalar_t>& x, const Vectorized<scalar_t>& y) {
+  // vec::minimum propagates NaN
+  return vec::minimum(x, y);
+}
+
+// for Max and Min, propagate NaN:
+template <typename T, ReductionType reduce>
+inline T update(const T& x, const T& y) {
+  if (reduce == ReductionType::SUM ||
+      reduce == ReductionType::MEAN) {
+    return x + y;
+  } else if (reduce == ReductionType::PROD) {
+    return x * y;
+  } else if (reduce == ReductionType::MAX) {
+    return _max(x, y);
+  } else {
+    TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
+    return _min(x, y);
+  }
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline void update(scalar_t* out, scalar_t* data, int64_t K) {
+  using Vec = vec::Vectorized<vec_scalar_t<scalar_t>>;
+  map2<scalar_t>(
+      [](Vec x, Vec y) { return update<Vec, reduce>(x, y); },
+      out,
+      out,
+      data,
+      K);
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline void write(scalar_t* out, int64_t count, int64_t K) {
+  using Vec = vec::Vectorized<vec_scalar_t<scalar_t>>;
+  if (reduce == ReductionType::MEAN) {
+    if (count > 0) {
+      vec::map<scalar_t>(
+          [count](Vec x) { return x / Vec(count); },
+          out,
+          out,
+          K);
+    }
+  }
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace at::native
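The new header factors shared CPU reduction plumbing into an init/update/write protocol: init fills the output row with the reduction's identity (or a caller-supplied initial value), update folds each input row into the accumulator (propagating NaN for MIN/MAX), and write applies the final fix-up (division by count for MEAN). Below is a standalone C++ sketch of that protocol, with plain scalar loops in place of at::vec::Vectorized and a runtime enum in place of the AT_DISPATCH_REDUCTION_TYPES macro; it illustrates the pattern and is not the ATen implementation.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

enum class ReductionType { SUM, MEAN, MIN, MAX, PROD };

// Identity element per reduction, mirroring init_value() in the header.
template <typename T>
T init_value(ReductionType reduce) {
  switch (reduce) {
    case ReductionType::SUM:
    case ReductionType::MEAN: return T(0);
    case ReductionType::PROD: return T(1);
    case ReductionType::MAX:  return -std::numeric_limits<T>::infinity();
    case ReductionType::MIN:  return  std::numeric_limits<T>::infinity();
  }
  return T(0);
}

// Combine one incoming value into the accumulator, mirroring update().
// As in _min/_max above, an incoming NaN wins for MIN/MAX.
template <typename T>
T update(ReductionType reduce, T x, T y) {
  switch (reduce) {
    case ReductionType::SUM:
    case ReductionType::MEAN: return x + y;
    case ReductionType::PROD: return x * y;
    case ReductionType::MAX:  return std::isnan(y) ? y : std::max(x, y);
    case ReductionType::MIN:  return std::isnan(y) ? y : std::min(x, y);
  }
  return y;
}

int main() {
  const ReductionType reduce = ReductionType::MEAN;
  const std::vector<std::vector<double>> rows = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  const std::size_t K = rows[0].size();

  // init: fill the K-wide output row with the identity element.
  std::vector<double> out(K, init_value<double>(reduce));

  // update: fold every input row into the accumulator, lane by lane.
  for (const auto& row : rows)
    for (std::size_t k = 0; k < K; ++k)
      out[k] = update(reduce, out[k], row[k]);

  // write: MEAN divides by the contribution count at the very end.
  if (reduce == ReductionType::MEAN)
    for (auto& v : out) v /= static_cast<double>(rows.size());

  for (double v : out) std::cout << v << ' ';  // prints: 4 5 6
  std::cout << '\n';
}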
