Skip to content

Commit 1fa958d

Browse files
committed
BUG: Fix AVX512_SPR dispatching for SVML half-precision operations
During the transition from distutils CPU dispatcher to Meson, `AVX512_SPR` dispatching was inadvertently omitted for half-precision operations. SVML half-precision operations were not dynamically dispatched and only benefited from `AVX512_SKX`, which led to performance reduction on Intel SPR. This patch fixes the issue above and also moves the half-precision operations from `loops_umath_fp.dispatch.c.src` into a separate source file `loops_half.dispatch.c.src`. This separation is beneficial since `loops_umath_fp.dispatch.c.src` covers SVML single/double-precision operations as well, which would generate objects for these operations for `AVX512_SPR` without any performance benefits, leading to unnecessary increases in binary size.
1 parent 6ff905a commit 1fa958d

5 files changed

Lines changed: 122 additions & 105 deletions

File tree

numpy/_core/code_generators/generate_umath.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -776,55 +776,55 @@ def english_upper(s):
776776
Ufunc(1, 1, None,
777777
docstrings.get('numpy._core.umath.arccos'),
778778
None,
779-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
779+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
780780
TD(inexact, f='acos', astype={'e': 'f'}),
781781
TD(P, f='arccos'),
782782
),
783783
'arccosh':
784784
Ufunc(1, 1, None,
785785
docstrings.get('numpy._core.umath.arccosh'),
786786
None,
787-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
787+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
788788
TD(inexact, f='acosh', astype={'e': 'f'}),
789789
TD(P, f='arccosh'),
790790
),
791791
'arcsin':
792792
Ufunc(1, 1, None,
793793
docstrings.get('numpy._core.umath.arcsin'),
794794
None,
795-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
795+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
796796
TD(inexact, f='asin', astype={'e': 'f'}),
797797
TD(P, f='arcsin'),
798798
),
799799
'arcsinh':
800800
Ufunc(1, 1, None,
801801
docstrings.get('numpy._core.umath.arcsinh'),
802802
None,
803-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
803+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
804804
TD(inexact, f='asinh', astype={'e': 'f'}),
805805
TD(P, f='arcsinh'),
806806
),
807807
'arctan':
808808
Ufunc(1, 1, None,
809809
docstrings.get('numpy._core.umath.arctan'),
810810
None,
811-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
811+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
812812
TD(inexact, f='atan', astype={'e': 'f'}),
813813
TD(P, f='arctan'),
814814
),
815815
'arctanh':
816816
Ufunc(1, 1, None,
817817
docstrings.get('numpy._core.umath.arctanh'),
818818
None,
819-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
819+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
820820
TD(inexact, f='atanh', astype={'e': 'f'}),
821821
TD(P, f='arctanh'),
822822
),
823823
'cos':
824824
Ufunc(1, 1, None,
825825
docstrings.get('numpy._core.umath.cos'),
826826
None,
827-
TD('e', dispatch=[('loops_umath_fp', 'e')]),
827+
TD('e', dispatch=[('loops_half', 'e')]),
828828
TD('f', dispatch=[('loops_trigonometric', 'f')]),
829829
TD('d', dispatch=[('loops_trigonometric', 'd')]),
830830
TD('g' + cmplx, f='cos'),
@@ -834,7 +834,7 @@ def english_upper(s):
834834
Ufunc(1, 1, None,
835835
docstrings.get('numpy._core.umath.sin'),
836836
None,
837-
TD('e', dispatch=[('loops_umath_fp', 'e')]),
837+
TD('e', dispatch=[('loops_half', 'e')]),
838838
TD('f', dispatch=[('loops_trigonometric', 'f')]),
839839
TD('d', dispatch=[('loops_trigonometric', 'd')]),
840840
TD('g' + cmplx, f='sin'),
@@ -844,31 +844,31 @@ def english_upper(s):
844844
Ufunc(1, 1, None,
845845
docstrings.get('numpy._core.umath.tan'),
846846
None,
847-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
847+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
848848
TD(inexact, f='tan', astype={'e': 'f'}),
849849
TD(P, f='tan'),
850850
),
851851
'cosh':
852852
Ufunc(1, 1, None,
853853
docstrings.get('numpy._core.umath.cosh'),
854854
None,
855-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
855+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
856856
TD(inexact, f='cosh', astype={'e': 'f'}),
857857
TD(P, f='cosh'),
858858
),
859859
'sinh':
860860
Ufunc(1, 1, None,
861861
docstrings.get('numpy._core.umath.sinh'),
862862
None,
863-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
863+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
864864
TD(inexact, f='sinh', astype={'e': 'f'}),
865865
TD(P, f='sinh'),
866866
),
867867
'tanh':
868868
Ufunc(1, 1, None,
869869
docstrings.get('numpy._core.umath.tanh'),
870870
None,
871-
TD('e', dispatch=[('loops_umath_fp', 'e')]),
871+
TD('e', dispatch=[('loops_half', 'e')]),
872872
TD('fd', dispatch=[('loops_hyperbolic', 'fd')]),
873873
TD(inexact, f='tanh', astype={'e': 'f'}),
874874
TD(P, f='tanh'),
@@ -877,7 +877,7 @@ def english_upper(s):
877877
Ufunc(1, 1, None,
878878
docstrings.get('numpy._core.umath.exp'),
879879
None,
880-
TD('e', dispatch=[('loops_umath_fp', 'e')]),
880+
TD('e', dispatch=[('loops_half', 'e')]),
881881
TD('fd', dispatch=[('loops_exponent_log', 'fd')]),
882882
TD('fdg' + cmplx, f='exp'),
883883
TD(P, f='exp'),
@@ -886,23 +886,23 @@ def english_upper(s):
886886
Ufunc(1, 1, None,
887887
docstrings.get('numpy._core.umath.exp2'),
888888
None,
889-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
889+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
890890
TD(inexact, f='exp2', astype={'e': 'f'}),
891891
TD(P, f='exp2'),
892892
),
893893
'expm1':
894894
Ufunc(1, 1, None,
895895
docstrings.get('numpy._core.umath.expm1'),
896896
None,
897-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
897+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
898898
TD(inexact, f='expm1', astype={'e': 'f'}),
899899
TD(P, f='expm1'),
900900
),
901901
'log':
902902
Ufunc(1, 1, None,
903903
docstrings.get('numpy._core.umath.log'),
904904
None,
905-
TD('e', dispatch=[('loops_umath_fp', 'e')]),
905+
TD('e', dispatch=[('loops_half', 'e')]),
906906
TD('fd', dispatch=[('loops_exponent_log', 'fd')]),
907907
TD('fdg' + cmplx, f='log'),
908908
TD(P, f='log'),
@@ -911,23 +911,23 @@ def english_upper(s):
911911
Ufunc(1, 1, None,
912912
docstrings.get('numpy._core.umath.log2'),
913913
None,
914-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
914+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
915915
TD(inexact, f='log2', astype={'e': 'f'}),
916916
TD(P, f='log2'),
917917
),
918918
'log10':
919919
Ufunc(1, 1, None,
920920
docstrings.get('numpy._core.umath.log10'),
921921
None,
922-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
922+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
923923
TD(inexact, f='log10', astype={'e': 'f'}),
924924
TD(P, f='log10'),
925925
),
926926
'log1p':
927927
Ufunc(1, 1, None,
928928
docstrings.get('numpy._core.umath.log1p'),
929929
None,
930-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
930+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
931931
TD(inexact, f='log1p', astype={'e': 'f'}),
932932
TD(P, f='log1p'),
933933
),
@@ -944,7 +944,7 @@ def english_upper(s):
944944
Ufunc(1, 1, None,
945945
docstrings.get('numpy._core.umath.cbrt'),
946946
None,
947-
TD('efd', dispatch=[('loops_umath_fp', 'efd')]),
947+
TD('efd', dispatch=[('loops_umath_fp', 'fd'), ('loops_half', 'e')]),
948948
TD(flts, f='cbrt', astype={'e': 'f'}),
949949
TD(P, f='cbrt'),
950950
),

numpy/_core/meson.build

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,6 +1039,11 @@ foreach gen_mtargets : [
10391039
LSX,
10401040
]
10411041
],
1042+
[
1043+
'loops_half.dispatch.h',
1044+
src_file.process('src/umath/loops_half.dispatch.c.src'),
1045+
[AVX512_SPR, AVX512_SKX]
1046+
],
10421047
]
10431048
mtargets = mod_features.multi_targets(
10441049
gen_mtargets[0], umath_gen_headers + gen_mtargets[1],

numpy/_core/src/umath/loops.h.src

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,9 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
348348
/**end repeat1**/
349349
/**end repeat**/
350350

351+
#ifndef NPY_DISABLE_OPTIMIZATION
352+
#include "loops_half.dispatch.h"
353+
#endif
351354
/**begin repeat
352355
* #func = sin, cos, tan, exp, exp2, log, log2, log10, expm1, log1p, cbrt, arcsin, arccos, arctan, sinh, cosh, tanh, arcsinh, arccosh, arctanh#
353356
*/
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#include "numpy/npy_math.h"
2+
#include "simd/simd.h"
3+
#include "loops_utils.h"
4+
#include "loops.h"
5+
#include "npy_svml.h"
6+
#include "fast_loop_macros.h"
7+
8+
9+
#define NPY__SVML_IS_ENABLED (NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML))
10+
11+
#if NPY__SVML_IS_ENABLED && !defined(NPY_HAVE_AVX512_SPR)
12+
13+
typedef __m256i npyvh_f16;
14+
#define npyv_cvt_f16_f32 _mm512_cvtph_ps
15+
#define npyv_cvt_f32_f16 _mm512_cvtps_ph
16+
#define npyvh_load_f16(PTR) _mm256_loadu_si256((const __m256i*)(PTR))
17+
#define npyvh_store_f16(PTR, data) _mm256_storeu_si256((__m256i*)PTR, data)
18+
NPY_FINLINE npyvh_f16 npyvh_load_till_f16(const npy_half *ptr, npy_uintp nlane, npy_half fill)
19+
{
20+
assert(nlane > 0);
21+
const __m256i vfill = _mm256_set1_epi16(fill);
22+
const __mmask16 mask = (0x0001 << nlane) - 0x0001;
23+
return _mm256_mask_loadu_epi16(vfill, mask, ptr);
24+
}
25+
NPY_FINLINE void npyvh_store_till_f16(npy_half *ptr, npy_uintp nlane, npyvh_f16 data)
26+
{
27+
assert(nlane > 0);
28+
const __mmask16 mask = (0x0001 << nlane) - 0x0001;
29+
_mm256_mask_storeu_epi16(ptr, mask, data);
30+
}
31+
32+
/**begin repeat
33+
* #func = sin, cos, tan, exp, exp2, expm1, log, log2, log10, log1p, cbrt, asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh#
34+
* #default_val = 0, 0, 0, 0, 0, 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x3c00, 0#
35+
*/
36+
static void
37+
avx512_@func@_f16(const npy_half *src, npy_half *dst, npy_intp len)
38+
{
39+
const int num_lanes = npyv_nlanes_f32;
40+
npyvh_f16 x, out;
41+
npyv_f32 x_ps, out_ps;
42+
for (; len > 0; len -= num_lanes, src += num_lanes, dst += num_lanes) {
43+
if (len >= num_lanes) {
44+
x = npyvh_load_f16(src);
45+
x_ps = npyv_cvt_f16_f32(x);
46+
out_ps = __svml_@func@f16(x_ps);
47+
out = npyv_cvt_f32_f16(out_ps, 0);
48+
npyvh_store_f16(dst, out);
49+
}
50+
else {
51+
x = npyvh_load_till_f16(src, len, @default_val@);
52+
x_ps = npyv_cvt_f16_f32(x);
53+
out_ps = __svml_@func@f16(x_ps);
54+
out = npyv_cvt_f32_f16(out_ps, 0);
55+
npyvh_store_till_f16(dst, len, out);
56+
}
57+
}
58+
npyv_cleanup();
59+
}
60+
/**end repeat**/
61+
#endif // NPY__SVML_IS_ENABLED
62+
63+
/**begin repeat
64+
* #func = sin, cos, tan, exp, exp2, expm1, log, log2, log10, log1p, cbrt, arcsin, arccos, arctan, sinh, cosh, tanh, arcsinh, arccosh, arctanh#
65+
* #intrin = sin, cos, tan, exp, exp2, expm1, log, log2, log10, log1p, cbrt, asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh#
66+
*/
67+
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_@func@)
68+
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
69+
{
70+
#if NPY__SVML_IS_ENABLED
71+
const npy_half *src = (npy_half*)args[0];
72+
npy_half *dst = (npy_half*)args[1];
73+
74+
const npy_intp len = dimensions[0];
75+
76+
if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
77+
(steps[0] == sizeof(npy_half)) &&
78+
(steps[1] == sizeof(npy_half))) {
79+
#ifdef NPY_HAVE_AVX512_SPR
80+
__svml_@intrin@s32(src, dst, len);
81+
#else
82+
avx512_@intrin@_f16(src, dst, len);
83+
#endif
84+
return;
85+
}
86+
#endif // NPY__SVML_IS_ENABLED
87+
UNARY_LOOP {
88+
const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
89+
*((npy_half *)op1) = npy_float_to_half(npy_@intrin@f(in1));
90+
}
91+
}
92+
/**end repeat**/
93+

numpy/_core/src/umath/loops_umath_fp.dispatch.c.src

Lines changed: 1 addition & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*@targets
2-
** $maxopt baseline avx512_skx avx512_spr
2+
** $maxopt baseline avx512_skx
33
*/
44
#include "numpy/npy_math.h"
55
#include "simd/simd.h"
@@ -98,91 +98,7 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src1, npy_intp ssrc1,
9898
}
9999
/**end repeat1**/
100100
/**end repeat**/
101-
102-
typedef __m256i npyvh_f16;
103-
#define npyv_cvt_f16_f32 _mm512_cvtph_ps
104-
#define npyv_cvt_f32_f16 _mm512_cvtps_ph
105-
#define npyvh_load_f16(PTR) _mm256_loadu_si256((const __m256i*)(PTR))
106-
#define npyvh_store_f16(PTR, data) _mm256_storeu_si256((__m256i*)PTR, data)
107-
NPY_FINLINE npyvh_f16 npyvh_load_till_f16(const npy_half *ptr, npy_uintp nlane, npy_half fill)
108-
{
109-
assert(nlane > 0);
110-
const __m256i vfill = _mm256_set1_epi16(fill);
111-
const __mmask16 mask = (0x0001 << nlane) - 0x0001;
112-
return _mm256_mask_loadu_epi16(vfill, mask, ptr);
113-
}
114-
NPY_FINLINE void npyvh_store_till_f16(npy_half *ptr, npy_uintp nlane, npyvh_f16 data)
115-
{
116-
assert(nlane > 0);
117-
const __mmask16 mask = (0x0001 << nlane) - 0x0001;
118-
_mm256_mask_storeu_epi16(ptr, mask, data);
119-
}
120-
121-
/**begin repeat
122-
* #func = sin, cos, tan, exp, exp2, expm1, log, log2, log10, log1p, cbrt, asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh#
123-
* #default_val = 0, 0, 0, 0, 0, 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x3c00, 0#
124-
*/
125-
static void
126-
avx512_@func@_f16(const npy_half *src, npy_half *dst, npy_intp len)
127-
{
128-
const int num_lanes = npyv_nlanes_f32;
129-
npyvh_f16 x, out;
130-
npyv_f32 x_ps, out_ps;
131-
for (; len > 0; len -= num_lanes, src += num_lanes, dst += num_lanes) {
132-
if (len >= num_lanes) {
133-
x = npyvh_load_f16(src);
134-
x_ps = npyv_cvt_f16_f32(x);
135-
out_ps = __svml_@func@f16(x_ps);
136-
out = npyv_cvt_f32_f16(out_ps, 0);
137-
npyvh_store_f16(dst, out);
138-
}
139-
else {
140-
x = npyvh_load_till_f16(src, len, @default_val@);
141-
x_ps = npyv_cvt_f16_f32(x);
142-
out_ps = __svml_@func@f16(x_ps);
143-
out = npyv_cvt_f32_f16(out_ps, 0);
144-
npyvh_store_till_f16(dst, len, out);
145-
}
146-
}
147-
npyv_cleanup();
148-
}
149-
/**end repeat**/
150-
#endif
151-
152-
/**begin repeat
153-
* #func = sin, cos, tan, exp, exp2, expm1, log, log2, log10, log1p, cbrt, arcsin, arccos, arctan, sinh, cosh, tanh, arcsinh, arccosh, arctanh#
154-
* #intrin = sin, cos, tan, exp, exp2, expm1, log, log2, log10, log1p, cbrt, asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh#
155-
*/
156-
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_@func@)
157-
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
158-
{
159-
#if defined(NPY_HAVE_AVX512_SPR) || defined(NPY_HAVE_AVX512_SKX)
160-
#if NPY_SIMD && defined(NPY_CAN_LINK_SVML)
161-
const npy_half *src = (npy_half*)args[0];
162-
npy_half *dst = (npy_half*)args[1];
163-
164-
const npy_intp len = dimensions[0];
165-
166-
if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
167-
(steps[0] == sizeof(npy_half)) &&
168-
(steps[1] == sizeof(npy_half))) {
169-
#if defined(NPY_HAVE_AVX512_SPR)
170-
__svml_@intrin@s32(src, dst, len);
171-
return;
172-
#endif
173-
#if defined(NPY_HAVE_AVX512_SKX)
174-
avx512_@intrin@_f16(src, dst, len);
175-
return;
176101
#endif
177-
}
178-
#endif // NPY_SIMD && NPY_CAN_LINK_SVML
179-
#endif // SPR or SKX
180-
UNARY_LOOP {
181-
const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
182-
*((npy_half *)op1) = npy_float_to_half(npy_@intrin@f(in1));
183-
}
184-
}
185-
/**end repeat**/
186102

187103
/**begin repeat
188104
* #TYPE = DOUBLE, FLOAT#

0 commit comments

Comments
 (0)