Skip to content

Commit 75edab9

Browse files
authored
Merge pull request #20363 from seiko2plus/svml2npyv/tanh_f32
SIMD: Replace SVML/ASM of tanh(f32, f64) with universal intrinsics
2 parents 06ac508 + d99bf0e commit 75edab9

File tree

12 files changed

+629
-8
lines changed

12 files changed

+629
-8
lines changed

numpy/core/code_generators/generate_umath.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -745,7 +745,7 @@ def english_upper(s):
745745
docstrings.get('numpy.core.umath.tanh'),
746746
None,
747747
TD('e', f='tanh', astype={'e': 'f'}),
748-
TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
748+
TD('fd', dispatch=[('loops_hyperbolic', 'fd')]),
749749
TD(inexact, f='tanh', astype={'e': 'f'}),
750750
TD(P, f='tanh'),
751751
),

numpy/core/setup.py

Lines changed: 10 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -1014,6 +1014,7 @@ def generate_umath_doc_header(ext, build_dir):
10141014
join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
10151015
join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
10161016
join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
1017+
join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'),
10171018
join('src', 'umath', 'matmul.h.src'),
10181019
join('src', 'umath', 'matmul.c.src'),
10191020
join('src', 'umath', 'clip.h'),
@@ -1045,8 +1046,17 @@ def generate_umath_doc_header(ext, build_dir):
10451046

10461047
svml_path = join('numpy', 'core', 'src', 'umath', 'svml')
10471048
svml_objs = []
1049+
# The following sources have been converted into universal intrinsics,
1050+
# bringing their performance benefits to all platforms rather than
1051+
# only AVX-512 on Linux, with no performance or accuracy regression —
1052+
# in fact the opposite: better performance and, above all,
1053+
# more maintainable code.
1054+
svml_filter = (
1055+
'svml_z0_tanh_d_la.s', 'svml_z0_tanh_s_la.s'
1056+
)
10481057
if can_link_svml() and check_svml_submodule(svml_path):
10491058
svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True)
1059+
svml_objs = [o for o in svml_objs if not o.endswith(svml_filter)]
10501060

10511061
config.add_extension('_multiarray_umath',
10521062
# Forcing C language even though we have C++ sources.

numpy/core/src/_simd/_simd.dispatch.c.src

Lines changed: 23 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -15,7 +15,8 @@
1515
/**begin repeat
1616
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
1717
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
18-
* #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64#
18+
* #esfx = u16,s8, u32, s16, u32, s32, u64, s64, f32, f64#
19+
* #size = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64#
1920
* #expand_sup= 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
2021
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
2122
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -232,6 +233,15 @@ err:
232233
/**end repeat1**/
233234
#endif // @ncont_sup@
234235

236+
/****************************
237+
* Lookup tables
238+
****************************/
239+
#if @size@ == 32
240+
SIMD_IMPL_INTRIN_2(lut32_@sfx@, v@sfx@, q@sfx@, vu@size@)
241+
#endif
242+
#if @size@ == 64
243+
SIMD_IMPL_INTRIN_2(lut16_@sfx@, v@sfx@, q@sfx@, vu@size@)
244+
#endif
235245
/***************************
236246
* Misc
237247
***************************/
@@ -470,8 +480,9 @@ static PyMethodDef simd__intrinsics_methods[] = {
470480
/**begin repeat
471481
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
472482
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
473-
* #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64#
474-
* #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
483+
* #esfx = u16,s8, u32, s16, u32, s32, u64, s64, f32, f64#
484+
* #size = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64#
485+
* #expand_sup= 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
475486
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
476487
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
477488
* #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
@@ -509,6 +520,15 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
509520
/**end repeat1**/
510521
#endif // ncont_sup
511522

523+
/****************************
524+
* Lookup tables
525+
****************************/
526+
#if @size@ == 32
527+
SIMD_INTRIN_DEF(lut32_@sfx@)
528+
#endif
529+
#if @size@ == 64
530+
SIMD_INTRIN_DEF(lut16_@sfx@)
531+
#endif
512532
/***************************
513533
* Misc
514534
***************************/

numpy/core/src/common/simd/avx2/memory.h

Lines changed: 21 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -353,4 +353,25 @@ NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32)
353353
NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64)
354354
NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64)
355355

356+
/*********************************
357+
* Lookup tables
358+
*********************************/
359+
// uses vector as indexes into a table
360+
// that contains 32 elements of float32.
361+
NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx)
362+
{ return _mm256_i32gather_ps(table, idx, 4); }
363+
NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx)
364+
{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); }
365+
NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx)
366+
{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); }
367+
368+
// uses vector as indexes into a table
369+
// that contains 16 elements of float64.
370+
NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx)
371+
{ return _mm256_i64gather_pd(table, idx, 8); }
372+
NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx)
373+
{ return npyv_reinterpret_u64_f64(npyv_lut16_f64((const double*)table, idx)); }
374+
NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx)
375+
{ return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); }
376+
356377
#endif // _NPY_SIMD_AVX2_MEMORY_H

numpy/core/src/common/simd/avx512/memory.h

Lines changed: 29 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -329,4 +329,33 @@ NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32)
329329
NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64)
330330
NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64)
331331

332+
/**************************************************
333+
* Lookup table
334+
*************************************************/
335+
// uses vector as indexes into a table
336+
// that contains 32 elements of float32.
337+
NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx)
338+
{
339+
const npyv_f32 table0 = npyv_load_f32(table);
340+
const npyv_f32 table1 = npyv_load_f32(table + 16);
341+
return _mm512_permutex2var_ps(table0, idx, table1);
342+
}
343+
NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx)
344+
{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); }
345+
NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx)
346+
{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); }
347+
348+
// uses vector as indexes into a table
349+
// that contains 16 elements of float64.
350+
NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx)
351+
{
352+
const npyv_f64 table0 = npyv_load_f64(table);
353+
const npyv_f64 table1 = npyv_load_f64(table + 8);
354+
return _mm512_permutex2var_pd(table0, idx, table1);
355+
}
356+
NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx)
357+
{ return npyv_reinterpret_u64_f64(npyv_lut16_f64((const double*)table, idx)); }
358+
NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx)
359+
{ return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); }
360+
332361
#endif // _NPY_SIMD_AVX512_MEMORY_H

numpy/core/src/common/simd/neon/memory.h

Lines changed: 40 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -332,5 +332,45 @@ NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64)
332332
#if NPY_SIMD_F64
333333
NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64)
334334
#endif
335+
/*********************************
336+
* Lookup table
337+
*********************************/
338+
// uses vector as indexes into a table
339+
// that contains 32 elements of uint32.
340+
NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx)
341+
{
342+
const unsigned i0 = vgetq_lane_u32(idx, 0);
343+
const unsigned i1 = vgetq_lane_u32(idx, 1);
344+
const unsigned i2 = vgetq_lane_u32(idx, 2);
345+
const unsigned i3 = vgetq_lane_u32(idx, 3);
346+
347+
uint32x2_t low = vcreate_u32(table[i0]);
348+
low = vld1_lane_u32((const uint32_t*)table + i1, low, 1);
349+
uint32x2_t high = vcreate_u32(table[i2]);
350+
high = vld1_lane_u32((const uint32_t*)table + i3, high, 1);
351+
return vcombine_u32(low, high);
352+
}
353+
NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx)
354+
{ return npyv_reinterpret_s32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); }
355+
NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx)
356+
{ return npyv_reinterpret_f32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); }
357+
358+
// uses vector as indexes into a table
359+
// that contains 16 elements of uint64.
360+
NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx)
361+
{
362+
const unsigned i0 = vgetq_lane_u32(vreinterpretq_u32_u64(idx), 0);
363+
const unsigned i1 = vgetq_lane_u32(vreinterpretq_u32_u64(idx), 2);
364+
return vcombine_u64(
365+
vld1_u64((const uint64_t*)table + i0),
366+
vld1_u64((const uint64_t*)table + i1)
367+
);
368+
}
369+
NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx)
370+
{ return npyv_reinterpret_s64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); }
371+
#if NPY_SIMD_F64
372+
NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx)
373+
{ return npyv_reinterpret_f64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); }
374+
#endif
335375

336376
#endif // _NPY_SIMD_NEON_MEMORY_H

numpy/core/src/common/simd/sse/memory.h

Lines changed: 41 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -495,4 +495,45 @@ NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32)
495495
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64)
496496
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64)
497497

498+
/*********************************
499+
* Lookup table
500+
*********************************/
501+
// uses vector as indexes into a table
502+
// that contains 32 elements of float32.
503+
NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx)
504+
{
505+
const int i0 = _mm_cvtsi128_si32(idx);
506+
#ifdef NPY_HAVE_SSE41
507+
const int i1 = _mm_extract_epi32(idx, 1);
508+
const int i2 = _mm_extract_epi32(idx, 2);
509+
const int i3 = _mm_extract_epi32(idx, 3);
510+
#else
511+
const int i1 = _mm_extract_epi16(idx, 2);
512+
const int i2 = _mm_extract_epi16(idx, 4);
513+
const int i3 = _mm_extract_epi16(idx, 6);
514+
#endif
515+
return npyv_set_f32(table[i0], table[i1], table[i2], table[i3]);
516+
}
517+
NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx)
518+
{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); }
519+
NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx)
520+
{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); }
521+
522+
// uses vector as indexes into a table
523+
// that contains 16 elements of float64.
524+
NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx)
525+
{
526+
const int i0 = _mm_cvtsi128_si32(idx);
527+
#ifdef NPY_HAVE_SSE41
528+
const int i1 = _mm_extract_epi32(idx, 2);
529+
#else
530+
const int i1 = _mm_extract_epi16(idx, 4);
531+
#endif
532+
return npyv_set_f64(table[i0], table[i1]);
533+
}
534+
NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx)
535+
{ return npyv_reinterpret_u64_f64(npyv_lut16_f64((const double*)table, idx)); }
536+
NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx)
537+
{ return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); }
538+
498539
#endif // _NPY_SIMD_SSE_MEMORY_H

numpy/core/src/common/simd/vsx/memory.h

Lines changed: 37 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -343,4 +343,41 @@ NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f32, s32)
343343
NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u64, s64)
344344
NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f64, s64)
345345

346+
/*********************************
347+
* Lookup table
348+
*********************************/
349+
// uses vector as indexes into a table
350+
// that contains 32 elements of float32.
351+
NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx)
352+
{
353+
const unsigned i0 = vec_extract(idx, 0);
354+
const unsigned i1 = vec_extract(idx, 1);
355+
const unsigned i2 = vec_extract(idx, 2);
356+
const unsigned i3 = vec_extract(idx, 3);
357+
npyv_f32 r = vec_promote(table[i0], 0);
358+
r = vec_insert(table[i1], r, 1);
359+
r = vec_insert(table[i2], r, 2);
360+
r = vec_insert(table[i3], r, 3);
361+
return r;
362+
}
363+
NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx)
364+
{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); }
365+
NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx)
366+
{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); }
367+
368+
// uses vector as indexes into a table
369+
// that contains 16 elements of float64.
370+
NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx)
371+
{
372+
const unsigned i0 = vec_extract((npyv_u32)idx, 0);
373+
const unsigned i1 = vec_extract((npyv_u32)idx, 2);
374+
npyv_f64 r = vec_promote(table[i0], 0);
375+
r = vec_insert(table[i1], r, 1);
376+
return r;
377+
}
378+
NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx)
379+
{ return npyv_reinterpret_u64_f64(npyv_lut16_f64((const double*)table, idx)); }
380+
NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx)
381+
{ return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); }
382+
346383
#endif // _NPY_SIMD_VSX_MEMORY_H

numpy/core/src/umath/loops.h.src

Lines changed: 18 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -209,6 +209,24 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
209209
/**end repeat1**/
210210
/**end repeat**/
211211

212+
#ifndef NPY_DISABLE_OPTIMIZATION
213+
#include "loops_hyperbolic.dispatch.h"
214+
#endif
215+
/**begin repeat
216+
* #TYPE = FLOAT, DOUBLE#
217+
*/
218+
/**begin repeat1
219+
* #func = tanh#
220+
*/
221+
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
222+
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
223+
/**end repeat1**/
224+
/**end repeat**/
225+
226+
/**end repeat1**/
227+
/**end repeat**/
228+
229+
// SVML
212230
#ifndef NPY_DISABLE_OPTIMIZATION
213231
#include "loops_umath_fp.dispatch.h"
214232
#endif

0 commit comments

Comments (0)