Skip to content

Commit 28ab687

Browse files
committed
Enable AVX2 functions to be built with BMI2 instructions
While AVX2 and BMI2 are technically separate instruction set extensions, no CPU exists that supports AVX2 but lacks BMI2. Enabling BMI2 eliminates several flag stalls by providing flagless versions of shifts, and lets scalar code avoid clobbering and shuffling GPRs as much. There's usually a sizeable benefit to enabling it. Since we're building the AVX2 functions with BMI2, let's also make sure the CPU actually claims to support it (just to cover our bases).
1 parent 0ed5ac8 commit 28ab687

4 files changed

Lines changed: 9 additions & 5 deletions

File tree

arch/x86/Makefile.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ SUFFIX=
1010

1111
AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
1212
AVX512VNNIFLAG=-mavx512vnni -mbmi2
13-
AVX2FLAG=-mavx2
13+
AVX2FLAG=-mavx2 -mbmi2
1414
SSE2FLAG=-msse2
1515
SSSE3FLAG=-mssse3
1616
SSE42FLAG=-msse4.2

cmake/detect-intrinsics.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,12 @@ macro(check_avx2_intrinsics)
151151
if(NOT NATIVEFLAG)
152152
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
153153
if(CMAKE_HOST_UNIX OR APPLE)
154-
set(AVX2FLAG "-mavx2")
154+
set(AVX2FLAG "-mavx2 -mbmi2")
155155
else()
156156
set(AVX2FLAG "/arch:AVX2")
157157
endif()
158158
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
159-
set(AVX2FLAG "-mavx2")
159+
set(AVX2FLAG "-mavx2 -mbmi2")
160160
elseif(MSVC)
161161
set(AVX2FLAG "/arch:AVX2")
162162
endif()

configure

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ forcesse2=0
108108
# instruction scheduling unless you specify a reasonable -mtune= target
109109
avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2"
110110
avx512vnniflag="${avx512flag} -mavx512vnni"
111-
avx2flag="-mavx2"
111+
avx2flag="-mavx2 -mbmi2"
112112
sse2flag="-msse2"
113113
ssse3flag="-mssse3"
114114
sse42flag="-msse4.2"

functable.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,11 @@ static void init_functable(void) {
110110
#endif
111111
// X86 - AVX
112112
#ifdef X86_AVX2
113-
if (cf.x86.has_avx2) {
113+
/* BMI2 support is all but implicit with AVX2 but let's sanity check this just in case. Enabling BMI2 allows for
114+
* flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers
115+
* for the shift results as an operand, eliminating several register-register moves when the original value needs
116+
* to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */
117+
if (cf.x86.has_avx2 && cf.x86.has_bmi2) {
114118
ft.adler32 = &adler32_avx2;
115119
ft.adler32_fold_copy = &adler32_fold_copy_avx2;
116120
ft.chunkmemset_safe = &chunkmemset_safe_avx2;

0 commit comments

Comments
 (0)