Skip to content

Commit 00a3168

Browse files
committed
Add AVX512 version of compare256
Improve the speed of sub-16 byte matches by first using a 128-bit intrinsic, after that use only 512-bit intrinsics. This requires us to overlap on the last run, but this is cheaper than processing the tail using a 256-bit and then a 128-bit run. Change benchmark steps to avoid it hitting chunk boundaries of one or the other function as much, this gives more fair benchmarks.
1 parent cfd90c7 commit 00a3168

8 files changed

Lines changed: 133 additions & 3 deletions

File tree

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,6 +1047,8 @@ if(WITH_OPTIM)
10471047
add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
10481048
list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
10491049
add_feature_info(AVX512_CHUNKSET 1 "Support AVX512 optimized chunkset, using \"${AVX512FLAG}\"")
1050+
list(APPEND AVX512_SRCS ${ARCHDIR}/compare256_avx512.c)
1051+
add_feature_info(AVX512_COMPARE256 1 "Support AVX512 optimized compare256, using \"${AVX512FLAG}\"")
10501052
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
10511053
list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
10521054
set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")

arch/x86/Makefile.in

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ all: \
3636
chunkset_ssse3.o chunkset_ssse3.lo \
3737
chorba_sse2.o chorba_sse2.lo \
3838
compare256_avx2.o compare256_avx2.lo \
39+
compare256_avx512.o compare256_avx512.lo \
3940
compare256_sse2.o compare256_sse2.lo \
4041
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
4142
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
@@ -84,6 +85,12 @@ compare256_avx2.o:
8485
compare256_avx2.lo:
8586
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
8687

88+
compare256_avx512.o:
89+
$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
90+
91+
compare256_avx512.lo:
92+
$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx512.c
93+
8794
compare256_sse2.o:
8895
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
8996

arch/x86/compare256_avx512.c

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/* compare256_avx512.c -- AVX512 version of compare256
2+
* Copyright (C) 2025 Hans Kristian Rosbach
3+
* Based on AVX2 implementation by Mika T. Lindqvist
4+
* For conditions of distribution and use, see copyright notice in zlib.h
5+
*/
6+
7+
#include "zbuild.h"
8+
#include "zmemory.h"
9+
#include "deflate.h"
10+
#include "fallback_builtins.h"
11+
12+
#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
13+
14+
#include <immintrin.h>
15+
#ifdef _MSC_VER
16+
# include <nmmintrin.h>
17+
#endif
18+
19+
static inline uint32_t compare256_avx512_static(const uint8_t *src0, const uint8_t *src1) {
20+
__m512i zmm_src0_4, zmm_src1_4;
21+
__m512i zmm_src0_3, zmm_src1_3;
22+
__m512i zmm_src0_2, zmm_src1_2;
23+
__m512i zmm_src0_1, zmm_src1_1;
24+
__m128i xmm_src0_0, xmm_src1_0;
25+
uint64_t mask_1, mask_2, mask_3, mask_4;
26+
uint32_t mask_0;
27+
28+
// First do a 16byte round before increasing to 64bytes, this reduces the
29+
// penalty for the short matches, and those are usually the most common ones.
30+
// This requires us to overlap on the last round, giving a small penalty
31+
// on matches of 192+ bytes (Still faster than AVX2 though).
32+
33+
// 16 bytes
34+
xmm_src0_0 = _mm_loadu_si128((__m128i*)src0);
35+
xmm_src1_0 = _mm_loadu_si128((__m128i*)src1);
36+
mask_0 = (uint32_t)_mm_cmpeq_epu8_mask(xmm_src0_0, xmm_src1_0); // zero-extended to use __builtin_ctz
37+
if (mask_0 != 0x0000FFFF) {
38+
// There is potential for using __builtin_ctzg/__builtin_ctzs/_tzcnt_u16/__tzcnt_u16 here
39+
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask_0); /* Invert bits so identical = 0 */
40+
return match_byte;
41+
}
42+
43+
// 64 bytes
44+
zmm_src0_1 = _mm512_loadu_si512((__m512i*)(src0 + 16));
45+
zmm_src1_1 = _mm512_loadu_si512((__m512i*)(src1 + 16));
46+
mask_1 = _mm512_cmpeq_epu8_mask(zmm_src0_1, zmm_src1_1);
47+
if (mask_1 != 0xFFFFFFFFFFFFFFFF) {
48+
uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_1);
49+
return 16 + match_byte;
50+
}
51+
52+
// 64 bytes
53+
zmm_src0_2 = _mm512_loadu_si512((__m512i*)(src0 + 80));
54+
zmm_src1_2 = _mm512_loadu_si512((__m512i*)(src1 + 80));
55+
mask_2 = _mm512_cmpeq_epu8_mask(zmm_src0_2, zmm_src1_2);
56+
if (mask_2 != 0xFFFFFFFFFFFFFFFF) {
57+
uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_2);
58+
return 80 + match_byte;
59+
}
60+
61+
// 64 bytes
62+
zmm_src0_3 = _mm512_loadu_si512((__m512i*)(src0 + 144));
63+
zmm_src1_3 = _mm512_loadu_si512((__m512i*)(src1 + 144));
64+
mask_3 = _mm512_cmpeq_epu8_mask(zmm_src0_3, zmm_src1_3);
65+
if (mask_3 != 0xFFFFFFFFFFFFFFFF) {
66+
uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_3);
67+
return 144 + match_byte;
68+
}
69+
70+
// 64 bytes (overlaps the previous 16 bytes for fast tail processing)
71+
zmm_src0_4 = _mm512_loadu_si512((__m512i*)(src0 + 192));
72+
zmm_src1_4 = _mm512_loadu_si512((__m512i*)(src1 + 192));
73+
mask_4 = _mm512_cmpeq_epu8_mask(zmm_src0_4, zmm_src1_4);
74+
if (mask_4 != 0xFFFFFFFFFFFFFFFF) {
75+
uint32_t match_byte = (uint32_t)__builtin_ctzll(~mask_4);
76+
return 192 + match_byte;
77+
}
78+
79+
return 256;
80+
}
81+
82+
Z_INTERNAL uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1) {
83+
return compare256_avx512_static(src0, src1);
84+
}
85+
86+
#define LONGEST_MATCH longest_match_avx512
87+
#define COMPARE256 compare256_avx512_static
88+
89+
#include "match_tpl.h"
90+
91+
#define LONGEST_MATCH_SLOW
92+
#define LONGEST_MATCH longest_match_slow_avx512
93+
#define COMPARE256 compare256_avx512_static
94+
95+
#include "match_tpl.h"
96+
97+
#endif

arch/x86/x86_functions.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *s
6060
uint32_t chunksize_avx512(void);
6161
uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
6262
void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
63+
# ifdef HAVE_BUILTIN_CTZLL
64+
uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
65+
uint32_t longest_match_avx512(deflate_state *const s, Pos cur_match);
66+
uint32_t longest_match_slow_avx512(deflate_state *const s, Pos cur_match);
67+
# endif
6368
#endif
6469
#ifdef X86_AVX512VNNI
6570
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
@@ -169,6 +174,14 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
169174
# define native_chunksize chunksize_avx512
170175
# undef native_inflate_fast
171176
# define native_inflate_fast inflate_fast_avx512
177+
# ifdef HAVE_BUILTIN_CTZLL
178+
# undef native_compare256
179+
# define native_compare256 compare256_avx512
180+
# undef native_longest_match
181+
# define native_longest_match longest_match_avx512
182+
# undef native_longest_match_slow
183+
# define native_longest_match_slow longest_match_slow_avx512
184+
# endif
172185
// X86 - AVX512 (VNNI)
173186
# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
174187
# undef native_adler32

configure

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1694,8 +1694,8 @@ case "${ARCH}" in
16941694
if test ${HAVE_AVX512_INTRIN} -eq 1; then
16951695
CFLAGS="${CFLAGS} -DX86_AVX512"
16961696
SFLAGS="${SFLAGS} -DX86_AVX512"
1697-
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o chunkset_avx512.o"
1698-
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo chunkset_avx512.lo"
1697+
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o chunkset_avx512.o compare256_avx512.o"
1698+
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo chunkset_avx512.lo compare256_avx512.lo"
16991699
fi
17001700

17011701
check_mtune_cascadelake_compiler_flag

functable.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,11 @@ static void init_functable(void) {
139139
ft.chunkmemset_safe = &chunkmemset_safe_avx512;
140140
ft.chunksize = &chunksize_avx512;
141141
ft.inflate_fast = &inflate_fast_avx512;
142+
# ifdef HAVE_BUILTIN_CTZLL
143+
ft.compare256 = &compare256_avx512;
144+
ft.longest_match = &longest_match_avx512;
145+
ft.longest_match_slow = &longest_match_slow_avx512;
146+
# endif
142147
}
143148
#endif
144149
#ifdef X86_AVX512VNNI

test/benchmarks/benchmark_compare256.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class compare256: public benchmark::Fixture {
5959
} \
6060
Bench(state, fptr); \
6161
} \
62-
BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
62+
BENCHMARK_REGISTER_F(compare256, name)->Arg(1)->Arg(10)->Arg(40)->Arg(80)->Arg(100)->Arg(175)->Arg(256);
6363

6464
#ifdef DISABLE_RUNTIME_CPU_DETECTION
6565
BENCHMARK_COMPARE256(native, native_compare256, 1);
@@ -80,6 +80,9 @@ BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
8080
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
8181
BENCHMARK_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2);
8282
#endif
83+
#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
84+
BENCHMARK_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common);
85+
#endif
8386
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
8487
BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon);
8588
#endif

test/test_compare256.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ TEST_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2)
7979
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
8080
TEST_COMPARE256(avx2, compare256_avx2, test_cpu_features.x86.has_avx2)
8181
#endif
82+
#if defined(X86_AVX512) && defined(HAVE_BUILTIN_CTZLL)
83+
TEST_COMPARE256(avx512, compare256_avx512, test_cpu_features.x86.has_avx512_common)
84+
#endif
8285
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
8386
TEST_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon)
8487
#endif

0 commit comments

Comments
 (0)