From 95e3512df4c1bef682bf2469f17cec07712d810c Mon Sep 17 00:00:00 2001
From: Simon Hosie
Date: Mon, 23 Oct 2023 15:55:59 -0700
Subject: [PATCH 1/2] Enable building for non-x86/arm targets.

Generic paths for chunk ops and the hash function. Add a CMake option to
avoid hardware features so that the generic paths can be tested on x86.
---
 CMakeLists.txt | 64 +++++++++++++++++++++++++-----------------
 chunkcopy.h    | 75 +++++++++++++++++++++++++++++++++++++++++++++++++-
 deflate.c      | 13 +++++++--
 inflate.c      | 10 +++----
 4 files changed, 128 insertions(+), 34 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d9ee9128..33ee139cb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,9 @@ endif()
 option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
 option(BUILD_EXAMPLES "Build examples" OFF)
 option(SKIP_CPUID_CHECK "Assume CPU supports fast CRC" OFF)
+option(BUILD_GENERIC_CODE "Avoid architecture-specific code paths" OFF)
+option(FORCE_CHUNK_COPY "Force chunk-copy optimization" OFF)
+option(FORCE_UNALIGNED_READ_64LE "Force unaligned 64-bit read optimization" OFF)
 
 if(SKIP_CPUID_CHECK)
   add_definitions(-DSKIP_CPUID_CHECK)
@@ -67,32 +70,36 @@ endif()
 # Compiler dependent flags
 include (CheckCCompilerFlag)
 if(UNIX OR MINGW)
-  check_c_compiler_flag(-march=armv8-a+crc ARM_CRC)
-  if(ARM_CRC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc")
+  if(BUILD_GENERIC_CODE)
+    message(STATUS "Skipping target feature checks")
   else()
-    check_c_compiler_flag(-msse2 HAS_SSE2)
-    if(HAS_SSE2)
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
-      add_definitions(-DHAS_SSE2)
-    endif()
-
-    check_c_compiler_flag(-mssse3 HAS_SSSE3)
-    if(HAS_SSSE3)
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mssse3")
-      add_definitions(-DHAS_SSSE3)
-    endif()
-
-    check_c_compiler_flag(-msse4.2 HAS_SSE42)
-    if(HAS_SSE42)
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
-      add_definitions(-DHAS_SSE42)
-    endif()
-
-    check_c_compiler_flag(-mpclmul HAS_PCLMUL)
-    if(HAS_PCLMUL)
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul")
-      add_definitions(-DHAS_PCLMUL)
+    check_c_compiler_flag(-march=armv8-a+crc ARM_CRC)
+    if(ARM_CRC)
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc")
+    else()
+      check_c_compiler_flag(-msse2 HAS_SSE2)
+      if(HAS_SSE2)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
+        add_definitions(-DHAS_SSE2)
+      endif()
+
+      check_c_compiler_flag(-mssse3 HAS_SSSE3)
+      if(HAS_SSSE3)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mssse3")
+        add_definitions(-DHAS_SSSE3)
+      endif()
+
+      check_c_compiler_flag(-msse4.2 HAS_SSE42)
+      if(HAS_SSE42)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
+        add_definitions(-DHAS_SSE42)
+      endif()
+
+      check_c_compiler_flag(-mpclmul HAS_PCLMUL)
+      if(HAS_PCLMUL)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul")
+        add_definitions(-DHAS_PCLMUL)
+      endif()
     endif()
   endif()
 elseif(MSVC)
@@ -144,6 +151,13 @@ set(ZLIB_SRCS
 )
 
 if(UNIX OR MINGW)
+  if(FORCE_CHUNK_COPY)
+    list(APPEND ZLIB_SRCS inffast_chunk.c)
+    add_definitions(-DINFLATE_CHUNK_GENERIC)
+  endif()
+  if(FORCE_UNALIGNED_READ_64LE)
+    add_definitions(-DINFLATE_CHUNK_READ_64LE)
+  endif()
   # append "inffast_chunk.c" and "adler32_simd.c" for ARMv8 CPU
   if(ARM_CRC)
     list(APPEND ZLIB_SRCS inffast_chunk.c adler32_simd.c)
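Note: with the three options above, the generic paths can be exercised on an
x86 host as the commit message describes. A minimal configuration sketch, not
part of the patch (the build directory name is arbitrary, and cmake -B needs
a reasonably recent CMake):

    cmake -B build -DBUILD_GENERIC_CODE=ON \
          -DFORCE_CHUNK_COPY=ON -DFORCE_UNALIGNED_READ_64LE=ON
    cmake --build build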
diff --git a/chunkcopy.h b/chunkcopy.h
index 8fd1a0d39..ce2242bcc 100644
--- a/chunkcopy.h
+++ b/chunkcopy.h
@@ -43,8 +43,10 @@
 
 #if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
 #define Z_BUILTIN_MEMCPY __builtin_memcpy
+#define Z_BUILTIN_MEMSET __builtin_memset
 #else
 #define Z_BUILTIN_MEMCPY zmemcpy
+#define Z_BUILTIN_MEMSET zmemset
 #endif
 
 #if defined(INFLATE_CHUNK_SIMD_NEON)
@@ -54,7 +56,7 @@ typedef uint8x16_t z_vec128i_t;
 #include <emmintrin.h>
 typedef __m128i z_vec128i_t;
 #else
-#error chunkcopy.h inflate chunk SIMD is not defined for your build target
+typedef struct { uint8_t x[16]; } z_vec128i_t;
 #endif
 
 /*
@@ -271,6 +273,77 @@ static inline z_vec128i_t v_load8_dup(const void* src) {
 static inline void v_store_128(void* out, const z_vec128i_t vec) {
   _mm_storeu_si128((__m128i*)out, vec);
 }
+#else
+/*
+ * Default implementations for chunk-copy functions rely on memcpy() being
+ * inlined by the compiler for best performance. This is most likely to work
+ * as expected when the length argument is constant (as is the case here) and
+ * the target supports unaligned loads and stores. Since that's not always a
+ * safe assumption, this may need extra compiler arguments such as
+ * `-mno-strict-align` or `-munaligned-access`, or the availability of
+ * extensions like SIMD.
+ */
+
+/*
+ * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
+ * every 64-bit component of the 128-bit result (64-bit int splat).
+ */
+static inline z_vec128i_t v_load64_dup(const void* src) {
+  int64_t in;
+  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
+  z_vec128i_t out;
+  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
+    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
+  }
+  return out;
+}
+
+/*
+ * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
+ * every 32-bit component of the 128-bit result (32-bit int splat).
+ */
+static inline z_vec128i_t v_load32_dup(const void* src) {
+  int32_t in;
+  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
+  z_vec128i_t out;
+  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
+    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
+  }
+  return out;
+}
+
+/*
+ * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
+ * every 16-bit component of the 128-bit result (16-bit int splat).
+ */
+static inline z_vec128i_t v_load16_dup(const void* src) {
+  int16_t in;
+  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
+  z_vec128i_t out;
+  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
+    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
+  }
+  return out;
+}
+
+/*
+ * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
+ * component of the 128-bit result (8-bit int splat).
+ */
+static inline z_vec128i_t v_load8_dup(const void* src) {
+  int8_t in = *(uint8_t const*)src;
+  z_vec128i_t out;
+  Z_BUILTIN_MEMSET(&out, in, sizeof(out));
+  return out;
+}
+
+/*
+ * v_store_128(): store the 128-bit vec in a memory destination (that might
+ * not be 16-byte aligned) void* out.
+ */
+static inline void v_store_128(void* out, const z_vec128i_t vec) {
+  Z_BUILTIN_MEMCPY(out, &vec, sizeof(vec));
+}
 #endif
 
 /*
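Note: the splat semantics of the generic loads can be sanity-checked with a
small host-side test. This is a sketch, not part of the patch; it assumes a
translation unit in which the generic branch of chunkcopy.h above is in
effect (INFLATE_CHUNK_GENERIC defined, neither NEON nor SSE2 path selected):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>
    #include "chunkcopy.h"

    int main(void) {
        uint32_t pattern = 0x04030201u;      /* any recognizable value */
        z_vec128i_t v = v_load32_dup(&pattern);
        uint32_t lanes[4];
        memcpy(lanes, &v, sizeof(lanes));    /* unpack the 16-byte struct */
        for (int i = 0; i < 4; i++)
            assert(lanes[i] == pattern);     /* all four 32-bit lanes match */
        return 0;
    }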
diff --git a/deflate.c b/deflate.c
index d861faf43..5547a6134 100644
--- a/deflate.c
+++ b/deflate.c
@@ -137,7 +137,7 @@ static uint32_t hash_func(deflate_state *s, void* str) {
     return __crc32cw(0, *(uint32_t*)str) & s->hash_mask;
 }
 
-#elif defined __x86_64__ || defined _M_AMD64
+#elif defined HAS_SSE42
 
 #include <smmintrin.h>
 
 static uint32_t hash_func(deflate_state *s, void* str) {
@@ -146,7 +146,14 @@ static uint32_t hash_func(deflate_state *s, void* str) {
 }
 
 #else
 
-#error "Only 64-bit Intel and ARM architectures are supported"
+static uint32_t hash_func(deflate_state *s, void* str) {
+    uint32_t w;
+    zmemcpy(&w, str, sizeof(w));
+    // generic multiply-xor hash, using some magic numbers from xxhash.
+    w *= 0x85ebca77u;
+    w ^= w >> 19;
+    return w & s->hash_mask;
+}
 
 #endif
@@ -1329,7 +1336,7 @@ static void fill_window(deflate_state *s)
         q+=8;
     }
 
-#elif defined __x86_64__ || defined _M_AMD64
+#elif defined HAS_SSE2
 
     __m128i W;
     __m128i *q;
diff --git a/inflate.c b/inflate.c
index 3896ce902..ef1d07846 100644
--- a/inflate.c
+++ b/inflate.c
@@ -84,7 +84,7 @@
 #include "inftrees.h"
 #include "inflate.h"
 
-#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
+#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
 #include "inffast_chunk.h"
 #include "chunkcopy.h"
 #else
@@ -390,7 +390,7 @@ static int updatewindow(z_streamp strm, const Bytef *end, unsigned copy) {
 
     /* if it hasn't been done already, allocate space for the window */
     if (state->window == Z_NULL) {
-#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
+#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
         unsigned wsize = 1U << state->wbits;
         state->window = (unsigned char FAR *)
                         ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE,
@@ -1061,7 +1061,7 @@ int ZEXPORT inflate(z_streamp strm, int flush) {
             if (have >= INFLATE_FAST_MIN_INPUT &&
                 left >= INFLATE_FAST_MIN_OUTPUT) {
                 RESTORE();
-#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
+#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
                 inflate_fast_chunk_(strm, out);
 #else
                 inflate_fast(strm, out);
@@ -1200,7 +1200,7 @@ int ZEXPORT inflate(z_streamp strm, int flush) {
                 else
                     from = state->window + (state->wnext - copy);
                 if (copy > state->length) copy = state->length;
-#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
+#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
                 if (copy > left) copy = left;
                 put = chunkcopy_safe(put, from, copy, put + left);
             }
@@ -1292,7 +1292,7 @@ int ZEXPORT inflate(z_streamp strm, int flush) {
        Note: a memory error from inflate() is non-recoverable.
      */
   inf_leave:
-#if defined(ZLIB_DEBUG) && (defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2))
+#if defined(ZLIB_DEBUG) && (defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC))
    /* XXX(cavalcantii): I put this in place back in 2017 to help debug faulty
     * client code relying on undefined behavior when chunk_copy first landed.
    *

From 3b73da82bf1e3a806e7c9519537af6e317b33cb7 Mon Sep 17 00:00:00 2001
From: Simon Hosie
Date: Wed, 23 Aug 2023 21:43:43 -0700
Subject: [PATCH 2/2] Use PRI*64 macros for trace formatting in trees.c.

Arguments which are uint64_t may need `%l` or `%ll` length modifiers
depending on the target, which can produce warnings even when the types
are the same size. Using the PRI*64 macros makes this portable.
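To make the warning concrete, here is a minimal illustration (not part of
the patch) of why a fixed %llx can warn while PRIx64 cannot:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
        uint64_t val = 0xabcu;
        /* "%llx" assumes unsigned long long; on LP64 targets uint64_t is
         * typically unsigned long, so -Wformat warns even though the two
         * types have the same size. PRIx64 expands to the right length
         * modifier ("lx" or "llx") for the target. */
        printf("v %4" PRIx64 "\n", val);
        return 0;
    }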
---
 trees.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/trees.c b/trees.c
index e15463794..6db319397 100644
--- a/trees.c
+++ b/trees.c
@@ -38,6 +38,7 @@
 
 #ifdef ZLIB_DEBUG
 #  include <ctype.h>
+#  include <inttypes.h>
 #endif
 
 /* ===========================================================================
@@ -178,7 +179,7 @@ static void gen_trees_header OF(void);
 
 static void send_bits(deflate_state* s, uint64_t val, int len) {
 #ifdef ZLIB_DEBUG
-    Tracevv((stderr," l %2d v %4llx ", len, val));
+    Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
     Assert(len > 0 && len <= 64, "invalid length");
    s->bits_sent += len;
 #endif
@@ -761,7 +762,7 @@ static int build_bl_tree(deflate_state *s) {
     }
     /* Update opt_len to include the bit length tree and counts */
     s->opt_len += 3*(max_blindex+1) + 5+5+4;
-    Tracev((stderr, "\ndyn trees: dyn %lld, stat %lld",
+    Tracev((stderr, "\ndyn trees: dyn %" PRId64 ", stat %" PRId64,
            s->opt_len, s->static_len));
 
     return max_blindex;
@@ -787,13 +788,13 @@ static void send_all_trees(deflate_state *s, int lcodes, int dcodes,
         Tracev((stderr, "\nbl code %2d ", bl_order[rank]));
         send_bits(s, s->bl_tree[bl_order[rank]].Len, 3);
     }
-    Tracev((stderr, "\nbl tree: sent %lld", s->bits_sent));
+    Tracev((stderr, "\nbl tree: sent %" PRId64, s->bits_sent));
 
     send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */
-    Tracev((stderr, "\nlit tree: sent %lld", s->bits_sent));
+    Tracev((stderr, "\nlit tree: sent %" PRId64, s->bits_sent));
 
     send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */
-    Tracev((stderr, "\ndist tree: sent %lld", s->bits_sent));
+    Tracev((stderr, "\ndist tree: sent %" PRId64, s->bits_sent));
 }
 
 /* ===========================================================================
@@ -848,11 +849,11 @@ void ZLIB_INTERNAL _tr_flush_block(deflate_state *s, uint8_t *buf, /* input bloc
 
     /* Construct the literal and distance trees */
     build_tree(s, (tree_desc *)(&(s->l_desc)));
-    Tracev((stderr, "\nlit data: dyn %lld, stat %lld", s->opt_len,
+    Tracev((stderr, "\nlit data: dyn %" PRId64 ", stat %" PRId64, s->opt_len,
            s->static_len));
 
     build_tree(s, (tree_desc *)(&(s->d_desc)));
-    Tracev((stderr, "\ndist data: dyn %lld, stat %lld", s->opt_len,
+    Tracev((stderr, "\ndist data: dyn %" PRId64 ", stat %" PRId64, s->opt_len,
           s->static_len));
    /* At this point, opt_len and static_len are the total bit lengths of
     * the compressed block data, excluding the tree representations.
@@ -867,7 +868,7 @@ void ZLIB_INTERNAL _tr_flush_block(deflate_state *s, uint8_t *buf, /* input bloc
 
     opt_lenb = (s->opt_len+3+7)>>3;
     static_lenb = (s->static_len+3+7)>>3;
-    Tracev((stderr, "\nopt %llu(%llu) stat %llu(%llu) stored %llu lit %u ",
+    Tracev((stderr, "\nopt %" PRIu64 "(%" PRIu64 ") stat %" PRIu64 "(%" PRIu64 ") stored %" PRIu64 " lit %u ",
            opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
            s->sym_next / 3));
 
@@ -921,7 +922,7 @@ void ZLIB_INTERNAL _tr_flush_block(deflate_state *s, uint8_t *buf, /* input bloc
         s->compressed_len += 7;  /* align on byte boundary */
 #endif
     }
-    Tracev((stderr,"\ncomprlen %llu(%llu) ", s->compressed_len>>3,
+    Tracev((stderr,"\ncomprlen %" PRIu64 "(%" PRIu64 ") ", s->compressed_len>>3,
            s->compressed_len-7*last));
 }
 
@@ -976,7 +977,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
         uint64_t val = ltree[lc].Code;
         int len = ltree[lc].Len;
 #ifdef ZLIB_DEBUG
-        Tracevv((stderr," l %2d v %4llx ", len, val));
+        Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
         Assert(len > 0 && len <= 64, "invalid length");
         s->bits_sent += len;
 #endif
@@ -1000,7 +1001,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
             val = ltree[code+LITERALS+1].Code;
             len = ltree[code+LITERALS+1].Len;
 #ifdef ZLIB_DEBUG
-            Tracevv((stderr," l %2d v %4llx ", len, val));
+            Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
             Assert(len > 0 && len <= 64, "invalid length");
             s->bits_sent += len;
 #endif
@@ -1019,7 +1020,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
                 val = lc;
                 len = extra;
 #ifdef ZLIB_DEBUG
-                Tracevv((stderr," l %2d v %4llx ", len, val));
+                Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
                 Assert(len > 0 && len <= 64, "invalid length");
                 s->bits_sent += len;
 #endif
@@ -1039,7 +1040,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
             val = dtree[code].Code;
             len = dtree[code].Len;
 #ifdef ZLIB_DEBUG
-            Tracevv((stderr," l %2d v %4llx ", len, val));
+            Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
             Assert(len > 0 && len <= 64, "invalid length");
             s->bits_sent += len;
 #endif
@@ -1061,7 +1062,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
         bit_buf ^= (val << filled);
         filled += len;
 #ifdef ZLIB_DEBUG
-        Tracevv((stderr," l %2d v %4llx ", len, val));
+        Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
         Assert(len > 0 && len <= 64, "invalid length");
         s->bits_sent += len;
 #endif
@@ -1083,7 +1084,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
 
     len = ltree[END_BLOCK].Len;
 #ifdef ZLIB_DEBUG
-    Tracevv((stderr," l %2d v %4llx ", len, val));
+    Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
     Assert(len > 0 && len <= 64, "invalid length");
     s->bits_sent += len;
 #endif