Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Portability fix-ups and generic chunk copy implementation. #51

Merged
merged 2 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 39 additions & 25 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ endif()
# Build configuration options (all default to OFF).
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
option(BUILD_EXAMPLES "Build examples" OFF)
option(SKIP_CPUID_CHECK "Assume CPU supports fast CRC" OFF)
option(BUILD_GENERIC_CODE "Avoid architecture-specific code paths" OFF)
option(FORCE_CHUNK_COPY "Force chunk-copy optimization" OFF)
option(FORCE_UNALIGNED_READ_64LE "Force unaligned 64-bit read optimization" OFF)

if(SKIP_CPUID_CHECK)
add_definitions(-DSKIP_CPUID_CHECK)
Expand Down Expand Up @@ -67,32 +70,36 @@ endif()
# Compiler dependent flags
include (CheckCCompilerFlag)
if(UNIX OR MINGW)
check_c_compiler_flag(-march=armv8-a+crc ARM_CRC)
if(ARM_CRC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc")
if(BUILD_GENERIC_CODE)
message(STATUS "Skipping target feature checks")
else()
check_c_compiler_flag(-msse2 HAS_SSE2)
if(HAS_SSE2)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
add_definitions(-DHAS_SSE2)
endif()

check_c_compiler_flag(-mssse3 HAS_SSSE3)
if(HAS_SSSE3)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mssse3")
add_definitions(-DHAS_SSSE3)
endif()

check_c_compiler_flag(-msse4.2 HAS_SSE42)
if(HAS_SSE42)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
add_definitions(-DHAS_SSE42)
endif()

check_c_compiler_flag(-mpclmul HAS_PCLMUL)
if(HAS_PCLMUL)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul")
add_definitions(-DHAS_PCLMUL)
check_c_compiler_flag(-march=armv8-a+crc ARM_CRC)
if(ARM_CRC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc")
else()
check_c_compiler_flag(-msse2 HAS_SSE2)
if(HAS_SSE2)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
add_definitions(-DHAS_SSE2)
endif()

check_c_compiler_flag(-mssse3 HAS_SSSE3)
if(HAS_SSSE3)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mssse3")
add_definitions(-DHAS_SSSE3)
endif()

check_c_compiler_flag(-msse4.2 HAS_SSE42)
if(HAS_SSE42)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
add_definitions(-DHAS_SSE42)
endif()

check_c_compiler_flag(-mpclmul HAS_PCLMUL)
if(HAS_PCLMUL)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul")
add_definitions(-DHAS_PCLMUL)
endif()
endif()
endif()
elseif(MSVC)
Expand Down Expand Up @@ -144,6 +151,13 @@ set(ZLIB_SRCS
)

if(UNIX OR MINGW)
# Opt-in generic chunk-copy path: compile inffast_chunk.c and select the
# portable (non-SIMD) implementations in chunkcopy.h.
if(FORCE_CHUNK_COPY)
list(APPEND ZLIB_SRCS inffast_chunk.c)
add_definitions(-DINFLATE_CHUNK_GENERIC)
endif()
# For targets known to tolerate unaligned access: enable 64-bit
# little-endian reads in the inflate fast path.
if(FORCE_UNALIGNED_READ_64LE)
add_definitions(-DINFLATE_CHUNK_READ_64LE)
endif()
# append "inffast_chunk.c" and "adler32_simd.c" for ARMv8 CPU
if(ARM_CRC)
list(APPEND ZLIB_SRCS inffast_chunk.c adler32_simd.c)
Expand Down
75 changes: 74 additions & 1 deletion chunkcopy.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,10 @@

#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#define Z_BUILTIN_MEMSET __builtin_memset
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#define Z_BUILTIN_MEMSET zmemset
#endif

#if defined(INFLATE_CHUNK_SIMD_NEON)
Expand All @@ -54,7 +56,7 @@ typedef uint8x16_t z_vec128i_t;
#include <emmintrin.h>
typedef __m128i z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
typedef struct { uint8_t x[16]; } z_vec128i_t;
#endif

/*
Expand Down Expand Up @@ -271,6 +273,77 @@ static inline z_vec128i_t v_load8_dup(const void* src) {
static inline void v_store_128(void* out, const z_vec128i_t vec) {
_mm_storeu_si128((__m128i*)out, vec);
}
#else
/*
* Default implementations for chunk-copy functions rely on memcpy() being
* inlined by the compiler for best performance. This is most likely to work
* as expected when the length argument is constant (as is the case here) and
* the target supports unaligned loads and stores. Since that's not always a
* safe assumption, this may need extra compiler arguments such as
* `-mno-strict-align` or `-munaligned-access`, or the availability of
* extensions like SIMD.
*/

/*
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
* every 64-bit component of the 128-bit result (64-bit int splat).
*/
static inline z_vec128i_t v_load64_dup(const void* src) {
int64_t in;
Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
z_vec128i_t out;
for (int i = 0; i < sizeof(out); i += sizeof(in)) {
Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
}
return out;
}

/*
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
* every 32-bit component of the 128-bit result (32-bit int splat).
*/
static inline z_vec128i_t v_load32_dup(const void* src) {
int32_t in;
Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
z_vec128i_t out;
for (int i = 0; i < sizeof(out); i += sizeof(in)) {
Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
}
return out;
}

/*
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
* every 16-bit component of the 128-bit result (16-bit int splat).
*/
static inline z_vec128i_t v_load16_dup(const void* src) {
int16_t in;
Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
z_vec128i_t out;
for (int i = 0; i < sizeof(out); i += sizeof(in)) {
Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
}
return out;
}

/*
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
* component of the 128-bit result (8-bit int splat).
*/
static inline z_vec128i_t v_load8_dup(const void* src) {
  /* Splat a single byte across all 16 lanes of the result. memset's value
   * argument is converted to unsigned char, so passing the byte directly
   * stores exactly the byte at *src. */
  z_vec128i_t vec;
  Z_BUILTIN_MEMSET(&vec, *(const uint8_t*)src, sizeof(vec));
  return vec;
}

/*
* v_store_128(): store the 128-bit vec in a memory destination (that might
* not be 16-byte aligned) void* out.
*/
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  /* memcpy with a constant 16-byte size: compilers lower this to an
   * unaligned 128-bit store where the target supports one. */
  Z_BUILTIN_MEMCPY(out, &vec, sizeof(vec));
}
#endif

/*
Expand Down
13 changes: 10 additions & 3 deletions deflate.c
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ static uint32_t hash_func(deflate_state *s, void* str) {
return __crc32cw(0, *(uint32_t*)str) & s->hash_mask;
}

#elif defined __x86_64__ || defined _M_AMD64
#elif defined HAS_SSE42

#include <immintrin.h>
static uint32_t hash_func(deflate_state *s, void* str) {
Expand All @@ -146,7 +146,14 @@ static uint32_t hash_func(deflate_state *s, void* str) {

#else

#error "Only 64-bit Intel and ARM architectures are supported"
static uint32_t hash_func(deflate_state *s, void* str) {
  uint32_t h;
  /* memcpy handles a potentially unaligned 32-bit read of the input. */
  zmemcpy(&h, str, sizeof(h));
  /* Generic multiply-xor mixer; the constant is borrowed from xxhash. */
  h *= 0x85ebca77u;
  h ^= h >> 19;
  return h & s->hash_mask;
}

#endif

Expand Down Expand Up @@ -1329,7 +1336,7 @@ static void fill_window(deflate_state *s)
q+=8;
}

#elif defined __x86_64__ || defined _M_AMD64
#elif defined HAS_SSE2

__m128i W;
__m128i *q;
Expand Down
10 changes: 5 additions & 5 deletions inflate.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
#include "inftrees.h"
#include "inflate.h"

#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
#include "inffast_chunk.h"
#include "chunkcopy.h"
#else
Expand Down Expand Up @@ -390,7 +390,7 @@ static int updatewindow(z_streamp strm, const Bytef *end, unsigned copy) {

/* if it hasn't been done already, allocate space for the window */
if (state->window == Z_NULL) {
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
unsigned wsize = 1U << state->wbits;
state->window = (unsigned char FAR *)
ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE,
Expand Down Expand Up @@ -1061,7 +1061,7 @@ int ZEXPORT inflate(z_streamp strm, int flush) {
if (have >= INFLATE_FAST_MIN_INPUT &&
left >= INFLATE_FAST_MIN_OUTPUT) {
RESTORE();
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
inflate_fast_chunk_(strm, out);
#else
inflate_fast(strm, out);
Expand Down Expand Up @@ -1200,7 +1200,7 @@ int ZEXPORT inflate(z_streamp strm, int flush) {
else
from = state->window + (state->wnext - copy);
if (copy > state->length) copy = state->length;
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
if (copy > left) copy = left;
put = chunkcopy_safe(put, from, copy, put + left);
}
Expand Down Expand Up @@ -1292,7 +1292,7 @@ int ZEXPORT inflate(z_streamp strm, int flush) {
Note: a memory error from inflate() is non-recoverable.
*/
inf_leave:
#if defined(ZLIB_DEBUG) && (defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2))
#if defined(ZLIB_DEBUG) && (defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC))
/* XXX(cavalcantii): I put this in place back in 2017 to help debug faulty
* client code relying on undefined behavior when chunk_copy first landed.
*
Expand Down
31 changes: 16 additions & 15 deletions trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

#ifdef ZLIB_DEBUG
# include <ctype.h>
# include <inttypes.h>
#endif

/* ===========================================================================
Expand Down Expand Up @@ -178,7 +179,7 @@ static void gen_trees_header OF(void);
static void send_bits(deflate_state* s, uint64_t val, int len)
{
#ifdef ZLIB_DEBUG
Tracevv((stderr," l %2d v %4llx ", len, val));
Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
Assert(len > 0 && len <= 64, "invalid length");
s->bits_sent += len;
#endif
Expand Down Expand Up @@ -761,7 +762,7 @@ static int build_bl_tree(deflate_state *s) {
}
/* Update opt_len to include the bit length tree and counts */
s->opt_len += 3*(max_blindex+1) + 5+5+4;
Tracev((stderr, "\ndyn trees: dyn %lld, stat %lld",
Tracev((stderr, "\ndyn trees: dyn %" PRId64 ", stat %" PRId64,
s->opt_len, s->static_len));

return max_blindex;
Expand All @@ -787,13 +788,13 @@ static void send_all_trees(deflate_state *s, int lcodes, int dcodes,
Tracev((stderr, "\nbl code %2d ", bl_order[rank]));
send_bits(s, s->bl_tree[bl_order[rank]].Len, 3);
}
Tracev((stderr, "\nbl tree: sent %lld", s->bits_sent));
Tracev((stderr, "\nbl tree: sent %" PRId64, s->bits_sent));

send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */
Tracev((stderr, "\nlit tree: sent %lld", s->bits_sent));
Tracev((stderr, "\nlit tree: sent %" PRId64, s->bits_sent));

send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */
Tracev((stderr, "\ndist tree: sent %lld", s->bits_sent));
Tracev((stderr, "\ndist tree: sent %" PRId64, s->bits_sent));
}

/* ===========================================================================
Expand Down Expand Up @@ -848,11 +849,11 @@ void ZLIB_INTERNAL _tr_flush_block(deflate_state *s, uint8_t *buf, /* input bloc

/* Construct the literal and distance trees */
build_tree(s, (tree_desc *)(&(s->l_desc)));
Tracev((stderr, "\nlit data: dyn %lld, stat %lld", s->opt_len,
Tracev((stderr, "\nlit data: dyn %" PRId64 ", stat %" PRId64, s->opt_len,
s->static_len));

build_tree(s, (tree_desc *)(&(s->d_desc)));
Tracev((stderr, "\ndist data: dyn %lld, stat %lld", s->opt_len,
Tracev((stderr, "\ndist data: dyn %" PRId64 ", stat %" PRId64, s->opt_len,
s->static_len));
/* At this point, opt_len and static_len are the total bit lengths of
* the compressed block data, excluding the tree representations.
Expand All @@ -867,7 +868,7 @@ void ZLIB_INTERNAL _tr_flush_block(deflate_state *s, uint8_t *buf, /* input bloc
opt_lenb = (s->opt_len+3+7)>>3;
static_lenb = (s->static_len+3+7)>>3;

Tracev((stderr, "\nopt %llu(%llu) stat %llu(%llu) stored %llu lit %u ",
Tracev((stderr, "\nopt %" PRIu64 "(%" PRIu64 ") stat %" PRIu64 "(%" PRIu64 ") stored %" PRIu64 " lit %u ",
opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
s->sym_next / 3));

Expand Down Expand Up @@ -921,7 +922,7 @@ void ZLIB_INTERNAL _tr_flush_block(deflate_state *s, uint8_t *buf, /* input bloc
s->compressed_len += 7; /* align on byte boundary */
#endif
}
Tracev((stderr,"\ncomprlen %llu(%llu) ", s->compressed_len>>3,
Tracev((stderr,"\ncomprlen %" PRIu64 "(%" PRIu64 ") ", s->compressed_len>>3,
s->compressed_len-7*last));
}

Expand Down Expand Up @@ -976,7 +977,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
uint64_t val = ltree[lc].Code;
int len = ltree[lc].Len;
#ifdef ZLIB_DEBUG
Tracevv((stderr," l %2d v %4llx ", len, val));
Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
Assert(len > 0 && len <= 64, "invalid length");
s->bits_sent += len;
#endif
Expand All @@ -1000,7 +1001,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
val = ltree[code+LITERALS+1].Code;
len = ltree[code+LITERALS+1].Len;
#ifdef ZLIB_DEBUG
Tracevv((stderr," l %2d v %4llx ", len, val));
Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
Assert(len > 0 && len <= 64, "invalid length");
s->bits_sent += len;
#endif
Expand All @@ -1019,7 +1020,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
val = lc;
len = extra;
#ifdef ZLIB_DEBUG
Tracevv((stderr," l %2d v %4llx ", len, val));
Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
Assert(len > 0 && len <= 64, "invalid length");
s->bits_sent += len;
#endif
Expand All @@ -1039,7 +1040,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
val = dtree[code].Code;
len = dtree[code].Len;
#ifdef ZLIB_DEBUG
Tracevv((stderr," l %2d v %4llx ", len, val));
Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
Assert(len > 0 && len <= 64, "invalid length");
s->bits_sent += len;
#endif
Expand All @@ -1061,7 +1062,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
bit_buf ^= (val << filled);
filled += len;
#ifdef ZLIB_DEBUG
Tracevv((stderr," l %2d v %4llx ", len, val));
Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
Assert(len > 0 && len <= 64, "invalid length");
s->bits_sent += len;
#endif
Expand All @@ -1083,7 +1084,7 @@ static void compress_block(deflate_state *s, const ct_data *ltree, /* literal tr
len = ltree[END_BLOCK].Len;

#ifdef ZLIB_DEBUG
Tracevv((stderr," l %2d v %4llx ", len, val));
Tracevv((stderr," l %2d v %4" PRIx64 " ", len, val));
Assert(len > 0 && len <= 64, "invalid length");
s->bits_sent += len;
#endif
Expand Down