SSE/Neon path for MSVC x86 and ARM by cwoffenden · Pull Request #2680 · facebook/zstd · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

SSE/Neon path for MSVC x86 and ARM #2680

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions lib/common/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,4 +286,27 @@ void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
#endif

/**
* Compile-time detection of SSE2 support, either directly via Clang/GCC's <c>
* __SSE2__</c> macro, implicitly on x64 via MSVC's <c>_M_X64</c>, or on x86 by
* <c>_M_IX86_FP</c> reflecting MSVC's <c>/arch</c> flag.
*/
/* Auto-detect only when ZSTD_ARCH_SSE2 has not already been defined earlier
 * (the #ifndef guard leaves any pre-existing definition untouched). */
#ifndef ZSTD_ARCH_SSE2
/* Any one of: __SSE2__ defined, _M_X64 defined, or _M_IX86_FP defined with a value of at least 2. */
# if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
# define ZSTD_ARCH_SSE2
# endif
#endif /* ZSTD_ARCH_SSE2 */

/**
* Compile-time detection of ARM Neon support, either directly via the <c>
* __ARM_NEON</c> macro, as defined in the <em>ARM Architecture Reference
* Manual</em>, or implicitly for ARM64 targets (including <c>_M_ARM64</c> for
* MSVC).
*/
/* Auto-detect only when ZSTD_ARCH_NEON has not already been defined earlier
 * (the #ifndef guard leaves any pre-existing definition untouched). */
#ifndef ZSTD_ARCH_NEON
/* Any one of: __ARM_NEON, __aarch64__, or _M_ARM64 being defined enables the Neon path. */
# if defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64)
# define ZSTD_ARCH_NEON
# endif
#endif /* ZSTD_ARCH_NEON */

#endif /* ZSTD_COMPILER_H */
8 changes: 4 additions & 4 deletions lib/common/zstd_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
/*-*************************************
* Dependencies
***************************************/
#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON)
#include "compiler.h" /* ARM Neon detection (needs to go before the Neon include) */
#if !defined(ZSTD_NO_INTRINSICS) && defined(ZSTD_ARCH_NEON)
#include <arm_neon.h>
#endif
#include "compiler.h"
#include "mem.h"
#include "debug.h" /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
#include "error_private.h"
Expand Down Expand Up @@ -247,7 +247,7 @@ static UNUSED_ATTR const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
* Shared functions to include for inlining
*********************************************/
static void ZSTD_copy8(void* dst, const void* src) {
#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON)
#if !defined(ZSTD_NO_INTRINSICS) && defined(ZSTD_ARCH_NEON)
vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
#else
ZSTD_memcpy(dst, src, 8);
Expand All @@ -256,7 +256,7 @@ static void ZSTD_copy8(void* dst, const void* src) {

#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
static void ZSTD_copy16(void* dst, const void* src) {
#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON)
#if !defined(ZSTD_NO_INTRINSICS) && defined(ZSTD_ARCH_NEON)
vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
#else
ZSTD_memcpy(dst, src, 16);
Expand Down
3 changes: 2 additions & 1 deletion lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Dependencies
***************************************/
#include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */
#include "../common/compiler.h" /* ZSTD_ARCH_SSE2 */
#include "../common/cpu.h"
#include "../common/mem.h"
#include "hist.h" /* HIST_countFast_wksp */
Expand Down Expand Up @@ -222,7 +223,7 @@ static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_useR
/* Returns row matchfinder usage enum given an initial mode and cParams */
static ZSTD_useRowMatchFinderMode_e ZSTD_resolveRowMatchFinderMode(ZSTD_useRowMatchFinderMode_e mode,
const ZSTD_compressionParameters* const cParams) {
#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(_M_AMD64) || defined(__ARM_NEON))
#if !defined(ZSTD_NO_INTRINSICS) && (defined(ZSTD_ARCH_SSE2) || defined(ZSTD_ARCH_NEON))
int const kHasSIMD128 = 1;
#else
int const kHasSIMD128 = 0;
Expand Down
27 changes: 14 additions & 13 deletions lib/compress/zstd_lazy.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* You may select, at your option, one of the above-listed licenses.
*/

#include "../common/compiler.h" /* SSE2 and Neon support */
#include "zstd_compress_internal.h"
#include "zstd_lazy.h"

Expand Down Expand Up @@ -873,7 +874,7 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (

typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */

#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(_M_AMD64)) /* SIMD SSE version*/
#if !defined(ZSTD_NO_INTRINSICS) && defined(ZSTD_ARCH_SSE2) /* SIMD SSE version*/

#include <emmintrin.h>
typedef __m128i ZSTD_Vec128;
Expand Down Expand Up @@ -914,15 +915,15 @@ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
return v;
}

static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
static ZSTD_VecMask ZSTD_Vec256_cmpMask8(const ZSTD_Vec256* const x, const ZSTD_Vec256* const y) {
ZSTD_VecMask fstMask;
ZSTD_VecMask sndMask;
fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
fstMask = ZSTD_Vec128_cmpMask8(x->fst, y->fst);
sndMask = ZSTD_Vec128_cmpMask8(x->snd, y->snd);
return fstMask | (sndMask << 16);
}

#elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
#elif !defined(ZSTD_NO_INTRINSICS) && defined(ZSTD_ARCH_NEON) /* SIMD ARM NEON Version */

#include <arm_neon.h>
typedef uint8x16_t ZSTD_Vec128;
Expand Down Expand Up @@ -970,11 +971,11 @@ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
return v;
}

static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
static ZSTD_VecMask ZSTD_Vec256_cmpMask8(const ZSTD_Vec256* const x, const ZSTD_Vec256* const y) {
ZSTD_VecMask fstMask;
ZSTD_VecMask sndMask;
fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
fstMask = ZSTD_Vec128_cmpMask8(x->fst, y->fst);
sndMask = ZSTD_Vec128_cmpMask8(x->snd, y->snd);
return fstMask | (sndMask << 16);
}

Expand Down Expand Up @@ -1045,13 +1046,13 @@ static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
}

/* Compare x to y, byte by byte, generating a "matches" bitfield */
static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
static ZSTD_VecMask ZSTD_Vec256_cmpMask8(const ZSTD_Vec256* const x, const ZSTD_Vec256* const y) {
ZSTD_VecMask res = 0;
unsigned i = 0;
unsigned l = 0;
for (; i < VEC256_NB_SIZE_T; ++i) {
const size_t cmp1 = x.vec[i];
const size_t cmp2 = y.vec[i];
const size_t cmp1 = x->vec[i];
const size_t cmp2 = y->vec[i];
unsigned j = 0;
for (; j < sizeof(size_t); ++j, ++l) {
if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
Expand All @@ -1062,7 +1063,7 @@ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
return res;
}

#endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
#endif /* !defined(ZSTD_NO_INTRINSICS) && defined(ZSTD_ARCH_SSE2) */

/* ZSTD_VecMask_next():
* Starting from the LSB, returns the idx of the next non-zero bit.
Expand Down Expand Up @@ -1237,7 +1238,7 @@ ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, con
} else if (rowEntries == 32) {
ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
matches = ZSTD_Vec256_cmpMask8(&hashes, &expandedTags);
} else {
assert(0);
}
Expand Down
0