Compress-Stream-Zstd
view release on metacpan or search on metacpan
ext/zstd/lib/common/xxhash.h view on Meta::CPAN
# elif XXH_VECTOR == XXH_AVX512 /* avx512 */
# define XXH_ACC_ALIGN 64
# endif
#endif
#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
|| XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
# define XXH_SEC_ALIGN XXH_ACC_ALIGN
#else
# define XXH_SEC_ALIGN 8
#endif
/*
* UGLY HACK:
* GCC usually generates the best code with -O3 for xxHash.
*
* However, when targeting AVX2, it is overzealous in its unrolling resulting
* in code roughly 3/4 the speed of Clang.
*
* There are other issues, such as GCC splitting _mm256_loadu_si256 into
* _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
* only applies to Sandy and Ivy Bridge... which don't even support AVX2.
*
* That is why when compiling the AVX2 version, it is recommended to use either
* -O2 -mavx2 -march=haswell
* or
* -O2 -mavx2 -mno-avx256-split-unaligned-load
* for decent performance, or to use Clang instead.
*
* Fortunately, we can control the first one with a pragma that forces GCC into
* -O2, but the other one we can't control without "failed to inline always
* inline function due to target mismatch" warnings.
*/
#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
&& defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
&& defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
# pragma GCC push_options
# pragma GCC optimize("-O2")
#endif
#if XXH_VECTOR == XXH_NEON
/*
* NEON's setup for vmlal_u32 is a little more complicated than it is on
* SSE2, AVX2, and VSX.
*
* While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
*
* To do the same operation, the 128-bit 'Q' register needs to be split into
* two 64-bit 'D' registers, performing this operation::
*
* [ a | b ]
* | '---------. .--------' |
* | x |
* | .---------' '--------. |
* [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]
*
* Due to significant changes in aarch64, the fastest method for aarch64 is
* completely different than the fastest method for ARMv7-A.
*
* ARMv7-A treats D registers as unions overlaying Q registers, so modifying
* D11 will modify the high half of Q5. This is similar to how modifying AH
* will only affect bits 8-15 of AX on x86.
*
* VZIP takes two registers, and puts even lanes in one register and odd lanes
* in the other.
*
* On ARMv7-A, this strangely modifies both parameters in place instead of
* taking the usual 3-operand form.
*
* Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
* lower and upper halves of the Q register to end up with the high and low
* halves where we want - all in one instruction.
*
* vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
*
* Unfortunately we need inline assembly for this: Instructions modifying two
* registers at once is not possible in GCC or Clang's IR, and they have to
* create a copy.
*
* aarch64 requires a different approach.
*
* In order to make it easier to write a decent compiler for aarch64, many
* quirks were removed, such as conditional execution.
*
* NEON was also affected by this.
*
* aarch64 cannot access the high bits of a Q-form register, and writes to a
* D-form register zero the high bits, similar to how writes to W-form scalar
* registers (or DWORD registers on x86_64) work.
*
* The formerly free vget_high intrinsics now require a vext (with a few
* exceptions)
*
* Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
* of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
* operand.
*
* The equivalent of the VZIP.32 on the lower and upper halves would be this
* mess:
*
* ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
* zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }
* zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }
*
* Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
*
* shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
* xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
*
* This is available on ARMv7-A, but is less efficient than a single VZIP.32.
*/
/*!
* Function-like macro:
* void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
* {
* outLo = (uint32x2_t)(in & 0xFFFFFFFF);
* outHi = (uint32x2_t)(in >> 32);
* in = UNDEFINED;
* }
( run in 1.861 second using v1.01-cache-2.11-cpan-39bf76dae61 )