Config-UCL
view release on metacpan or search on metacpan
libucl-0.8.1/src/mum.h view on Meta::CPAN
return _mum_bswap64 (v);
#else
#error "Unknown endianness"
#endif
}
/* Return V interpreted as a little-endian 32-bit value.  V is
   returned unchanged unless MUM_TARGET_INDEPENDENT_HASH is defined
   and the target is not little-endian; on a big-endian target it is
   then passed through _mum_bswap32.  Any other byte order is a
   compile-time error.  */
static inline uint32_t
_mum_le32 (uint32_t v) {
  uint32_t le = v;
#if defined(MUM_TARGET_INDEPENDENT_HASH) && __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  le = _mum_bswap32 (v);
# else
#  error "Unknown endianness"
# endif
#endif
  return le;
}
/* Macro defining how many times the innermost loop in
   _mum_hash_aligned will be unrolled by the compiler (although the
   compiler may still make its own decision).  Use only a constant
   here to help the compiler unroll the main loop.

   The macro value affects the resulting hash for strings longer than
   128 bits.  The unroll factor greatly affects the hashing speed; we
   prefer speed.  */
#ifndef _MUM_UNROLL_FACTOR_POWER
/* Architecture-specific powers are selected only when
   MUM_TARGET_INDEPENDENT_HASH is not defined; otherwise every target
   gets the common default of 2.  */
#if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 3
#elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 4
#else
#define _MUM_UNROLL_FACTOR_POWER 2
#endif
#endif
/* Accept only powers in the range 1..4.  The upper bound exists
   because _mum_hash_aligned indexes _mum_primes with values up to
   _MUM_UNROLL_FACTOR - 1 (presumably the table holds 16 primes --
   confirm against its definition).  */
#if _MUM_UNROLL_FACTOR_POWER < 1
#error "too small unroll factor"
#elif _MUM_UNROLL_FACTOR_POWER > 4
#error "We have not enough primes for such unroll factor"
#endif
/* Number of 64-bit words consumed per iteration of the main loop.  */
#define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
/* Hash LEN bytes at KEY, starting from state START, and return the
   (not yet finalized) state.  KEY is read through 64-bit and 32-bit
   pointers, so the caller is expected to pass suitably aligned data
   or to run on a target that tolerates such accesses.

   The input is consumed in three phases:
     1. full blocks of _MUM_UNROLL_FACTOR 64-bit words;
     2. the remaining whole 64-bit words;
     3. a tail of 0..7 bytes assembled into one word in little-endian
        order.  */
static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
_mum_hash_aligned (uint64_t start, const void *key, size_t len) {
  const unsigned char *p = (const unsigned char *) key;
  const uint64_t *words;
  uint64_t h = _mum (start, _mum_block_start_prime);
  uint64_t tail = 0;
  size_t nwords;
  int k;

  /* Phase 1: full blocks.  A block is taken only while strictly more
     than one block of input remains, so an input of exactly one block
     is handled by phase 2.  */
  for (; len > _MUM_UNROLL_FACTOR * sizeof (uint64_t);
       len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t),
       p += _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
    /* This loop could be vectorized once vector insns for
       64x64->128-bit multiplication are available.  AVX2 currently
       only has a vector insn for four 32x32->64-bit
       multiplications.  */
    words = (const uint64_t *) p;
    for (k = 0; k < _MUM_UNROLL_FACTOR; k++)
      h ^= _mum (_mum_le (words[k]), _mum_primes[k]);
    /* The same primes are reused for the next block, so randomize
       the state in between.  */
    h = _mum (h, _mum_unroll_prime);
  }

  /* Phase 2: whole 64-bit words left over after the full blocks.  */
  nwords = len / sizeof (uint64_t);
  words = (const uint64_t *) p;
  for (k = 0; k < (int) nwords; k++)
    h ^= _mum (_mum_le (words[k]), _mum_primes[k]);
  p += nwords * sizeof (uint64_t);
  len -= nwords * sizeof (uint64_t);

  /* Phase 3: gather the trailing bytes, highest byte first, letting
     each case fall through to the lower ones.  Tails of 4..7 bytes
     take their low half from a single 32-bit load; shorter tails are
     built byte by byte.  */
  switch (len) {
  case 7:
    tail |= (uint64_t) p[6] << 48;
    /* FALLTHROUGH */
  case 6:
    tail |= (uint64_t) p[5] << 40;
    /* FALLTHROUGH */
  case 5:
    tail |= (uint64_t) p[4] << 32;
    /* FALLTHROUGH */
  case 4:
    tail |= _mum_le32 (*(const uint32_t *) p);
    break;
  case 3:
    tail |= (uint64_t) p[2] << 16;
    /* FALLTHROUGH */
  case 2:
    tail |= (uint64_t) p[1] << 8;
    /* FALLTHROUGH */
  case 1:
    tail |= p[0];
    break;
  default:
    /* No trailing bytes: nothing more to mix in.  */
    return h;
  }
  return h ^ _mum (tail, _mum_tail_prime);
}
/* Final randomization of H: two successive rounds, each XORing the
   current state with _mum of that state and a finishing prime.  */
static inline uint64_t
_mum_final (uint64_t h) {
  const uint64_t round1 = h ^ _mum (h, _mum_finish_prime1);
  return round1 ^ _mum (round1, _mum_finish_prime2);
}
#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
/* We want to use the MULX insn (part of the BMI2 extension, which
   arch=haswell enables alongside AVX2) instead of the generic x86-64
   MULQ where it is possible.  Although on modern Intel processors
   MULQ takes 3 cycles vs. 4 for MULX, MULX permits more freedom in
   insn scheduling as it uses fewer fixed registers. */
/* Hash LEN bytes at KEY with SEED, compiled for the Haswell target.
   The initial state passed to _mum_hash_aligned is SEED + LEN, and
   the result is then run through _mum_final.  */
static inline uint64_t _MUM_TARGET("arch=haswell")
_mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
  const uint64_t state = _mum_hash_aligned (seed + len, key, len);
  return _mum_final (state);
}
( run in 1.015 second using v1.01-cache-2.11-cpan-96521ef73a4 )