Config-UCL

 view release on metacpan or  search on metacpan

libucl-0.8.1/src/mum.h  view on Meta::CPAN

  return _mum_bswap64 (v);
#else
#error "Unknown endianness"
#endif
}

/* Return V as loaded from memory, adjusted so that it has the value a
   little-endian load would have produced.  The byte swap is done only
   on big-endian targets and only when MUM_TARGET_INDEPENDENT_HASH is
   defined; in every other configuration V is returned unchanged.  */
static inline uint32_t
_mum_le32 (uint32_t v) {
#if !defined(MUM_TARGET_INDEPENDENT_HASH) || __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  /* Native byte order is used as is.  */
  uint32_t word = v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  uint32_t word = _mum_bswap32 (v);
#else
#error "Unknown endianness"
#endif
  return word;
}

/* Macro defining, as a power of two, how many times the innermost
   loop of _mum_hash_aligned will be unrolled by the compiler (although
   the compiler may still make its own decision).  Use only a constant
   here to help the compiler unroll the main loop.

   The macro value affects the resulting hash for strings longer than
   128 bits.  The unroll factor greatly affects the hashing speed.  We
   prefer the speed.  */
#ifndef _MUM_UNROLL_FACTOR_POWER
/* Target-specific defaults are chosen only when a target-independent
   hash has not been requested; otherwise every target uses 2.  */
#if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 3
#elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 4
#else
#define _MUM_UNROLL_FACTOR_POWER 2
#endif
#endif

/* Reject powers outside 1..4 at compile time.  The unrolled loop
   indexes _mum_primes with the unroll counter, so the factor must not
   exceed the number of primes available.  */
#if _MUM_UNROLL_FACTOR_POWER < 1
#error "too small unroll factor"
#elif _MUM_UNROLL_FACTOR_POWER > 4
#error "We have not enough primes for such unroll factor"
#endif

/* Number of 64-bit words consumed by one iteration of the main loop
   in _mum_hash_aligned.  */
#define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)

/* Mix LEN bytes starting at KEY into the hash state START and return
   the resulting state.  No final randomization is applied here.

   The data is consumed in three phases:
     1. whole blocks of _MUM_UNROLL_FACTOR 64-bit words, as long as
        more than one block of data remains;
     2. the remaining whole 64-bit words;
     3. a tail of up to 7 bytes, assembled into one 64-bit value with
        the first byte in the least significant position.

   KEY is read through 64-bit and 32-bit pointer casts.
   NOTE(review): the function name suggests the caller guarantees
   suitable alignment of KEY -- confirm against the callers.  */
static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
_mum_hash_aligned (uint64_t start, const void *key, size_t len) {
  const size_t block_bytes = _MUM_UNROLL_FACTOR * sizeof (uint64_t);
  const unsigned char *p = (const unsigned char *) key;
  const uint64_t *words;
  uint64_t state;
  uint64_t tail = 0;
  size_t nwords, w;

  state = _mum (start, _mum_block_start_prime);

  /* Phase 1: full blocks.  The inner loop could be vectorized once
     vector insns for 64x64->128-bit multiplication exist.  Its bound
     is kept a compile-time constant so the compiler can unroll it.  */
  for (; len > block_bytes; len -= block_bytes, p += block_bytes) {
    words = (const uint64_t *) p;
    for (w = 0; w < _MUM_UNROLL_FACTOR; w++)
      state ^= _mum (_mum_le (words[w]), _mum_primes[w]);
    /* The same primes are reused for the next block, so randomize
       the state between blocks.  */
    state = _mum (state, _mum_unroll_prime);
  }

  /* Phase 2: the whole words left over (at most _MUM_UNROLL_FACTOR).  */
  nwords = len / sizeof (uint64_t);
  words = (const uint64_t *) p;
  for (w = 0; w < nwords; w++)
    state ^= _mum (_mum_le (words[w]), _mum_primes[w]);
  p += nwords * sizeof (uint64_t);
  len -= nwords * sizeof (uint64_t);

  /* Phase 3: 0..7 trailing bytes.  Each case adds its highest byte
     and falls through to the shorter lengths.  Tails of 4 or more
     bytes take their low half from a single 32-bit load.  */
  switch (len) {
  case 7:
    tail |= (uint64_t) p[6] << 48;
    /* fallthrough */
  case 6:
    tail |= (uint64_t) p[5] << 40;
    /* fallthrough */
  case 5:
    tail |= (uint64_t) p[4] << 32;
    /* fallthrough */
  case 4:
    tail |= _mum_le32 (*(const uint32_t *) p);
    break;
  case 3:
    tail |= (uint64_t) p[2] << 16;
    /* fallthrough */
  case 2:
    tail |= (uint64_t) p[1] << 8;
    /* fallthrough */
  case 1:
    tail |= p[0];
    break;
  default:
    /* No tail bytes: the state is returned without a tail mix.  */
    return state;
  }
  return state ^ _mum (tail, _mum_tail_prime);
}

/* Final randomization of H: two successive rounds, each XOR-ing the
   current value with its _mum product by one of the two finishing
   primes.  */
static inline uint64_t
_mum_final (uint64_t h) {
  uint64_t mixed = h ^ _mum (h, _mum_finish_prime1);

  return mixed ^ _mum (mixed, _mum_finish_prime2);
}

#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)

/* Variant of the hash compiled for a Haswell target, so that the
   compiler may use the MULX insn instead of the generic x86-64 MULQ
   where possible.  Although on modern Intel processors MULQ takes 3
   cycles vs. 4 for MULX, MULX gives more freedom in insn scheduling
   because it uses fewer fixed registers.

   Hashes LEN bytes at KEY: the initial state is SEED + LEN, and the
   result is passed through the final randomization.  */
static inline uint64_t _MUM_TARGET("arch=haswell")
_mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
  uint64_t state = _mum_hash_aligned (seed + len, key, len);

  return _mum_final (state);
}



( run in 1.015 second using v1.01-cache-2.11-cpan-96521ef73a4 )