| |
SOFTWARE.
|
| |
*/
|
| |
|
| - |
/* This file implements MUM (MUltiply and Mix) hashing. We randomize
|
| - |
input data by 64x64-bit multiplication and mixing hi- and low-parts
|
| - |
of the multiplication result by using an addition and then mix it
|
| - |
into the current state. We use prime numbers randomly generated
|
| - |
with the equal probability of their bit values for the
|
| - |
multiplication. When all primes are used once, the state is
|
| - |
randomized and the same prime numbers are used again for data
|
| - |
randomization.
|
| - |
|
| - |
The MUM hashing passes all SMHasher tests. Pseudo Random Number
|
| - |
Generator based on MUM also passes NIST Statistical Test Suite for
|
| - |
Random and Pseudorandom Number Generators for Cryptographic
|
| - |
Applications (version 2.2.1) with 1000 bitstreams each containing
|
| - |
1M bits. MUM hashing is also faster Spooky64 and City64 on small
|
| - |
strings (at least upto 512-bit) on Haswell and Power7. The MUM bulk
|
| - |
speed (speed on very long data) is bigger than Spooky and City on
|
| - |
Power7. On Haswell the bulk speed is bigger than Spooky one and
|
| - |
close to City speed. */
|
| + |
/* This file implements MUM (MUltiply and Mix) hashing. We randomize input data by 64x64-bit
|
| + |
multiplication and mixing hi- and low-parts of the multiplication result by using an addition and
|
| + |
then mix it into the current state. We use prime numbers randomly generated with the equal
|
| + |
probability of their bit values for the multiplication. When all primes are used once, the state
|
| + |
is randomized and the same prime numbers are used again for data randomization.
|
| + |
|
| + |
The MUM hashing passes all SMHasher tests. Pseudo Random Number Generator based on MUM also
|
| + |
passes NIST Statistical Test Suite for Random and Pseudorandom Number Generators for
|
| + |
Cryptographic Applications (version 2.2.1) with 1000 bitstreams each containing 1M bits. MUM
|
| + |
hashing is also faster than Spooky64 and City64 on small strings (at least up to 512 bits) on Haswell
|
| + |
and Power7. The MUM bulk speed (speed on very long data) is bigger than Spooky and City on
|
| + |
Power7. On Haswell the bulk speed is higher than that of Spooky and close to that of City. */
|
| |
|
| |
#ifndef __MUM_HASH__
|
| |
#define __MUM_HASH__
|
| |
#endif
|
| |
|
| |
#ifdef __GNUC__
|
| - |
#define _MUM_ATTRIBUTE_UNUSED __attribute__((unused))
|
| - |
#define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
|
| - |
#define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
|
| + |
#define _MUM_ATTRIBUTE_UNUSED __attribute__ ((unused))
|
| + |
#define _MUM_INLINE inline __attribute__ ((always_inline))
|
| |
#else
|
| |
#define _MUM_ATTRIBUTE_UNUSED
|
| - |
#define _MUM_OPTIMIZE(opts)
|
| - |
#define _MUM_TARGET(opts)
|
| + |
#define _MUM_INLINE inline
|
| + |
#endif
|
| + |
|
| + |
#if defined(MUM_QUALITY) && !defined(MUM_TARGET_INDEPENDENT_HASH)
|
| + |
#define MUM_TARGET_INDEPENDENT_HASH
|
| |
#endif
|
| |
|
| - |
/* Macro saying to use 128-bit integers implemented by GCC for some
|
| - |
targets. */
|
| + |
/* Macro saying to use 128-bit integers implemented by GCC for some targets. */
|
| |
#ifndef _MUM_USE_INT128
|
| - |
/* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64.
|
| - |
HOST_WIDE_INT is long if HOST_BITS_PER_LONG > HOST_BITS_PER_INT,
|
| - |
otherwise int. */
|
| + |
/* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64. HOST_WIDE_INT is long if
|
| + |
HOST_BITS_PER_LONG > HOST_BITS_PER_INT, otherwise int. */
|
| |
#if defined(__GNUC__) && UINT_MAX != ULONG_MAX
|
| |
#define _MUM_USE_INT128 1
|
| |
#else
|
| |
static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
|
| |
static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
|
| |
static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;
|
| - |
|
| - |
static uint64_t _mum_primes [] = {
|
| + |
|
| + |
/* Table of sixteen 64-bit multipliers.  _mum_hash_aligned indexes it by the position of the
   64-bit word inside the current block.  It is deliberately not const: another routine in this
   file assigns new values into its elements.  */
static uint64_t _mum_primes[] = {
  0x9ebdcae10d981691ULL, 0x32b9b9b97a27ac7dULL,
  0x29b5584d83d35bbdULL, 0x4b04e0e61401255fULL,
  0x25e8f7b1f1c9d027ULL, 0x80d4c8c000f3e881ULL,
  0xbd1255431904b9ddULL, 0x8a3bd4485eee6d81ULL,
  0x3bc721b2aad05197ULL, 0x71b1a19b907d6e33ULL,
  0x525e6c1084a8534bULL, 0x9e4c2cd340c1299fULL,
  0xde3add92e94caa37ULL, 0x7e14eadb1f65311dULL,
  0x3f5aa40f89812853ULL, 0x33b15a3b587d15c9ULL,
};
|
| |
|
| - |
/* Multiply 64-bit V and P and return sum of high and low parts of the
|
| - |
result. */
|
| - |
static inline uint64_t
|
| - |
_mum (uint64_t v, uint64_t p) {
|
| + |
/* Multiply 64-bit V and P and return sum of high and low parts of the result. */
|
| + |
static _MUM_INLINE uint64_t _mum (uint64_t v, uint64_t p) {
|
| |
uint64_t hi, lo;
|
| |
#if _MUM_USE_INT128
|
| - |
#if defined(__aarch64__)
|
| - |
/* AARCH64 needs 2 insns to calculate 128-bit result of the
|
| - |
multiplication. If we use a generic code we actually call a
|
| - |
function doing 128x128->128 bit multiplication. The function is
|
| - |
very slow. */
|
| - |
lo = v * p;
|
| - |
asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (v), "r" (p));
|
| - |
#else
|
| |
__uint128_t r = (__uint128_t) v * (__uint128_t) p;
|
| |
hi = (uint64_t) (r >> 64);
|
| |
lo = (uint64_t) r;
|
| - |
#endif
|
| |
#else
|
| - |
/* Implementation of 64x64->128-bit multiplication by four 32x32->64
|
| - |
bit multiplication. */
|
| + |
/* Implementation of 64x64->128-bit multiplication by four 32x32->64 bit multiplication. */
|
| |
uint64_t hv = v >> 32, hp = p >> 32;
|
| |
uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
|
| - |
uint64_t rh = hv * hp;
|
| + |
uint64_t rh = hv * hp;
|
| |
uint64_t rm_0 = hv * lp;
|
| |
uint64_t rm_1 = hp * lv;
|
| - |
uint64_t rl = lv * lp;
|
| + |
uint64_t rl = lv * lp;
|
| |
uint64_t t, carry = 0;
|
| - |
|
| - |
/* We could ignore a carry bit here if we did not care about the
|
| - |
same hash for 32-bit and 64-bit targets. */
|
| + |
|
| + |
/* We could ignore a carry bit here if we did not care about the same hash for 32-bit and 64-bit
|
| + |
targets. */
|
| |
t = rl + (rm_0 << 32);
|
| |
#ifdef MUM_TARGET_INDEPENDENT_HASH
|
| |
carry = t < rl;
|
| |
#endif
|
| |
}
|
| |
|
| - |
/* Macro defining how many times the most nested loop in
|
| - |
_mum_hash_aligned will be unrolled by the compiler (although it can
|
| - |
make an own decision:). Use only a constant here to help a
|
| - |
compiler to unroll a major loop.
|
| + |
/* Macro defining how many times the most nested loop in _mum_hash_aligned will be unrolled by the
|
| + |
compiler (although it can make its own decision :). Use only a constant here to help the compiler to
|
| + |
unroll a major loop.
|
| |
|
| - |
The macro value affects the result hash for strings > 128 bit. The
|
| - |
unroll factor greatly affects the hashing speed. We prefer the
|
| - |
speed. */
|
| + |
The macro value affects the result hash for strings > 128 bit. The unroll factor greatly affects
|
| + |
the hashing speed. We prefer the speed. */
|
| |
#ifndef _MUM_UNROLL_FACTOR_POWER
|
| |
#if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
|
| |
#define _MUM_UNROLL_FACTOR_POWER 3
|
| |
#elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
|
| |
#define _MUM_UNROLL_FACTOR_POWER 4
|
| - |
#elif defined (MUM_V1) || defined (MUM_V2)
|
| + |
#elif defined(MUM_V1) || defined(MUM_V2)
|
| |
#define _MUM_UNROLL_FACTOR_POWER 2
|
| |
#else
|
| |
#define _MUM_UNROLL_FACTOR_POWER 3
|
| |
|
| |
#define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
|
| |
|
| - |
/* Rotate V left by SH. */
|
| - |
static inline uint64_t _mum_rotl (uint64_t v, int sh) {
|
| - |
return v << sh | v >> (64 - sh);
|
| - |
}
|
| + |
/* Rotate V left by SH. */
|
| + |
static _MUM_INLINE uint64_t _mum_rotl (uint64_t v, int sh) {
  /* Reduce the count modulo 64 so that neither shift can reach the operand width.  The plain
     `v >> (64 - sh)` form is undefined for SH == 0 and for any SH outside 0..63.  For SH in
     1..63 the value returned is exactly what the plain form produced; SH == 0 returns V.  */
  unsigned int n = (unsigned int) sh & 63u;

  return v << n | v >> ((64u - n) & 63u);
}
|
| |
|
| - |
static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
|
| - |
_mum_hash_aligned (uint64_t start, const void *key, size_t len) {
|
| + |
#if defined(MUM_V1) || defined(MUM_V2) || !defined(MUM_QUALITY)
|
| + |
#define _MUM_TAIL_START(v) 0
|
| + |
#else
|
| + |
#define _MUM_TAIL_START(v) v
|
| + |
#endif
|
| + |
static _MUM_INLINE uint64_t
|
| + |
#if defined(__GNUC__) && !defined(__clang__)
|
| + |
__attribute__ ((__optimize__ ("unroll-loops")))
|
| + |
#endif
|
| + |
_mum_hash_aligned (uint64_t start, const void *key, size_t len) {
|
| |
uint64_t result = start;
|
| |
const unsigned char *str = (const unsigned char *) key;
|
| |
uint64_t u64;
|
| |
size_t i;
|
| |
size_t n;
|
| - |
|
| + |
|
| |
#ifndef MUM_V2
|
| |
result = _mum (result, _mum_block_start_prime);
|
| |
#endif
|
| - |
while (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
|
| - |
/* This loop could be vectorized when we have vector insns for
|
| - |
64x64->128-bit multiplication. AVX2 currently only have vector
|
| - |
insns for 4 32x32->64-bit multiplication and for 1
|
| - |
64x64->128-bit multiplication (pclmulqdq). */
|
| - |
#if defined (MUM_V1) || defined (MUM_V2)
|
| + |
while (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
|
| + |
/* This loop could be vectorized when we have vector insns for 64x64->128-bit multiplication.
|
| + |
AVX2 currently only have vector insns for 4 32x32->64-bit multiplication and for 1
|
| + |
64x64->128-bit multiplication (pclmulqdq). */
|
| + |
#if defined(MUM_V1) || defined(MUM_V2)
|
| |
for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
|
| |
result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
|
| |
#else
|
| |
for (i = 0; i < _MUM_UNROLL_FACTOR; i += 2)
|
| |
result ^= _mum (_mum_le (((uint64_t *) str)[i]) ^ _mum_primes[i],
|
| - |
_mum_le (((uint64_t *) str)[i + 1]) ^ _mum_primes[i + 1]);
|
| + |
_mum_le (((uint64_t *) str)[i + 1]) ^ _mum_primes[i + 1]);
|
| |
#endif
|
| |
len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
|
| |
str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
|
| - |
/* We will use the same prime numbers on the next iterations --
|
| - |
randomize the state. */
|
| + |
/* We will use the same prime numbers on the next iterations -- randomize the state. */
|
| |
result = _mum (result, _mum_unroll_prime);
|
| |
}
|
| |
n = len / sizeof (uint64_t);
|
| + |
#if defined(MUM_V1) || defined(MUM_V2) || !defined(MUM_QUALITY)
|
| + |
for (i = 0; i < n; i++) result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
|
| + |
#else
|
| |
for (i = 0; i < n; i++)
|
| - |
result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
|
| - |
len -= n * sizeof (uint64_t); str += n * sizeof (uint64_t);
|
| + |
result ^= _mum (_mum_le (((uint64_t *) str)[i]) + _mum_primes[i], _mum_primes[i]);
|
| + |
#endif
|
| + |
len -= n * sizeof (uint64_t);
|
| + |
str += n * sizeof (uint64_t);
|
| |
switch (len) {
|
| |
case 7:
|
| - |
u64 = _mum_le32 (*(uint32_t *) str);
|
| - |
u64 |= _mum_le16 (*(uint16_t *) (str + 4)) << 32;
|
| - |
u64 |= (uint64_t) str[6] << 48;
|
| + |
u64 = _MUM_TAIL_START (_mum_primes[0]) + _mum_le32 (*(uint32_t *) str);
|
| + |
u64 += _mum_le16 (*(uint16_t *) (str + 4)) << 32;
|
| + |
u64 += (uint64_t) str[6] << 48;
|
| |
return result ^ _mum (u64, _mum_tail_prime);
|
| |
case 6:
|
| - |
u64 = _mum_le32 (*(uint32_t *) str);
|
| - |
u64 |= _mum_le16 (*(uint16_t *) (str + 4)) << 32;
|
| + |
u64 = _MUM_TAIL_START (_mum_primes[1]) + _mum_le32 (*(uint32_t *) str);
|
| + |
u64 += _mum_le16 (*(uint16_t *) (str + 4)) << 32;
|
| |
return result ^ _mum (u64, _mum_tail_prime);
|
| |
case 5:
|
| - |
u64 = _mum_le32 (*(uint32_t *) str);
|
| - |
u64 |= (uint64_t) str[4] << 32;
|
| + |
u64 = _MUM_TAIL_START (_mum_primes[2]) + _mum_le32 (*(uint32_t *) str);
|
| + |
u64 += (uint64_t) str[4] << 32;
|
| |
return result ^ _mum (u64, _mum_tail_prime);
|
| |
case 4:
|
| - |
u64 = _mum_le32 (*(uint32_t *) str);
|
| + |
u64 = _MUM_TAIL_START (_mum_primes[3]) + _mum_le32 (*(uint32_t *) str);
|
| |
return result ^ _mum (u64, _mum_tail_prime);
|
| |
case 3:
|
| - |
u64 = _mum_le16 (*(uint16_t *) str);
|
| - |
u64 |= (uint64_t) str[2] << 16;
|
| + |
u64 = _MUM_TAIL_START (_mum_primes[4]) + _mum_le16 (*(uint16_t *) str);
|
| + |
u64 += (uint64_t) str[2] << 16;
|
| |
return result ^ _mum (u64, _mum_tail_prime);
|
| |
case 2:
|
| - |
u64 = _mum_le16 (*(uint16_t *) str);
|
| + |
u64 = _MUM_TAIL_START (_mum_primes[5]) + _mum_le16 (*(uint16_t *) str);
|
| |
return result ^ _mum (u64, _mum_tail_prime);
|
| |
case 1:
|
| - |
u64 = str[0];
|
| + |
u64 = _MUM_TAIL_START (_mum_primes[6]) + str[0];
|
| |
return result ^ _mum (u64, _mum_tail_prime);
|
| |
}
|
| |
return result;
|
| |
}
|
| |
|
| - |
/* Final randomization of H. */
|
| - |
static inline uint64_t
|
| - |
_mum_final (uint64_t h) {
|
| - |
#if defined (MUM_V1)
|
| + |
/* Final randomization of H. */
|
| + |
static _MUM_INLINE uint64_t _mum_final (uint64_t h) {
|
| + |
#if defined(MUM_V1)
|
| |
h ^= _mum (h, _mum_finish_prime1);
|
| |
h ^= _mum (h, _mum_finish_prime2);
|
| - |
#elif defined (MUM_V2)
|
| + |
#elif defined(MUM_V2)
|
| |
h ^= _mum_rotl (h, 33);
|
| |
h ^= _mum (h, _mum_finish_prime1);
|
| |
#else
|
| |
#error "too small block length"
|
| |
#endif
|
| |
|
| - |
static inline uint64_t
|
| - |
#if defined(__x86_64__)
|
| - |
_MUM_TARGET("inline-all-stringops")
|
| + |
static _MUM_INLINE uint64_t
|
| + |
#if defined(__x86_64__) && defined(__GNUC__) && !defined(__clang__)
|
| + |
__attribute__ ((__target__ ("inline-all-stringops")))
|
| |
#endif
|
| - |
_mum_hash_default (const void *key, size_t len, uint64_t seed) {
|
| + |
_mum_hash_default (const void *key, size_t len, uint64_t seed) {
|
| |
uint64_t result;
|
| |
const unsigned char *str = (const unsigned char *) key;
|
| |
size_t block_len;
|
| |
uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];
|
| - |
|
| + |
|
| |
result = seed + len;
|
| |
if (((size_t) str & 0x7) == 0)
|
| |
result = _mum_hash_aligned (result, key, len);
|
| |
else {
|
| |
while (len != 0) {
|
| |
block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN;
|
| - |
memmove (buf, str, block_len);
|
| + |
memcpy (buf, str, block_len);
|
| |
result = _mum_hash_aligned (result, buf, block_len);
|
| |
len -= block_len;
|
| |
str += block_len;
|
| |
_mum_primes[i] = _mum_next_factor ();
|
| |
}
|
| |
|
| - |
/* Start hashing data with SEED. Return the state. */
|
| - |
static inline uint64_t
|
| - |
mum_hash_init (uint64_t seed) {
|
| - |
return seed;
|
| - |
}
|
| + |
/* Start hashing data with SEED. Return the state. */
|
| + |
static _MUM_INLINE uint64_t mum_hash_init (uint64_t seed) {
  /* The initial state is the seed itself, unmodified.  */
  const uint64_t state = seed;

  return state;
}
|
| |
|
| - |
/* Process data KEY with the state H and return the updated state. */
|
| - |
static inline uint64_t
|
| - |
mum_hash_step (uint64_t h, uint64_t key) {
|
| + |
/* Process data KEY with the state H and return the updated state. */
|
| + |
static _MUM_INLINE uint64_t mum_hash_step (uint64_t h, uint64_t key) {
  /* The state and the new key are each run through _mum with their own prime; the two results
     are then combined with XOR to form the next state.  */
  const uint64_t mixed_state = _mum (h, _mum_hash_step_prime);
  const uint64_t mixed_key = _mum (key, _mum_key_step_prime);

  return mixed_state ^ mixed_key;
}
|
| |
|
| - |
/* Return the result of hashing using the current state H. */
|
| - |
static inline uint64_t
|
| - |
mum_hash_finish (uint64_t h) {
|
| - |
return _mum_final (h);
|
| - |
}
|
| + |
/* Return the result of hashing using the current state H. */
|
| + |
static _MUM_INLINE uint64_t mum_hash_finish (uint64_t h) {
  /* Finishing is exactly the final randomization step applied to the state.  */
  const uint64_t digest = _mum_final (h);

  return digest;
}
|
| |
|
| - |
/* Fast hashing of KEY with SEED. The hash is always the same for the
|
| - |
same key on any target. */
|
| - |
static inline size_t
|
| - |
mum_hash64 (uint64_t key, uint64_t seed) {
|
| + |
/* Fast hashing of KEY with SEED. The hash is always the same for the same key on any target. */
|
| + |
static _MUM_INLINE size_t mum_hash64 (uint64_t key, uint64_t seed) {
  /* One-shot use of the init / step / finish sequence for a single 64-bit key.  */
  uint64_t state = mum_hash_init (seed);

  state = mum_hash_step (state, key);
  /* The 64-bit value from mum_hash_finish is converted to the size_t return type here.  */
  return mum_hash_finish (state);
}
|
| |
|
| - |
/* Hash data KEY of length LEN and SEED. The hash depends on the
|
| - |
target endianess and the unroll factor. */
|
| - |
static inline uint64_t
|
| - |
mum_hash (const void *key, size_t len, uint64_t seed) {
|
| + |
/* Hash data KEY of length LEN and SEED. The hash depends on the target endianness and the unroll
|
| + |
factor. */
|
| + |
static _MUM_INLINE uint64_t mum_hash (const void *key, size_t len, uint64_t seed) {
|
| |
#if _MUM_UNALIGNED_ACCESS
|
| |
return _mum_final (_mum_hash_aligned (seed + len, key, len));
|
| |
#else
|