X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=include%2Fllvm%2FADT%2FHashing.h;h=6ab07254a21d5849fdbccfa59ce8d6232ab3c236;hb=40dab1059e72d3af59f2523fa8a7d05f40dafca5;hp=ccf352cb52b520ca2d90d612b637620c3b10e4b8;hpb=0b66c6fca22e85f732cf58f459a06c06833d1882;p=oota-llvm.git diff --git a/include/llvm/ADT/Hashing.h b/include/llvm/ADT/Hashing.h index ccf352cb52b..6ab07254a21 100644 --- a/include/llvm/ADT/Hashing.h +++ b/include/llvm/ADT/Hashing.h @@ -47,6 +47,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Support/DataTypes.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/type_traits.h" #include #include @@ -74,43 +76,20 @@ namespace llvm { /// using llvm::hash_value; /// llvm::hash_code code = hash_value(x); /// \endcode -/// -/// Also note that there are two numerical values which are reserved, and the -/// implementation ensures will never be produced for real hash_codes. These -/// can be used as sentinels within hashing data structures. class hash_code { size_t value; public: - /// \brief Default construct a hash_code. Constructs a null code. - hash_code() : value() {} + /// \brief Default construct a hash_code. + /// Note that this leaves the value uninitialized. + hash_code() {} /// \brief Form a hash code directly from a numerical value. - hash_code(size_t value) : value(value) { - // Ensure we don't form a hash_code with one of the prohibited values. - assert(value != get_null_code().value); - assert(value != get_invalid_code().value); - } + hash_code(size_t value) : value(value) {} /// \brief Convert the hash code to its numerical value for use. /*explicit*/ operator size_t() const { return value; } - /// \brief Get a hash_code object which corresponds to a null code. - /// - /// The null code must never be the result of any 'hash_value' calls and can - /// be used to detect an unset hash_code. - static hash_code get_null_code() { return hash_code(); } - - /// \brief Get a hash_code object which corresponds to an invalid code. - /// - /// The invalid code must never be the result of any 'hash_value' calls. This - /// can be used to flag invalid hash_codes or mark entries in a hash table. - static hash_code get_invalid_code() { - hash_code invalid_code; - invalid_code.value = static_cast(-1); - return invalid_code; - } - friend bool operator==(const hash_code &lhs, const hash_code &rhs) { return lhs.value == rhs.value; } @@ -122,6 +101,46 @@ public: friend size_t hash_value(const hash_code &code) { return code.value; } }; +/// \brief Compute a hash_code for any integer value. +/// +/// Note that this function is intended to compute the same hash_code for +/// a particular value without regard to the pre-promotion type. This is in +/// contrast to hash_combine which may produce different hash_codes for +/// differing argument types even if they would implicit promote to a common +/// type without changing the value. +template +typename enable_if, hash_code>::type hash_value(T value); + +/// \brief Compute a hash_code for a pointer's address. +/// +/// N.B.: This hashes the *address*. Not the value and not the type. +template hash_code hash_value(const T *ptr); + +/// \brief Compute a hash_code for a pair of objects. +template +hash_code hash_value(const std::pair &arg); + +/// \brief Compute a hash_code for a standard string. +template +hash_code hash_value(const std::basic_string &arg); + + +/// \brief Override the execution seed with a fixed value. +/// +/// This hashing library uses a per-execution seed designed to change on each +/// run with high probability in order to ensure that the hash codes are not +/// attackable and to ensure that output which is intended to be stable does +/// not rely on the particulars of the hash codes produced. +/// +/// That said, there are use cases where it is important to be able to +/// reproduce *exactly* a specific behavior. To that end, we provide a function +/// which will forcibly set the seed to a fixed value. This must be done at the +/// start of the program, before any hashes are computed. Also, it cannot be +/// undone. This makes it thread-hostile and very hard to use outside of +/// immediately on start of a simple program designed for reproducible +/// behavior. +void set_fixed_execution_hash_seed(size_t fixed_value); + // All of the implementation details of actually computing the various hash // code values are held within this namespace. These routines are included in @@ -132,12 +151,16 @@ namespace detail { inline uint64_t fetch64(const char *p) { uint64_t result; memcpy(&result, p, sizeof(result)); + if (sys::isBigEndianHost()) + return sys::SwapByteOrder(result); return result; } inline uint32_t fetch32(const char *p) { uint32_t result; memcpy(&result, p, sizeof(result)); + if (sys::isBigEndianHost()) + return sys::SwapByteOrder(result); return result; } @@ -150,7 +173,7 @@ static const uint64_t k3 = 0xc949d7c7509e6557ULL; /// \brief Bitwise right rotate. /// Normally this will compile to a single instruction, especially if the /// shift is a manifest constant. -inline uint64_t rotate(uint64_t val, unsigned shift) { +inline uint64_t rotate(uint64_t val, size_t shift) { // Avoid shifting by 64: doing so yields an undefined result. return shift == 0 ? val : ((val >> shift) | (val << (64 - shift))); } @@ -185,9 +208,9 @@ inline uint64_t hash_4to8_bytes(const char *s, size_t len, uint64_t seed) { } inline uint64_t hash_9to16_bytes(const char *s, size_t len, uint64_t seed) { - uint64_t a = fetch64(s); - uint64_t b = fetch64(s + len - 8); - return hash_16_bytes(seed ^ a, rotate(b + len, len)) ^ b; + uint64_t a = fetch64(s); + uint64_t b = fetch64(s + len - 8); + return hash_16_bytes(seed ^ a, rotate(b + len, len)) ^ b; } inline uint64_t hash_17to32_bytes(const char *s, size_t len, uint64_t seed) { @@ -219,31 +242,22 @@ inline uint64_t hash_33to64_bytes(const char *s, size_t len, uint64_t seed) { uint64_t wf = a + z; uint64_t ws = b + rotate(a, 31) + c; uint64_t r = shift_mix((vf + ws) * k2 + (wf + vs) * k0); - return shift_mix(seed ^ (r * k0) + vs) * k2; + return shift_mix((seed ^ (r * k0)) + vs) * k2; } inline uint64_t hash_short(const char *s, size_t length, uint64_t seed) { - uint64_t hash; if (length >= 4 && length <= 8) - hash = hash_4to8_bytes(s, length, seed); - else if (length > 8 && length <= 16) - hash = hash_9to16_bytes(s, length, seed); - else if (length > 16 && length <= 32) - hash = hash_17to32_bytes(s, length, seed); - else if (length > 32) - hash = hash_33to64_bytes(s, length, seed); - else if (length != 0) - hash = hash_1to3_bytes(s, length, seed); - else - return k2 ^ seed; - - // FIXME: The invalid hash_code check is really expensive; there should be - // a better way of ensuring these invariants hold. - if (hash == static_cast(hash_code::get_null_code())) - hash = k1 ^ seed; - else if (hash == static_cast(hash_code::get_invalid_code())) - hash = k3 ^ seed; - return hash; + return hash_4to8_bytes(s, length, seed); + if (length > 8 && length <= 16) + return hash_9to16_bytes(s, length, seed); + if (length > 16 && length <= 32) + return hash_17to32_bytes(s, length, seed); + if (length > 32) + return hash_33to64_bytes(s, length, seed); + if (length != 0) + return hash_1to3_bytes(s, length, seed); + + return k2 ^ seed; } /// \brief The intermediate state used during hashing. @@ -259,9 +273,8 @@ struct hash_state { static hash_state create(const char *s, uint64_t seed) { hash_state state = { 0, seed, hash_16_bytes(seed, k1), rotate(seed ^ k1, 49), - seed * k1, shift_mix(seed), hash_16_bytes(state.h4, state.h5), - seed - }; + seed * k1, shift_mix(seed), 0, seed }; + state.h6 = hash_16_bytes(state.h4, state.h5); state.mix(s); return state; } @@ -299,14 +312,8 @@ struct hash_state { /// \brief Compute the final 64-bit hash code value based on the current /// state and the length of bytes hashed. uint64_t finalize(size_t length) { - uint64_t final_value - = hash_16_bytes(hash_16_bytes(h3, h5) + shift_mix(h1) * k1 + h2, - hash_16_bytes(h4, h6) + shift_mix(length) * k1 + h0); - if (final_value == static_cast(hash_code::get_null_code())) - final_value = k1 ^ seed; - if (final_value == static_cast(hash_code::get_invalid_code())) - final_value = k3 ^ seed; - return final_value; + return hash_16_bytes(hash_16_bytes(h3, h5) + shift_mix(h1) * k1 + h2, + hash_16_bytes(h4, h6) + shift_mix(length) * k1 + h0); } }; @@ -325,87 +332,40 @@ inline size_t get_execution_seed() { // // However, if there is a fixed seed override set the first time this is // called, return that instead of the per-execution seed. + const uint64_t seed_prime = 0xff51afd7ed558ccdULL; static size_t seed = fixed_seed_override ? fixed_seed_override - : 0xff51afd7ed558ccdULL; + : (size_t)seed_prime; return seed; } -/// \brief Helper to hash the value of a single integer. -/// -/// Overloads for smaller integer types are not provided to ensure consistent -/// behavior in the presence of integral promotions. Essentially, -/// "hash_value('4')" and "hash_value('0' + 4)" should be the same. -inline hash_code hash_integer_value(uint64_t value) { - // Similar to hash_4to8_bytes but using a seed instead of length. - const uint64_t seed = get_execution_seed(); - const char *s = reinterpret_cast(&value); - const uint64_t a = fetch32(s); - return hash_16_bytes(seed + (a << 3), fetch32(s + 4)); -} - -} // namespace detail -} // namespace hashing - - -/// \brief Override the execution seed with a fixed value. -/// -/// This hashing library uses a per-execution seed designed to change on each -/// run with high probability in order to ensure that the hash codes are not -/// attackable and to ensure that output which is intended to be stable does -/// not rely on the particulars of the hash codes produced. -/// -/// That said, there are use cases where it is important to be able to -/// reproduce *exactly* a specific behavior. To that end, we provide a function -/// which will forcibly set the seed to a fixed value. This must be done at the -/// start of the program, before any hashes are computed. Also, it cannot be -/// undone. This makes it thread-hostile and very hard to use outside of -/// immediately on start of a simple program designed for reproducible -/// behavior. -void set_fixed_execution_hash_seed(size_t fixed_value); - - -/// \brief Compute a hash_code for any integer value. -/// -/// Note that this function is intended to compute the same hash_code for -/// a particular value without regard to the pre-promotion type. This is in -/// contrast to hash_combine which may produce different hash_codes for -/// differing argument types even if they would implicit promote to a common -/// type without changing the value. -template -typename enable_if, hash_code>::type hash_value(T value) { - return ::llvm::hashing::detail::hash_integer_value(value); -} - -/// \brief Compute a hash_code for a pointer's address. -/// -/// N.B.: This hashes the *address*. Not the value and not the type. -template hash_code hash_value(const T *ptr) { - return ::llvm::hashing::detail::hash_integer_value( - reinterpret_cast(ptr)); -} - - -// Implementation details for implementing hash combining functions. -namespace hashing { -namespace detail { - /// \brief Trait to indicate whether a type's bits can be hashed directly. /// /// A type trait which is true if we want to combine values for hashing by /// reading the underlying data. It is false if values of this type must /// first be passed to hash_value, and the resulting hash_codes combined. // -// FIXME: We want to replace is_integral and is_pointer here with a predicate -// which asserts that comparing the underlying storage of two values of the -// type for equality is equivalent to comparing the two values for equality. -// For all the platforms we care about, this holds for integers and pointers, -// but there are platforms where it doesn't and we would like to support -// user-defined types which happen to satisfy this property. +// FIXME: We want to replace is_integral_or_enum and is_pointer here with +// a predicate which asserts that comparing the underlying storage of two +// values of the type for equality is equivalent to comparing the two values +// for equality. For all the platforms we care about, this holds for integers +// and pointers, but there are platforms where it doesn't and we would like to +// support user-defined types which happen to satisfy this property. template struct is_hashable_data - : integral_constant::value || is_pointer::value) && + : integral_constant::value || + is_pointer::value) && 64 % sizeof(T) == 0)> {}; +// Special case std::pair to detect when both types are viable and when there +// is no alignment-derived padding in the pair. This is a bit of a lie because +// std::pair isn't truly POD, but it's close enough in all reasonable +// implementations for our use case of hashing the underlying data. +template struct is_hashable_data > + : integral_constant::value && + is_hashable_data::value && + (sizeof(T) + sizeof(U)) == + sizeof(std::pair))> {}; + /// \brief Helper to get the hashable data representation for a type. /// This variant is enabled when the type itself can be used. template @@ -456,8 +416,6 @@ hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) { while (first != last && store_and_advance(buffer_ptr, buffer_end, get_hashable_data(*first))) ++first; -/// \brief Metafunction that determines whether the given type is an integral -/// type. if (first == last) return hash_short(buffer, buffer_ptr - buffer, seed); assert(buffer_ptr == buffer_end); @@ -495,7 +453,7 @@ hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) { /// and directly reads from the underlying memory. template typename enable_if, hash_code>::type -hash_combine_range_impl(const ValueT *first, const ValueT *last) { +hash_combine_range_impl(ValueT *first, ValueT *last) { const size_t seed = get_execution_seed(); const char *s_begin = reinterpret_cast(first); const char *s_end = reinterpret_cast(last); @@ -543,13 +501,10 @@ namespace detail { /// recursive combining of arguments used in hash_combine. It is particularly /// useful at minimizing the code in the recursive calls to ease the pain /// caused by a lack of variadic functions. -class hash_combine_recursive_helper { - const size_t seed; +struct hash_combine_recursive_helper { char buffer[64]; - char *const buffer_end; - char *buffer_ptr; - size_t length; hash_state state; + const size_t seed; public: /// \brief Construct a recursive hash combining helper. @@ -557,10 +512,7 @@ public: /// This sets up the state for a recursive hash combine, including getting /// the seed and buffer setup. hash_combine_recursive_helper() - : seed(get_execution_seed()), - buffer_end(buffer + array_lengthof(buffer)), - buffer_ptr(buffer), - length(0) {} + : seed(get_execution_seed()) {} /// \brief Combine one chunk of data into the current in-flight hash. /// @@ -568,7 +520,8 @@ public: /// the data. If the buffer is full, it hashes the buffer into its /// hash_state, empties it, and then merges the new chunk in. This also /// handles cases where the data straddles the end of the buffer. - template void combine_data(T data) { + template + char *combine_data(size_t &length, char *buffer_ptr, char *buffer_end, T data) { if (!store_and_advance(buffer_ptr, buffer_end, data)) { // Check for skew which prevents the buffer from being packed, and do // a partial store into the buffer to fill it. This is only a concern @@ -599,6 +552,7 @@ public: partial_store_size)) abort(); } + return buffer_ptr; } #if defined(__has_feature) && __has_feature(__cxx_variadic_templates__) @@ -608,11 +562,12 @@ public: /// This function recurses through each argument, combining that argument /// into a single hash. template - hash_code combine(const T &arg, const Ts &...args) { - combine_data( get_hashable_data(arg)); + hash_code combine(size_t length, char *buffer_ptr, char *buffer_end, + const T &arg, const Ts &...args) { + buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg)); // Recurse to the next argument. - return combine(args...); + return combine(length, buffer_ptr, buffer_end, args...); } #else @@ -621,37 +576,43 @@ public: template - hash_code combine(const T1 &arg1, const T2 &arg2, const T3 &arg3, + hash_code combine(size_t length, char *buffer_ptr, char *buffer_end, + const T1 &arg1, const T2 &arg2, const T3 &arg3, const T4 &arg4, const T5 &arg5, const T6 &arg6) { - combine_data(get_hashable_data(arg1)); - return combine(arg2, arg3, arg4, arg5, arg6); + buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1)); + return combine(length, buffer_ptr, buffer_end, arg2, arg3, arg4, arg5, arg6); } template - hash_code combine(const T1 &arg1, const T2 &arg2, const T3 &arg3, + hash_code combine(size_t length, char *buffer_ptr, char *buffer_end, + const T1 &arg1, const T2 &arg2, const T3 &arg3, const T4 &arg4, const T5 &arg5) { - combine_data(get_hashable_data(arg1)); - return combine(arg2, arg3, arg4, arg5); + buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1)); + return combine(length, buffer_ptr, buffer_end, arg2, arg3, arg4, arg5); } template - hash_code combine(const T1 &arg1, const T2 &arg2, const T3 &arg3, + hash_code combine(size_t length, char *buffer_ptr, char *buffer_end, + const T1 &arg1, const T2 &arg2, const T3 &arg3, const T4 &arg4) { - combine_data(get_hashable_data(arg1)); - return combine(arg2, arg3, arg4); + buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1)); + return combine(length, buffer_ptr, buffer_end, arg2, arg3, arg4); } template - hash_code combine(const T1 &arg1, const T2 &arg2, const T3 &arg3) { - combine_data(get_hashable_data(arg1)); - return combine(arg2, arg3); + hash_code combine(size_t length, char *buffer_ptr, char *buffer_end, + const T1 &arg1, const T2 &arg2, const T3 &arg3) { + buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1)); + return combine(length, buffer_ptr, buffer_end, arg2, arg3); } template - hash_code combine(const T1 &arg1, const T2 &arg2) { - combine_data(get_hashable_data(arg1)); - return combine(arg2); + hash_code combine(size_t length, char *buffer_ptr, char *buffer_end, + const T1 &arg1, const T2 &arg2) { + buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1)); + return combine(length, buffer_ptr, buffer_end, arg2); } template - hash_code combine(const T1 &arg1) { - combine_data(get_hashable_data(arg1)); - return combine(); + hash_code combine(size_t length, char *buffer_ptr, char *buffer_end, + const T1 &arg1) { + buffer_ptr = combine_data(length, buffer_ptr, buffer_end, get_hashable_data(arg1)); + return combine(length, buffer_ptr, buffer_end); } #endif @@ -661,7 +622,7 @@ public: /// The base case when combining arguments recursively is reached when all /// arguments have been handled. It flushes the remaining buffer and /// constructs a hash_code. - hash_code combine() { + hash_code combine(size_t length, char *buffer_ptr, char *buffer_end) { // Check whether the entire set of values fit in the buffer. If so, we'll // use the optimized short hashing routine and skip state entirely. if (length == 0) @@ -701,7 +662,7 @@ public: template hash_code hash_combine(const Ts &...args) { // Recursively hash each argument using a helper class. ::llvm::hashing::detail::hash_combine_recursive_helper helper; - return helper.combine(args...); + return helper.combine(0, helper.buffer, helper.buffer + 64, args...); } #else @@ -712,40 +673,94 @@ template hash_code hash_combine(const Ts &...args) { template hash_code hash_combine(const T1 &arg1, const T2 &arg2, const T3 &arg3, - const T4 &arg4, const T5 &arg5, const T6 &arg6) { + const T4 &arg4, const T5 &arg5, const T6 &arg6) { ::llvm::hashing::detail::hash_combine_recursive_helper helper; - return helper.combine(arg1, arg2, arg3, arg4, arg5, arg6); + return helper.combine(0, helper.buffer, helper.buffer + 64, + arg1, arg2, arg3, arg4, arg5, arg6); } template hash_code hash_combine(const T1 &arg1, const T2 &arg2, const T3 &arg3, - const T4 &arg4, const T5 &arg5) { + const T4 &arg4, const T5 &arg5) { ::llvm::hashing::detail::hash_combine_recursive_helper helper; - return helper.combine(arg1, arg2, arg3, arg4, arg5); + return helper.combine(0, helper.buffer, helper.buffer + 64, + arg1, arg2, arg3, arg4, arg5); } template hash_code hash_combine(const T1 &arg1, const T2 &arg2, const T3 &arg3, - const T4 &arg4) { + const T4 &arg4) { ::llvm::hashing::detail::hash_combine_recursive_helper helper; - return helper.combine(arg1, arg2, arg3, arg4); + return helper.combine(0, helper.buffer, helper.buffer + 64, + arg1, arg2, arg3, arg4); } template hash_code hash_combine(const T1 &arg1, const T2 &arg2, const T3 &arg3) { ::llvm::hashing::detail::hash_combine_recursive_helper helper; - return helper.combine(arg1, arg2, arg3); + return helper.combine(0, helper.buffer, helper.buffer + 64, arg1, arg2, arg3); } template hash_code hash_combine(const T1 &arg1, const T2 &arg2) { ::llvm::hashing::detail::hash_combine_recursive_helper helper; - return helper.combine(arg1, arg2); + return helper.combine(0, helper.buffer, helper.buffer + 64, arg1, arg2); } template hash_code hash_combine(const T1 &arg1) { ::llvm::hashing::detail::hash_combine_recursive_helper helper; - return helper.combine(arg1); + return helper.combine(0, helper.buffer, helper.buffer + 64, arg1); } #endif + +// Implementation details for implementatinos of hash_value overloads provided +// here. +namespace hashing { +namespace detail { + +/// \brief Helper to hash the value of a single integer. +/// +/// Overloads for smaller integer types are not provided to ensure consistent +/// behavior in the presence of integral promotions. Essentially, +/// "hash_value('4')" and "hash_value('0' + 4)" should be the same. +inline hash_code hash_integer_value(uint64_t value) { + // Similar to hash_4to8_bytes but using a seed instead of length. + const uint64_t seed = get_execution_seed(); + const char *s = reinterpret_cast(&value); + const uint64_t a = fetch32(s); + return hash_16_bytes(seed + (a << 3), fetch32(s + 4)); +} + +} // namespace detail +} // namespace hashing + +// Declared and documented above, but defined here so that any of the hashing +// infrastructure is available. +template +typename enable_if, hash_code>::type +hash_value(T value) { + return ::llvm::hashing::detail::hash_integer_value(value); +} + +// Declared and documented above, but defined here so that any of the hashing +// infrastructure is available. +template hash_code hash_value(const T *ptr) { + return ::llvm::hashing::detail::hash_integer_value( + reinterpret_cast(ptr)); +} + +// Declared and documented above, but defined here so that any of the hashing +// infrastructure is available. +template +hash_code hash_value(const std::pair &arg) { + return hash_combine(arg.first, arg.second); +} + +// Declared and documented above, but defined here so that any of the hashing +// infrastructure is available. +template +hash_code hash_value(const std::basic_string &arg) { + return hash_combine_range(arg.begin(), arg.end()); +} + } // namespace llvm #endif