From 0b988b15791ff86168241e31d27ae091edf35728 Mon Sep 17 00:00:00 2001
From: Giuseppe Ottaviano
Date: Wed, 3 May 2017 14:58:39 -0700
Subject: [PATCH] Do not store the lower bits mask in EliasFanoReader

Summary: Computing the mask on access has negligible cost as it can be
hoisted out of the linear search loop, and furthermore on Haswell we can
use the `BZHI` instruction.

I also experimented with `BEXTR` but it ended up being slower because
computing the pattern operand requires a shift and an or (it's probably
meant for when the pattern is precomputed).

Reviewed By: philippv

Differential Revision: D4976657

fbshipit-source-id: e4c4ca5f0a785595587e6d6ad4676f5b216291cf
---
 folly/experimental/EliasFanoCoding.h |  8 +++++---
 folly/experimental/Instructions.h    | 24 ++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/folly/experimental/EliasFanoCoding.h b/folly/experimental/EliasFanoCoding.h
index f75fabde..98ac516e 100644
--- a/folly/experimental/EliasFanoCoding.h
+++ b/folly/experimental/EliasFanoCoding.h
@@ -28,6 +28,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -526,7 +527,6 @@ class EliasFanoReader {
       : size_(list.size),
         lower_(list.lower),
         upper_(list),
-        lowerMask_((ValueType(1) << list.numLowerBits) - 1),
         numLowerBits_(list.numLowerBits) {
     DCHECK(Instructions::supported());
     // To avoid extra branching during skipTo() while reading
@@ -654,7 +654,10 @@
     const size_t pos = i * numLowerBits_;
     const unsigned char* ptr = lower_ + (pos / 8);
     const uint64_t ptrv = folly::loadUnaligned(ptr);
-    return lowerMask_ & (ptrv >> (pos % 8));
+    // This removes the branch in the fallback implementation of
+    // bzhi. The condition is verified at encoding time.
+    assume(numLowerBits_ < sizeof(ValueType) * 8);
+    return Instructions::bzhi(ptrv >> (pos % 8), numLowerBits_);
   }
 
   void iterateTo(ValueType value) {
@@ -671,7 +674,6 @@
   size_t size_;
   const uint8_t* lower_;
   detail::UpperBitsReader upper_;
-  const ValueType lowerMask_;
   ValueType value_ = kInvalidValue;
   ValueType lastValue_;
   uint8_t numLowerBits_;
diff --git a/folly/experimental/Instructions.h b/folly/experimental/Instructions.h
index a2754b89..972b62d7 100644
--- a/folly/experimental/Instructions.h
+++ b/folly/experimental/Instructions.h
@@ -72,6 +72,14 @@ struct Default {
     return (value >> start) &
         ((length == 64) ? (~0ULL) : ((1ULL << length) - 1ULL));
   }
+
+  // Clear high bits starting at position index.
+  static FOLLY_ALWAYS_INLINE uint64_t bzhi(uint64_t value, uint32_t index) {
+    if (index > 63) {
+      return 0;
+    }
+    return value & ((uint64_t(1) << index) - 1);
+  }
 };
 
 struct Nehalem : public Default {
@@ -94,12 +102,12 @@
 
 struct Haswell : public Nehalem {
   static bool supported(const folly::CpuId& cpuId = {}) {
-    return Nehalem::supported(cpuId) && cpuId.bmi1();
+    return Nehalem::supported(cpuId) && cpuId.bmi1() && cpuId.bmi2();
   }
 
   static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) {
 // BMI1 is supported starting with Intel Haswell, AMD Piledriver.
-// BLSR combines two instuctions into one and reduces register pressure.
+// BLSR combines two instructions into one and reduces register pressure.
 #if defined(__GNUC__) || defined(__clang__)
     // GCC and Clang won't inline the intrinsics.
     uint64_t result;
@@ -124,6 +132,18 @@ struct Haswell : public Nehalem {
     return result;
 #else
     return _bextr_u64(value, start, length);
+#endif
+  }
+
+  static FOLLY_ALWAYS_INLINE uint64_t bzhi(uint64_t value, uint32_t index) {
+#if defined(__GNUC__) || defined(__clang__)
+    // GCC and Clang won't inline the intrinsics.
+    const uint64_t index64 = index;
+    uint64_t result;
+    asm("bzhiq %2, %1, %0" : "=r"(result) : "r"(value), "r"(index64));
+    return result;
+#else
+    return _bzhi_u64(value, index);
 #endif
   }
 };
-- 
2.34.1
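
Illustrative note, not part of the patch: a minimal standalone sketch of the idea above, assuming a little-endian host. It contrasts reading a packed lower-bits value with a precomputed mask against clearing the high bits at access time with a portable bzhi-style helper, which a compiler targeting BMI2 can lower to a single `BZHI`. All names here (`loadUnaligned64`, `bzhiFallback`, `lowerAtWithMask`, `lowerAtWithBzhi`) are hypothetical stand-ins, not folly APIs.

```cpp
// Hedged sketch: the old masked read versus a bzhi-style read, outside folly.
// Assumes a little-endian host; names are illustrative, not folly's.
#include <cstdint>
#include <cstring>
#include <iostream>

// Stand-in for an unaligned 64-bit load (folly uses folly::loadUnaligned).
inline uint64_t loadUnaligned64(const unsigned char* p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}

// Portable counterpart of the Default::bzhi fallback added by the patch:
// keep only the low `index` bits (callers guarantee index < 64).
inline uint64_t bzhiFallback(uint64_t value, uint32_t index) {
  return value & ((uint64_t(1) << index) - 1);
}

// Old approach: the reader stores lowerMask = (1 << numLowerBits) - 1.
inline uint64_t lowerAtWithMask(const unsigned char* lower,
                                size_t i,
                                uint32_t numLowerBits,
                                uint64_t lowerMask) {
  const size_t pos = i * numLowerBits;
  const uint64_t word = loadUnaligned64(lower + pos / 8);
  return lowerMask & (word >> (pos % 8));
}

// New approach: no stored mask; the high bits are cleared on access. The
// mask computation hoists out of a search loop, and with BMI2 enabled the
// helper can compile down to a single BZHI instruction.
inline uint64_t lowerAtWithBzhi(const unsigned char* lower,
                                size_t i,
                                uint32_t numLowerBits) {
  const size_t pos = i * numLowerBits;
  const uint64_t word = loadUnaligned64(lower + pos / 8);
  return bzhiFallback(word >> (pos % 8), numLowerBits);
}

int main() {
  // Three 5-bit values (9, 22, 31) packed LSB-first into two bytes, with
  // zero padding so the 8-byte unaligned loads stay in bounds.
  const unsigned char packed[16] = {0xC9, 0x7E};
  const uint32_t numLowerBits = 5;
  const uint64_t mask = (uint64_t(1) << numLowerBits) - 1;
  for (size_t i = 0; i < 3; ++i) {
    std::cout << lowerAtWithMask(packed, i, numLowerBits, mask) << " "
              << lowerAtWithBzhi(packed, i, numLowerBits) << "\n";
  }
  return 0;
}
```

Both reads should produce the same values (9, 22, 31); the only difference is that the second needs no stored mask member, which is what the patch removes from EliasFanoReader.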