From c2c47da2bc90ddc852cc102aaad562cb15229479 Mon Sep 17 00:00:00 2001 From: Giuseppe Ottaviano Date: Thu, 12 Feb 2015 13:34:47 -0800 Subject: [PATCH] Elias-Fano micro-optimizations Summary: Short skips have been optimized by adding special cases that use simple iteration when it is convenient. Large skips have been optimized by using the broadword selection algorithm by Vigna (improved with ideas by Gog&Petri) instead of iterating on the zeros/ones of the upper bits. The benchmarks had to be made more granular to measure the differences, in particular they used to test skipping with a fixed skip length for each test, while now we average over a range of skips to better simulate a random distribution. The improvements are very significant for `skipTo()` on short skips, about 2-3x for skips at distance 1 or 2, which can occur when intersecting dense lists. On large skips the gain is about 17%. `skip()` exhibits slightly smaller improvements. before after ============================================================================ ================== folly/experimental/test/EliasFanoCodingTest.cpp relative time/iter iters/s time/iter iters/s ============================================================================ ================== Next 2.52ns 396.26M 2.52ns 397.28M Skip_ForwardQ128(1) 8.66ns 115.52M 3.92ns 255.28M Skip_ForwardQ128(2) 8.37ns 119.42M 5.08ns 197.04M Skip_ForwardQ128(4_pm_1) 9.67ns 103.41M 7.04ns 142.02M Skip_ForwardQ128(16_pm_4) 21.44ns 46.65M 19.68ns 50.82M Skip_ForwardQ128(64_pm_16) 30.86ns 32.40M 27.58ns 36.26M Skip_ForwardQ128(256_pm_64) 37.80ns 26.45M 32.49ns 30.78M Skip_ForwardQ128(1024_pm_256) 38.99ns 25.65M 33.39ns 29.95M Jump_ForwardQ128 37.91ns 26.37M 34.05ns 29.37M ---------------------------------------------------------------------------- ------------------ SkipTo_SkipQ128(1) 13.87ns 72.10M 4.42ns 226.49M SkipTo_SkipQ128(2) 18.80ns 53.20M 8.58ns 116.55M SkipTo_SkipQ128(4_pm_1) 23.59ns 42.38M 11.43ns 87.50M SkipTo_SkipQ128(16_pm_4) 36.04ns 27.74M 31.19ns 32.06M SkipTo_SkipQ128(64_pm_16) 53.34ns 18.75M 43.88ns 22.79M SkipTo_SkipQ128(256_pm_64) 62.27ns 16.06M 49.08ns 20.37M SkipTo_SkipQ128(1024_pm_256) 65.63ns 15.24M 52.24ns 19.14M JumpTo_SkipQ128 65.89ns 15.18M 54.61ns 18.31M ---------------------------------------------------------------------------- ------------------ Encode_10 111.94ns 8.93M 117.24ns 8.53M Encode 5.35ms 187.02 5.64ms 177.15 ---------------------------------------------------------------------------- ------------------ Select64 8.07ns 123.96M 8.04ns 124.35M ============================================================================ ================== Test Plan: fbconfig folly/experimental/test:eliasfano_test && fbmake runtests_opt Reviewed By: philipp@fb.com Subscribers: yfeldblum, fbcode-common-diffs@, chaoyc, search-fbcode-diffs@, unicorn-diffs@, trunkagent, folly-diffs@ FB internal diff: D1793554 Signature: t1:1793554:1423619344:1b078c0789639f317342ebcc77b11fe91859cd7b --- folly/Makefile.am | 2 + folly/experimental/EliasFanoCoding.h | 62 +++++---- folly/experimental/Select64.cpp | 105 ++++++++++++++ folly/experimental/Select64.h | 65 +++++++++ folly/experimental/test/CodingTestUtils.h | 92 ++++++++----- .../experimental/test/EliasFanoCodingTest.cpp | 128 ++++++++++-------- 6 files changed, 337 insertions(+), 117 deletions(-) create mode 100644 folly/experimental/Select64.cpp create mode 100644 folly/experimental/Select64.h diff --git a/folly/Makefile.am b/folly/Makefile.am index ac5fad4c..a8d54828 100644 --- a/folly/Makefile.am +++ b/folly/Makefile.am @@ -75,6 +75,7 @@ nobase_follyinclude_HEADERS = \ experimental/Singleton.h \ experimental/Singleton-inl.h \ experimental/TestUtil.h \ + experimental/Select64.h \ FBString.h \ FBVector.h \ File.h \ @@ -335,6 +336,7 @@ libfolly_la_SOURCES = \ experimental/io/FsUtil.cpp \ experimental/Singleton.cpp \ experimental/TestUtil.cpp \ + experimental/Select64.cpp \ wangle/acceptor/Acceptor.cpp \ wangle/acceptor/ConnectionManager.cpp \ wangle/acceptor/LoadShedConfiguration.cpp \ diff --git a/folly/experimental/EliasFanoCoding.h b/folly/experimental/EliasFanoCoding.h index c44fb6f6..61335819 100644 --- a/folly/experimental/EliasFanoCoding.h +++ b/folly/experimental/EliasFanoCoding.h @@ -42,6 +42,7 @@ #include #include #include +#include #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ #error EliasFanoCoding.h requires little endianness @@ -405,18 +406,10 @@ class UpperBitsReader { block_ = folly::loadUnaligned(start_ + outer_); } - // NOTE: Trying to skip half-block here didn't show any - // performance improvements. - + // Skip to the n-th one in the block. DCHECK_GT(n, 0); - - // Kill n - 1 least significant 1-bits. - for (size_t i = 0; i < n - 1; ++i) { - block_ = Instructions::blsr(block_); - } - - inner_ = Instructions::ctz(block_); - block_ = Instructions::blsr(block_); + inner_ = select64(block_, n - 1); + block_ &= (block_t(-1) << inner_) << 1; return setValue(); } @@ -463,16 +456,13 @@ class UpperBitsReader { block_ = folly::loadUnaligned(start_ + outer_); } - // Try to skip half-block. - constexpr size_t kBitsPerHalfBlock = 4 * sizeof(block_t); - constexpr block_t halfBlockMask = (block_t(1) << kBitsPerHalfBlock) - 1; - if ((cnt = Instructions::popcount(~block_ & halfBlockMask)) < skip) { - position_ += kBitsPerHalfBlock - cnt; - block_ &= ~halfBlockMask; + if (LIKELY(skip)) { + auto inner = select64(~block_, skip - 1); + position_ += inner - skip + 1; + block_ &= block_t(-1) << inner; } - // Just skip until we see expected value. - while (next() < v) { } + next(); return value_; } @@ -567,8 +557,13 @@ class EliasFanoReader : private boost::noncopyable { progress_ += n; if (LIKELY(progress_ <= list_.size)) { + if (LIKELY(n < kLinearScanThreshold)) { + for (size_t i = 0; i < n; ++i) upper_.next(); + } else { + upper_.skip(n); + } value_ = readLowerPart(progress_ - 1) | - (upper_.skip(n) << list_.numLowerBits); + (upper_.value() << list_.numLowerBits); return true; } @@ -587,7 +582,18 @@ class EliasFanoReader : private boost::noncopyable { return false; } - upper_.skipToNext(value >> list_.numLowerBits); + size_t upperValue = (value >> list_.numLowerBits); + size_t upperSkip = upperValue - upper_.value(); + // The average density of ones in upper bits is 1/2. + // LIKELY here seems to make things worse, even for small skips. + if (upperSkip < 2 * kLinearScanThreshold) { + do { + upper_.next(); + } while (UNLIKELY(upper_.value() < upperValue)); + } else { + upper_.skipToNext(upperValue); + } + iterateTo(value); return true; } @@ -636,15 +642,17 @@ class EliasFanoReader : private boost::noncopyable { } void iterateTo(ValueType value) { - progress_ = upper_.position(); - value_ = readLowerPart(progress_) | (upper_.value() << list_.numLowerBits); - ++progress_; - while (value_ < value) { - value_ = readLowerPart(progress_) | (upper_.next() << list_.numLowerBits); - ++progress_; + while (true) { + value_ = readLowerPart(upper_.position()) | + (upper_.value() << list_.numLowerBits); + if (LIKELY(value_ >= value)) break; + upper_.next(); } + progress_ = upper_.position() + 1; } + constexpr static size_t kLinearScanThreshold = 8; + const EliasFanoCompressedList list_; const ValueType lowerMask_; detail::UpperBitsReader upper_; diff --git a/folly/experimental/Select64.cpp b/folly/experimental/Select64.cpp new file mode 100644 index 00000000..fba8633e --- /dev/null +++ b/folly/experimental/Select64.cpp @@ -0,0 +1,105 @@ +/* + * Copyright 2015 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace folly { namespace detail { + +const uint8_t kSelectInByte[2048] = { + 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, + 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, + 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, + 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, + 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, + 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, + 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, + 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8, 8, 8, 1, + 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, + 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, + 4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, + 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, + 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7, 1, 7, 2, + 2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, + 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, + 3, 1, 3, 2, 2, 1, 7, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, + 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, + 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 8, 8, 8, 8, 8, 8, 2, + 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8, + 8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3, 2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, + 4, 3, 3, 2, 8, 8, 8, 6, 8, 6, 6, 2, 8, 6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, + 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, + 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8, 7, 7, 2, 8, 7, + 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4, 2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, + 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2, 7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, + 3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, + 6, 4, 4, 3, 4, 3, 3, 2, 7, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, + 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, + 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8, 8, 8, 5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3, + 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6, + 6, 4, 6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, + 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, + 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5, + 8, 7, 7, 5, 7, 5, 5, 3, 8, 7, 7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, + 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, + 6, 4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5, + 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, + 6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, + 8, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, + 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, + 8, 7, 7, 5, 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, + 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4, + 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, + 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, + 8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, + 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, + 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, + 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7 +}; + +}} diff --git a/folly/experimental/Select64.h b/folly/experimental/Select64.h new file mode 100644 index 00000000..4e29e3c7 --- /dev/null +++ b/folly/experimental/Select64.h @@ -0,0 +1,65 @@ +/* + * Copyright 2015 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FOLLY_EXPERIMENTAL_SELECT64_H +#define FOLLY_EXPERIMENTAL_SELECT64_H + +#include + +namespace folly { + +namespace detail { +extern const uint8_t kSelectInByte[2048]; +} + +/** + * Returns the position of the k-th 1 in the 64-bit word x. + * k is 0-based, so k=0 returns the position of the first 1. + * + * Uses the broadword selection algorithm by Vigna [1], improved by Gog + * and Petri [2] and Vigna [3]. + * + * [1] Sebastiano Vigna. Broadword Implementation of Rank/Select + * Queries. WEA, 2008 + * + * [2] Simon Gog, Matthias Petri. Optimized succinct data structures + * for massive data. Softw. Pract. Exper., 2014 + * + * [3] Sebastiano Vigna. MG4J 5.2.1. http://mg4j.di.unimi.it/ +*/ +template +inline uint64_t select64(uint64_t x, uint64_t k) { + DCHECK_LT(k, Instructions::popcount(x)); + + constexpr uint64_t kOnesStep4 = 0x1111111111111111ULL; + constexpr uint64_t kOnesStep8 = 0x0101010101010101ULL; + constexpr uint64_t kMSBsStep8 = 0x80ULL * kOnesStep8; + + auto s = x; + s = s - ((s & 0xA * kOnesStep4) >> 1); + s = (s & 0x3 * kOnesStep4) + ((s >> 2) & 0x3 * kOnesStep4); + s = (s + (s >> 4)) & 0xF * kOnesStep8; + uint64_t byteSums = s * kOnesStep8; + + uint64_t kStep8 = k * kOnesStep8; + uint64_t geqKStep8 = (((kStep8 | kMSBsStep8) - byteSums) & kMSBsStep8); + uint64_t place = Instructions::popcount(geqKStep8) * 8; + uint64_t byteRank = k - (((byteSums << 8) >> place) & uint64_t(0xFF)); + return place + detail::kSelectInByte[((x >> place) & 0xFF) | (byteRank << 8)]; +} + +} +#endif diff --git a/folly/experimental/test/CodingTestUtils.h b/folly/experimental/test/CodingTestUtils.h index e8b81b0f..2d686b95 100644 --- a/folly/experimental/test/CodingTestUtils.h +++ b/folly/experimental/test/CodingTestUtils.h @@ -80,9 +80,11 @@ void testNext(const std::vector& data, const List& list) { for (size_t i = 0; i < data.size(); ++i) { EXPECT_TRUE(reader.next()); EXPECT_EQ(reader.value(), data[i]); + EXPECT_EQ(reader.position(), i); } EXPECT_FALSE(reader.next()); EXPECT_EQ(reader.value(), std::numeric_limits::max()); + EXPECT_EQ(reader.position(), reader.size() - 1); } template @@ -94,9 +96,11 @@ void testSkip(const std::vector& data, const List& list, for (size_t i = skipStep - 1; i < data.size(); i += skipStep) { EXPECT_TRUE(reader.skip(skipStep)); EXPECT_EQ(reader.value(), data[i]); + EXPECT_EQ(reader.position(), i); } EXPECT_FALSE(reader.skip(skipStep)); EXPECT_EQ(reader.value(), std::numeric_limits::max()); + EXPECT_EQ(reader.position(), reader.size() - 1); EXPECT_FALSE(reader.next()); } @@ -132,6 +136,7 @@ void testSkipTo(const std::vector& data, const List& list, value = reader.value() + delta; } EXPECT_EQ(reader.value(), std::numeric_limits::max()); + EXPECT_EQ(reader.position(), reader.size() - 1); EXPECT_FALSE(reader.next()); } @@ -148,6 +153,7 @@ void testSkipTo(const std::vector& data, const List& list) { Reader reader(list); EXPECT_FALSE(reader.skipTo(data.back() + 1)); EXPECT_EQ(reader.value(), std::numeric_limits::max()); + EXPECT_EQ(reader.position(), reader.size() - 1); EXPECT_FALSE(reader.next()); } } @@ -170,9 +176,11 @@ void testJump(const std::vector& data, const List& list) { for (auto i : is) { EXPECT_TRUE(reader.jump(i + 1)); EXPECT_EQ(reader.value(), data[i]); + EXPECT_EQ(reader.position(), i); } EXPECT_FALSE(reader.jump(data.size() + 1)); EXPECT_EQ(reader.value(), std::numeric_limits::max()); + EXPECT_EQ(reader.position(), reader.size() - 1); } template @@ -190,21 +198,26 @@ void testJumpTo(const std::vector& data, const List& list) { CHECK(it != data.end()); EXPECT_TRUE(reader.jumpTo(value)); EXPECT_EQ(reader.value(), *it); + EXPECT_EQ(reader.position(), std::distance(data.begin(), it)); } EXPECT_TRUE(reader.jumpTo(0)); EXPECT_EQ(reader.value(), 0); + EXPECT_EQ(reader.position(), -1); if (data.front() > 0) { EXPECT_TRUE(reader.jumpTo(1)); EXPECT_EQ(reader.value(), data.front()); + EXPECT_EQ(reader.position(), 0); } EXPECT_TRUE(reader.jumpTo(data.back())); EXPECT_EQ(reader.value(), data.back()); + EXPECT_EQ(reader.position(), reader.size() - 1); EXPECT_FALSE(reader.jumpTo(data.back() + 1)); EXPECT_EQ(reader.value(), std::numeric_limits::max()); + EXPECT_EQ(reader.position(), reader.size() - 1); } template @@ -242,46 +255,57 @@ void testAll(const std::vector& data) { } template -void bmNext(const List& list, const std::vector& data, - size_t iters) { +void bmNext(const List& list, const std::vector& data, size_t iters) { if (data.empty()) { return; } - for (size_t i = 0, j; i < iters; ) { - Reader reader(list); - for (j = 0; reader.next(); ++j, ++i) { - CHECK_EQ(reader.value(), data[j]) << j; + + Reader reader(list); + for (size_t i = 0; i < iters; ++i) { + if (LIKELY(reader.next())) { + folly::doNotOptimizeAway(reader.value()); + } else { + reader.reset(); } } } template void bmSkip(const List& list, const std::vector& data, - size_t skip, size_t iters) { - if (skip >= data.size()) { - return; - } - for (size_t i = 0, j; i < iters; ) { - Reader reader(list); - for (j = skip - 1; j < data.size(); j += skip, ++i) { - reader.skip(skip); - CHECK_EQ(reader.value(), data[j]); + size_t logAvgSkip, size_t iters) { + size_t avg = (size_t(1) << logAvgSkip); + size_t base = avg - (avg >> 2); + size_t mask = (avg > 1) ? (avg >> 1) - 1 : 0; + + Reader reader(list); + for (size_t i = 0; i < iters; ++i) { + size_t skip = base + (i & mask); + if (LIKELY(reader.skip(skip))) { + folly::doNotOptimizeAway(reader.value()); + } else { + reader.reset(); } } } template void bmSkipTo(const List& list, const std::vector& data, - size_t skip, size_t iters) { - if (skip >= data.size()) { - return; - } - for (size_t i = 0, j; i < iters; ) { - Reader reader(list); - for (j = 0; j < data.size(); j += skip, ++i) { - reader.skipTo(data[j]); - CHECK_EQ(reader.value(), data[j]); + size_t logAvgSkip, size_t iters) { + size_t avg = (size_t(1) << logAvgSkip); + size_t base = avg - (avg >> 2); + size_t mask = (avg > 1) ? (avg >> 1) - 1 : 0; + + Reader reader(list); + for (size_t i = 0, j = -1; i < iters; ++i) { + size_t skip = base + (i & mask); + j += skip; + if (j >= data.size()) { + reader.reset(); + j = -1; } + + reader.skipTo(data[j]); + folly::doNotOptimizeAway(reader.value()); } } @@ -292,12 +316,10 @@ void bmJump(const List& list, const std::vector& data, CHECK_EQ(data.size(), order.size()); Reader reader(list); - for (size_t i = 0; i < iters; ) { - for (size_t j : order) { - reader.jump(j + 1); - CHECK_EQ(reader.value(), data[j]); - ++i; - } + for (size_t i = 0, j = 0; i < iters; ++i, ++j) { + if (j == order.size()) j = 0; + reader.jump(order[j]); + folly::doNotOptimizeAway(reader.value()); } } @@ -308,12 +330,10 @@ void bmJumpTo(const List& list, const std::vector& data, CHECK_EQ(data.size(), order.size()); Reader reader(list); - for (size_t i = 0; i < iters; ) { - for (size_t j : order) { - reader.jumpTo(data[j]); - CHECK_EQ(reader.value(), data[j]); - ++i; - } + for (size_t i = 0, j = 0; i < iters; ++i, ++j) { + if (j == order.size()) j = 0; + reader.jumpTo(data[order[j]]); + folly::doNotOptimizeAway(reader.value()); } } diff --git a/folly/experimental/test/EliasFanoCodingTest.cpp b/folly/experimental/test/EliasFanoCodingTest.cpp index 565d4e27..4af8db9f 100644 --- a/folly/experimental/test/EliasFanoCodingTest.cpp +++ b/folly/experimental/test/EliasFanoCodingTest.cpp @@ -15,15 +15,25 @@ */ #include +#include #include #include #include #include +#include #include using namespace folly::compression; +#if defined(EF_TEST_NEHALEM) +#define EF_TEST_ARCH Nehalem +#elif defined(EF_TEST_HASWELL) +#define EF_TEST_ARCH Haswell +#else +#define EF_TEST_ARCH Default +#endif + template struct TestType { static constexpr size_t Version = kVersion; @@ -42,7 +52,7 @@ class EliasFanoCodingTest : public ::testing::Test { void doTestAll() { typedef EliasFanoEncoder< uint32_t, uint32_t, kSkipQuantum, kForwardQuantum, T::Version> Encoder; - typedef EliasFanoReader Reader; + typedef EliasFanoReader Reader; testAll(generateRandomList(100 * 1000, 10 * 1000 * 1000)); testAll(generateSeqList(1, 100000, 100)); } @@ -94,11 +104,8 @@ void init() { //data = loadList("/home/philipp/pl_test_dump.txt"); list = Encoder::encode(data.begin(), data.end()); - order.clear(); - order.reserve(data.size()); - for (size_t i = 0; i < data.size(); ++i) { - order.push_back(i); - } + order.resize(data.size()); + std::iota(order.begin(), order.end(), size_t()); std::shuffle(order.begin(), order.end(), gen); encodeSmallData = generateRandomList(10, 100 * 1000, gen); @@ -111,50 +118,44 @@ void free() { } // namespace bm -BENCHMARK(Next_1M) { - bmNext(bm::list, bm::data, bm::k1M); +BENCHMARK(Next, iters) { + bmNext(bm::list, bm::data, iters); } -BENCHMARK(Skip1_ForwarQ128_1M) { - bmSkip(bm::list, bm::data, 1, bm::k1M); +size_t Skip_ForwardQ128(size_t iters, size_t logAvgSkip) { + bmSkip(bm::list, bm::data, logAvgSkip, iters); + return iters; } -BENCHMARK(Skip10_ForwarQ128_1M) { - bmSkip(bm::list, bm::data, 10, bm::k1M); -} +BENCHMARK_NAMED_PARAM_MULTI(Skip_ForwardQ128, 1, 0) +BENCHMARK_NAMED_PARAM_MULTI(Skip_ForwardQ128, 2, 1) +BENCHMARK_NAMED_PARAM_MULTI(Skip_ForwardQ128, 4_pm_1, 2) +BENCHMARK_NAMED_PARAM_MULTI(Skip_ForwardQ128, 16_pm_4, 4) +BENCHMARK_NAMED_PARAM_MULTI(Skip_ForwardQ128, 64_pm_16, 6) +BENCHMARK_NAMED_PARAM_MULTI(Skip_ForwardQ128, 256_pm_64, 8) +BENCHMARK_NAMED_PARAM_MULTI(Skip_ForwardQ128, 1024_pm_256, 10) -BENCHMARK(Skip100_ForwardQ128_1M) { - bmSkip(bm::list, bm::data, 100, bm::k1M); -} - -BENCHMARK(Skip1000_ForwardQ128_1M) { - bmSkip(bm::list, bm::data, 1000, bm::k1M); -} - -BENCHMARK(Jump_ForwardQ128_1M) { - bmJump(bm::list, bm::data, bm::order, bm::k1M); +BENCHMARK(Jump_ForwardQ128, iters) { + bmJump(bm::list, bm::data, bm::order, iters); } BENCHMARK_DRAW_LINE(); -BENCHMARK(SkipTo1_SkipQ128_1M) { - bmSkipTo(bm::list, bm::data, 1, bm::k1M); +size_t SkipTo_SkipQ128(size_t iters, size_t logAvgSkip) { + bmSkipTo(bm::list, bm::data, logAvgSkip, iters); + return iters; } -BENCHMARK(SkipTo10_SkipQ128_1M) { - bmSkipTo(bm::list, bm::data, 10, bm::k1M); -} +BENCHMARK_NAMED_PARAM_MULTI(SkipTo_SkipQ128, 1, 0) +BENCHMARK_NAMED_PARAM_MULTI(SkipTo_SkipQ128, 2, 1) +BENCHMARK_NAMED_PARAM_MULTI(SkipTo_SkipQ128, 4_pm_1, 2) +BENCHMARK_NAMED_PARAM_MULTI(SkipTo_SkipQ128, 16_pm_4, 4) +BENCHMARK_NAMED_PARAM_MULTI(SkipTo_SkipQ128, 64_pm_16, 6) +BENCHMARK_NAMED_PARAM_MULTI(SkipTo_SkipQ128, 256_pm_64, 8) +BENCHMARK_NAMED_PARAM_MULTI(SkipTo_SkipQ128, 1024_pm_256, 10) -BENCHMARK(SkipTo100_SkipQ128_1M) { - bmSkipTo(bm::list, bm::data, 100, bm::k1M); -} - -BENCHMARK(SkipTo1000_SkipQ128_1M) { - bmSkipTo(bm::list, bm::data, 1000, bm::k1M); -} - -BENCHMARK(JumpTo_SkipQ128_1M) { - bmJumpTo(bm::list, bm::data, bm::order, bm::k1M); +BENCHMARK(JumpTo_SkipQ128, iters) { + bmJumpTo(bm::list, bm::data, bm::order, iters); } BENCHMARK_DRAW_LINE(); @@ -165,33 +166,52 @@ BENCHMARK(Encode_10) { list.free(); } -BENCHMARK(Encode_1M) { +BENCHMARK(Encode) { auto list = bm::Encoder::encode(bm::encodeLargeData.begin(), bm::encodeLargeData.end()); list.free(); } -#if 0 -Intel(R) Xeon(R) CPU E5-2660 @ 2.7GHz (turbo on), using instructions::Nehalem. +BENCHMARK_DRAW_LINE(); +BENCHMARK(Select64, iters) { + typedef instructions::EF_TEST_ARCH instr; + constexpr uint64_t kPrime = uint64_t(-59); + for (uint64_t x = kPrime, i = 0; i < iters; x *= kPrime, i += 1) { + size_t w = instr::popcount(x); + folly::doNotOptimizeAway(folly::select64(x, w - 1)); + } +} + +#if 0 +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz (turbo off), +using instructions::Haswell and GCC 4.9 with --bm_min_usec 100000. ============================================================================ folly/experimental/test/EliasFanoCodingTest.cpp relative time/iter iters/s ============================================================================ -Next_1M 4.61ms 216.70 -Skip1_ForwarQ128_1M 5.33ms 187.71 -Skip10_ForwarQ128_1M 14.23ms 70.26 -Skip100_ForwardQ128_1M 29.10ms 34.37 -Skip1000_ForwardQ128_1M 21.15ms 47.28 -Jump_ForwardQ128_1M 46.30ms 21.60 +Next 2.52ns 397.28M +Skip_ForwardQ128(1) 3.92ns 255.28M +Skip_ForwardQ128(2) 5.08ns 197.04M +Skip_ForwardQ128(4_pm_1) 7.04ns 142.02M +Skip_ForwardQ128(16_pm_4) 19.68ns 50.82M +Skip_ForwardQ128(64_pm_16) 27.58ns 36.26M +Skip_ForwardQ128(256_pm_64) 32.49ns 30.78M +Skip_ForwardQ128(1024_pm_256) 33.39ns 29.95M +Jump_ForwardQ128 34.05ns 29.37M +---------------------------------------------------------------------------- +SkipTo_SkipQ128(1) 4.42ns 226.49M +SkipTo_SkipQ128(2) 8.58ns 116.55M +SkipTo_SkipQ128(4_pm_1) 11.43ns 87.50M +SkipTo_SkipQ128(16_pm_4) 31.19ns 32.06M +SkipTo_SkipQ128(64_pm_16) 43.88ns 22.79M +SkipTo_SkipQ128(256_pm_64) 49.08ns 20.37M +SkipTo_SkipQ128(1024_pm_256) 52.24ns 19.14M +JumpTo_SkipQ128 54.61ns 18.31M ---------------------------------------------------------------------------- -SkipTo1_SkipQ128_1M 12.03ms 83.15 -SkipTo10_SkipQ128_1M 36.11ms 27.69 -SkipTo100_SkipQ128_1M 42.91ms 23.30 -SkipTo1000_SkipQ128_1M 36.92ms 27.08 -JumpTo_SkipQ128_1M 78.51ms 12.74 +Encode_10 117.24ns 8.53M +Encode 5.64ms 177.15 ---------------------------------------------------------------------------- -Encode_10 199.19ns 5.02M -Encode_1M 8.82ms 113.37 +Select64 8.04ns 124.35M ============================================================================ #endif -- 2.34.1