From b28186247104f8b90cfbe094d289c91f9e413317 Mon Sep 17 00:00:00 2001 From: Dmitry Pleshkov Date: Wed, 20 Jan 2016 22:23:30 -0800 Subject: [PATCH] Haswell-specific implementation of Select64 routine. Summary: k-th bit selection could be efficiently implemented via new BMI2 instruction set. Reviewed By: ot, philippv Differential Revision: D2843311 fb-gh-sync-id: 4c0cf52176a03422aef276ce5f677080f67f5fdf --- folly/experimental/Select64.h | 16 +++++ .../experimental/test/EliasFanoCodingTest.cpp | 64 +++++++++---------- 2 files changed, 48 insertions(+), 32 deletions(-) diff --git a/folly/experimental/Select64.h b/folly/experimental/Select64.h index 4e29e3c7..805fa4a0 100644 --- a/folly/experimental/Select64.h +++ b/folly/experimental/Select64.h @@ -19,6 +19,8 @@ #include +#include + namespace folly { namespace detail { @@ -61,5 +63,19 @@ inline uint64_t select64(uint64_t x, uint64_t k) { return place + detail::kSelectInByte[((x >> place) & 0xFF) | (byteRank << 8)]; } +template <> +inline uint64_t select64(uint64_t x, + uint64_t k) { + uint64_t result = uint64_t(1) << k; + + asm("pdep %1, %0, %0\n\t" + "tzcnt %0, %0" + : "+r"(result) + : "r"(x)); + + return result; } + +} // namespace folly + #endif diff --git a/folly/experimental/test/EliasFanoCodingTest.cpp b/folly/experimental/test/EliasFanoCodingTest.cpp index e02adca8..16482e09 100644 --- a/folly/experimental/test/EliasFanoCodingTest.cpp +++ b/folly/experimental/test/EliasFanoCodingTest.cpp @@ -69,6 +69,19 @@ TEST_F(EliasFanoCodingTest, SkipForwardPointers) { doTestAll<128, 128>(); } +TEST_F(EliasFanoCodingTest, Select64) { + typedef instructions::EF_TEST_ARCH instr; + constexpr uint64_t kPrime = uint64_t(-59); + for (uint64_t x = kPrime, i = 0; i < (1 << 20); x *= kPrime, i += 1) { + size_t w = instr::popcount(x); + for (size_t k = 0; k < w; ++k) { + auto pos = folly::select64(x, k); + CHECK_EQ((x >> pos) & 1, 1); + CHECK_EQ(instr::popcount(x & ((uint64_t(1) << pos) - 1)), k); + } + } +} + namespace bm { constexpr size_t k1M = 1000000; @@ -158,46 +171,33 @@ BENCHMARK(Encode) { list.free(); } -BENCHMARK_DRAW_LINE(); - -BENCHMARK(Select64, iters) { - typedef instructions::EF_TEST_ARCH instr; - constexpr uint64_t kPrime = uint64_t(-59); - for (uint64_t x = kPrime, i = 0; i < iters; x *= kPrime, i += 1) { - size_t w = instr::popcount(x); - folly::doNotOptimizeAway(folly::select64(x, w - 1)); - } -} - #if 0 Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz (turbo off), using instructions::Haswell and GCC 4.9 with --bm_min_usec 100000. ============================================================================ folly/experimental/test/EliasFanoCodingTest.cpp relative time/iter iters/s ============================================================================ -Next 2.52ns 397.28M -Skip_ForwardQ128(1) 3.92ns 255.28M -Skip_ForwardQ128(2) 5.08ns 197.04M -Skip_ForwardQ128(4_pm_1) 7.04ns 142.02M -Skip_ForwardQ128(16_pm_4) 19.68ns 50.82M -Skip_ForwardQ128(64_pm_16) 27.58ns 36.26M -Skip_ForwardQ128(256_pm_64) 32.49ns 30.78M -Skip_ForwardQ128(1024_pm_256) 33.39ns 29.95M -Jump_ForwardQ128 34.05ns 29.37M ----------------------------------------------------------------------------- -SkipTo_SkipQ128(1) 4.42ns 226.49M -SkipTo_SkipQ128(2) 8.58ns 116.55M -SkipTo_SkipQ128(4_pm_1) 11.43ns 87.50M -SkipTo_SkipQ128(16_pm_4) 31.19ns 32.06M -SkipTo_SkipQ128(64_pm_16) 43.88ns 22.79M -SkipTo_SkipQ128(256_pm_64) 49.08ns 20.37M -SkipTo_SkipQ128(1024_pm_256) 52.24ns 19.14M -JumpTo_SkipQ128 54.61ns 18.31M +Next 2.59ns 386.60M +Skip_ForwardQ128(1) 4.03ns 248.16M +Skip_ForwardQ128(2) 5.28ns 189.39M +Skip_ForwardQ128(4_pm_1) 7.48ns 133.75M +Skip_ForwardQ128(16_pm_4) 20.28ns 49.32M +Skip_ForwardQ128(64_pm_16) 28.19ns 35.47M +Skip_ForwardQ128(256_pm_64) 31.99ns 31.26M +Skip_ForwardQ128(1024_pm_256) 32.51ns 30.76M +Jump_ForwardQ128 33.77ns 29.61M ---------------------------------------------------------------------------- -Encode_10 117.24ns 8.53M -Encode 5.64ms 177.15 +SkipTo_SkipQ128(1) 4.34ns 230.66M +SkipTo_SkipQ128(2) 8.90ns 112.38M +SkipTo_SkipQ128(4_pm_1) 12.12ns 82.49M +SkipTo_SkipQ128(16_pm_4) 32.52ns 30.75M +SkipTo_SkipQ128(64_pm_16) 44.82ns 22.31M +SkipTo_SkipQ128(256_pm_64) 49.52ns 20.19M +SkipTo_SkipQ128(1024_pm_256) 52.88ns 18.91M +JumpTo_SkipQ128 54.65ns 18.30M ---------------------------------------------------------------------------- -Select64 8.04ns 124.35M +Encode_10 98.70ns 10.13M +Encode 5.48ms 182.33 ============================================================================ #endif -- 2.34.1