From 3c1306e1e2892142a121d715db0ef8090c69a289 Mon Sep 17 00:00:00 2001 From: Philip Pronin Date: Tue, 4 Nov 2014 10:21:34 -0800 Subject: [PATCH] BMI1 support in EliasFanoCoding Summary: This diff updates `folly::CpuId` with support of extended features (EAX = 7, ECX = 0) to provide detection logic for BMI1 introduced in Haswell, and provides support for `BLSR` instruction in `EliasFanoReader`. Test Plan: I used clang to compile the logic and run unittests Reviewed By: lucian@fb.com Subscribers: fbcode-common-diffs@, trunkagent, chaoyc, search-fbcode-diffs@, unicorn-diffs@, njormrod, folly-diffs@ FB internal diff: D1658100 Signature: t1:1658100:1415126635:d1820b8eb41c9e9786b5c8062b801cf1e2049a97 --- folly/CpuId.h | 88 ++++++++++++++----- folly/experimental/EliasFanoCoding.h | 30 +++++-- .../experimental/test/EliasFanoCodingTest.cpp | 2 +- 3 files changed, 89 insertions(+), 31 deletions(-) diff --git a/folly/CpuId.h b/folly/CpuId.h index e3c50674..d35d3d34 100644 --- a/folly/CpuId.h +++ b/folly/CpuId.h @@ -24,7 +24,7 @@ namespace folly { /** * Identification of an Intel CPU. - * Supports CPUID (EAX=1) feature flags. + * Supports CPUID feature flags (EAX=1) and extended features (EAX=7, ECX=0). 
* Values from http://www.intel.com/content/www/us/en/processors/processor-identification-cpuid-instruction-note.html */ class CpuId { @@ -32,23 +32,34 @@ class CpuId { CpuId() { #ifdef _MSC_VER int reg[4]; - - __cpuid((int *)reg, 1); - c_ = reg[2]; - d_ = reg[3]; - + __cpuid(static_cast<int*>(reg), 0); + const int n = reg[0]; + if (n >= 1) { + __cpuid(static_cast<int*>(reg), 1); + f1c_ = reg[2]; + f1d_ = reg[3]; + } + if (n >= 7) { + __cpuidex(static_cast<int*>(reg), 7, 0); + f7b_ = reg[1]; + f7c_ = reg[2]; + } #elif FOLLY_X64 || defined(__i386__) - __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx"); -#else - // On non-Intel, none of these features exist; at least not in the same form - // as they do on Intel - c_ = 0; - d_ = 0; + uint32_t n; + __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "edx", "ecx"); + if (n >= 1) { + __asm__("cpuid" : "=c"(f1c_), "=d"(f1d_) : "a"(1) : "ebx"); + } + if (n >= 7) { + __asm__("cpuid" : "=b"(f7b_), "=c"(f7c_) : "a"(7), "c"(0) : "edx"); + } #endif } -#define X(name, r, bit) bool name() const { return r & (1U << bit); } -#define C(name, bit) X(name, c_, bit) -#define D(name, bit) X(name, d_, bit) + +#define X(name, r, bit) bool name() const { return (r) & (1U << bit); } + + // cpuid(1): Processor Info and Feature Bits. 
+#define C(name, bit) X(name, f1c_, bit) C(sse3, 0) C(pclmuldq, 1) C(dtes64, 2) @@ -60,12 +71,10 @@ class CpuId { C(tm2, 8) C(ssse3, 9) C(cnxtid, 10) - // 11 is reserved C(fma, 12) C(cx16, 13) C(xtpr, 14) C(pdcm, 15) - // 16 is reserved C(pcid, 17) C(dca, 18) C(sse41, 19) @@ -80,7 +89,8 @@ class CpuId { C(avx, 28) C(f16c, 29) C(rdrand, 30) - // 31 is not used +#undef C +#define D(name, bit) X(name, f1d_, bit) D(fpu, 0) D(vme, 1) D(de, 2) @@ -91,7 +101,6 @@ class CpuId { D(mce, 7) D(cx8, 8) D(apic, 9) - // 10 is reserved D(sep, 11) D(mtrr, 12) D(pge, 13) @@ -101,7 +110,6 @@ class CpuId { D(pse36, 17) D(psn, 18) D(clfsh, 19) - // 20 is reserved D(ds, 21) D(acpi, 22) D(mmx, 23) @@ -111,14 +119,48 @@ class CpuId { D(ss, 27) D(htt, 28) D(tm, 29) - // 30 is reserved D(pbe, 31) #undef D + + // cpuid(7): Extended Features. +#define B(name, bit) X(name, f7b_, bit) + B(bmi1, 3) + B(hle, 4) + B(avx2, 5) + B(smep, 7) + B(bmi2, 8) + B(erms, 9) + B(invpcid, 10) + B(rtm, 11) + B(mpx, 14) + B(avx512f, 16) + B(avx512dq, 17) + B(rdseed, 18) + B(adx, 19) + B(smap, 20) + B(avx512ifma, 21) + B(pcommit, 22) + B(clflushopt, 23) + B(clwb, 24) + B(avx512pf, 26) + B(avx512er, 27) + B(avx512cd, 28) + B(sha, 29) + B(avx512bw, 30) + B(avx512vl, 31) +#undef B +#define C(name, bit) X(name, f7c_, bit) + C(prefetchwt1, 0) + C(avx512vbmi, 1) #undef C + #undef X + private: - uint32_t c_; // ECX - uint32_t d_; // EDX + uint32_t f1c_ = 0; + uint32_t f1d_ = 0; + uint32_t f7b_ = 0; + uint32_t f7c_ = 0; }; } // namespace folly diff --git a/folly/experimental/EliasFanoCoding.h b/folly/experimental/EliasFanoCoding.h index 2a1e21a9..c44fb6f6 100644 --- a/folly/experimental/EliasFanoCoding.h +++ b/folly/experimental/EliasFanoCoding.h @@ -291,7 +291,7 @@ struct EliasFanoEncoder { namespace instructions { struct Default { - static bool supported() { + static bool supported(const folly::CpuId& cpuId = {}) { return true; } static inline uint64_t popcount(uint64_t value) { @@ -301,20 +301,36 @@ struct Default { 
DCHECK_GT(value, 0); return __builtin_ctzll(value); } + static inline uint64_t blsr(uint64_t value) { + return value & (value - 1); + } }; -struct Fast : public Default { - static bool supported() { - folly::CpuId cpuId; +struct Nehalem : public Default { + static bool supported(const folly::CpuId& cpuId = {}) { return cpuId.popcnt(); } static inline uint64_t popcount(uint64_t value) { + // POPCNT is supported starting with Intel Nehalem, AMD K10. uint64_t result; asm ("popcntq %1, %0" : "=r" (result) : "r" (value)); return result; } }; +struct Haswell : public Nehalem { + static bool supported(const folly::CpuId& cpuId = {}) { + return Nehalem::supported(cpuId) && cpuId.bmi1(); + } + static inline uint64_t blsr(uint64_t value) { + // BMI1 is supported starting with Intel Haswell, AMD Piledriver. + // BLSR combines two instructions into one and reduces register pressure. + uint64_t result; + asm ("blsrq %1, %0" : "=r" (result) : "r" (value)); + return result; + } +}; + } // namespace instructions namespace detail { @@ -352,7 +368,7 @@ class UpperBitsReader { ++position_; inner_ = Instructions::ctz(block_); - block_ &= block_ - 1; + block_ = Instructions::blsr(block_); return setValue(); } @@ -396,11 +412,11 @@ class UpperBitsReader { // Kill n - 1 least significant 1-bits. for (size_t i = 0; i < n - 1; ++i) { - block_ &= block_ - 1; + block_ = Instructions::blsr(block_); } inner_ = Instructions::ctz(block_); - block_ &= block_ - 1; + block_ = Instructions::blsr(block_); return setValue(); } diff --git a/folly/experimental/test/EliasFanoCodingTest.cpp b/folly/experimental/test/EliasFanoCodingTest.cpp index 547cd309..565d4e27 100644 --- a/folly/experimental/test/EliasFanoCodingTest.cpp +++ b/folly/experimental/test/EliasFanoCodingTest.cpp @@ -172,7 +172,7 @@ BENCHMARK(Encode_1M) { } #if 0 -Intel(R) Xeon(R) CPU E5-2660 @ 2.7GHz (turbo on), using instructions::Fast. +Intel(R) Xeon(R) CPU E5-2660 @ 2.7GHz (turbo on), using instructions::Nehalem. 
============================================================================ folly/experimental/test/EliasFanoCodingTest.cpp relative time/iter iters/s -- 2.34.1