From 11e98e939c840f416e38ecd7a0208f518a99e5c1 Mon Sep 17 00:00:00 2001 From: Philip Pronin Date: Sun, 2 Oct 2016 21:25:03 -0700 Subject: [PATCH] switch back to inline assembly in compression::instructions Summary: D3278901 never worked, GCC (at least 4.9) failed to propagate `FOLLY_TARGET_ATTRIBUTE("arch=haswell")` down, `blsr` and `select64` weren't inlined in EF coding on Haswell, showing up in CPU profile. This diff switches back to inline assembly, removing second dispatching mechanism (IFUNC; in additional to caller-side `Default` / `Nehalem` / `Haswell` dispatching) for `compression::instructions`, and disables functionality on compilers not supporting AT&T syntax (MSVC++) for now. Added `FOLLY_ALWAYS_INLINE` to make sure failure to inline results in compilation failure. Reviewed By: ot Differential Revision: D3959438 fbshipit-source-id: e40573fbfbf38991caa2cd70293aeaeeec3afad7 --- folly/experimental/Instructions.h | 52 ++++++++++++++++--------------- folly/experimental/Select64.h | 17 +++++----- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/folly/experimental/Instructions.h b/folly/experimental/Instructions.h index 97e6e78c..ffc4fa1b 100644 --- a/folly/experimental/Instructions.h +++ b/folly/experimental/Instructions.h @@ -17,15 +17,18 @@ #pragma once #include -#include -#ifdef __clang__ -// Clang defines the intrinsics in weird places. -#include -#endif #include +#include #include +#if defined(__GNUC__) || defined(__clang__) +// For compilers supporting AT&T assembly syntax. +#define FOLLY_INSTRUCTIONS_SUPPORTED 1 +#else +#define FOLLY_INSTRUCTIONS_SUPPORTED 0 +#endif + namespace folly { namespace compression { namespace instructions { // NOTE: It's recommended to compile EF coding with -msse4.2, starting @@ -37,39 +40,37 @@ namespace folly { namespace compression { namespace instructions { // use explicitly. struct Default { - static bool supported(const folly::CpuId& /* cpuId */ = {}) { return true; } - static inline uint64_t popcount(uint64_t value) { + static bool supported(const folly::CpuId& /* cpuId */ = {}) { + return true; + } + static FOLLY_ALWAYS_INLINE uint64_t popcount(uint64_t value) { return __builtin_popcountll(value); } - static inline int ctz(uint64_t value) { + static FOLLY_ALWAYS_INLINE int ctz(uint64_t value) { DCHECK_GT(value, 0); return __builtin_ctzll(value); } - static inline int clz(uint64_t value) { + static FOLLY_ALWAYS_INLINE int clz(uint64_t value) { DCHECK_GT(value, 0); return __builtin_clzll(value); } - static inline uint64_t blsr(uint64_t value) { + static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) { return value & (value - 1); } }; +#if FOLLY_INSTRUCTIONS_SUPPORTED + struct Nehalem : public Default { static bool supported(const folly::CpuId& cpuId = {}) { return cpuId.popcnt(); } - FOLLY_TARGET_ATTRIBUTE("popcnt") - static inline uint64_t popcount(uint64_t value) { + static FOLLY_ALWAYS_INLINE uint64_t popcount(uint64_t value) { // POPCNT is supported starting with Intel Nehalem, AMD K10. -#if defined(__GNUC__) && !defined(__clang__) && !__GNUC_PREREQ(4, 9) - // GCC 4.8 doesn't support the intrinsics. uint64_t result; asm ("popcntq %1, %0" : "=r" (result) : "r" (value)); return result; -#else - return _mm_popcnt_u64(value); -#endif } }; @@ -78,19 +79,20 @@ struct Haswell : public Nehalem { return Nehalem::supported(cpuId) && cpuId.bmi1(); } - FOLLY_TARGET_ATTRIBUTE("bmi") - static inline uint64_t blsr(uint64_t value) { + static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) { // BMI1 is supported starting with Intel Haswell, AMD Piledriver. // BLSR combines two instuctions into one and reduces register pressure. -#if defined(__GNUC__) && !defined(__clang__) && !__GNUC_PREREQ(4, 9) - // GCC 4.8 doesn't support the intrinsics. uint64_t result; asm ("blsrq %1, %0" : "=r" (result) : "r" (value)); return result; -#else - return _blsr_u64(value); -#endif } }; -}}} // namespaces +#else // FOLLY_INSTRUCTIONS_SUPPORTED + +struct Nehalem : public Default {}; +struct Haswell : public Nehalem {}; + +#endif // FOLLY_INSTRUCTIONS_SUPPORTED + +}}} // namespaces diff --git a/folly/experimental/Select64.h b/folly/experimental/Select64.h index 039cfa85..e79b489c 100644 --- a/folly/experimental/Select64.h +++ b/folly/experimental/Select64.h @@ -18,13 +18,14 @@ #include +#include #include namespace folly { namespace detail { extern const uint8_t kSelectInByte[2048]; -} +} // namespace detail /** * Returns the position of the k-th 1 in the 64-bit word x. @@ -62,12 +63,11 @@ inline uint64_t select64(uint64_t x, uint64_t k) { return place + detail::kSelectInByte[((x >> place) & 0xFF) | (byteRank << 8)]; } +#if FOLLY_INSTRUCTIONS_SUPPORTED + template <> -FOLLY_TARGET_ATTRIBUTE("bmi,bmi2") -inline uint64_t select64(uint64_t x, - uint64_t k) { -#if defined(__GNUC__) && !defined(__clang__) && !__GNUC_PREREQ(4, 9) - // GCC 4.8 doesn't support the intrinsics. +FOLLY_ALWAYS_INLINE uint64_t +select64(uint64_t x, uint64_t k) { uint64_t result = uint64_t(1) << k; asm("pdep %1, %0, %0\n\t" @@ -76,9 +76,8 @@ inline uint64_t select64(uint64_t x, : "r"(x)); return result; -#else - return _tzcnt_u64(_pdep_u64(1ULL << k, x)); -#endif } +#endif // FOLLY_INSTRUCTIONS_SUPPORTED + } // namespace folly -- 2.34.1