From bf385e23f37014ae94ed151097876b8c03338479 Mon Sep 17 00:00:00 2001 From: Nathan Bronson Date: Mon, 22 Feb 2016 13:08:30 -0800 Subject: [PATCH] fix SIOF in CacheLocality.h's AccessSpreader Summary:This diff moves all data accessed during AccessSpreader<>::current(x) into the .data segment, avoiding SIOF without adding indirection or dynamic gating as would be the case for normal singleton-like constructs. The diff also trims the AccessSpreader API to include only those methods that people actually seem to use. Reviewed By: djwatson Differential Revision: D2945205 fb-gh-sync-id: 847e31adc4450217f4ed0575686be261fb504d7c shipit-source-id: 847e31adc4450217f4ed0575686be261fb504d7c --- folly/detail/CacheLocality.cpp | 58 +--- folly/detail/CacheLocality.h | 221 ++++++-------- folly/test/CacheLocalityTest.cpp | 411 ++++++++++----------------- folly/test/DeterministicSchedule.cpp | 19 +- folly/test/DeterministicSchedule.h | 3 +- 5 files changed, 252 insertions(+), 460 deletions(-) diff --git a/folly/detail/CacheLocality.cpp b/folly/detail/CacheLocality.cpp index 6f7657f5..d174b6f5 100644 --- a/folly/detail/CacheLocality.cpp +++ b/folly/detail/CacheLocality.cpp @@ -28,6 +28,8 @@ #include #include +DECLARE_ACCESS_SPREADER_TYPE(std::atomic) + namespace folly { namespace detail { @@ -60,8 +62,8 @@ static CacheLocality getSystemLocalityInfo() { template <> const CacheLocality& CacheLocality::system() { - static CacheLocality cache(getSystemLocalityInfo()); - return cache; + static auto* cache = new CacheLocality(getSystemLocalityInfo()); + return *cache; } // Each level of cache has sharing sets, which are the set of cpus @@ -110,8 +112,7 @@ CacheLocality CacheLocality::readFromSysfsTree( std::vector levels; for (size_t index = 0;; ++index) { auto dir = - format("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index) - .str(); + sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index); auto cacheType = mapping(dir + "type"); auto equivStr = mapping(dir + "shared_cpu_list"); if (cacheType.size() == 0 || equivStr.size() == 0) { @@ -208,9 +209,7 @@ CacheLocality CacheLocality::uniform(size_t numCpus) { ////////////// Getcpu -/// Resolves the dynamically loaded symbol __vdso_getcpu, returning null -/// on failure -static Getcpu::Func loadVdsoGetcpu() { +Getcpu::Func Getcpu::resolveVdsoFunc() { #if defined(_MSC_VER) || defined(__BIONIC__) return nullptr; #else @@ -232,11 +231,6 @@ static Getcpu::Func loadVdsoGetcpu() { #endif } -Getcpu::Func Getcpu::vdsoFunc() { - static Func func = loadVdsoGetcpu(); - return func; -} - #ifdef FOLLY_TLS /////////////// SequentialThreadId @@ -250,40 +244,10 @@ FOLLY_TLS size_t SequentialThreadId::currentId(0); /////////////// AccessSpreader template <> -const AccessSpreader AccessSpreader::stripeByCore( - CacheLocality::system<>().numCachesByLevel.front()); - -template <> -const AccessSpreader AccessSpreader::stripeByChip( - CacheLocality::system<>().numCachesByLevel.back()); - -template <> -AccessSpreaderArray - AccessSpreaderArray::sharedInstance = {}; - -/// Always claims to be on CPU zero, node zero -static int degenerateGetcpu(unsigned* cpu, unsigned* node, void* /* unused */) { - if (cpu != nullptr) { - *cpu = 0; - } - if (node != nullptr) { - *node = 0; - } - return 0; +Getcpu::Func AccessSpreader::pickGetcpuFunc() { + auto best = Getcpu::resolveVdsoFunc(); + return best ? best : &FallbackGetcpuType::getcpu; } -template <> -Getcpu::Func AccessSpreader::pickGetcpuFunc(size_t numStripes) { - if (numStripes == 1) { - // there's no need to call getcpu if there is only one stripe. - // This should not be common, so we don't want to waste a test and - // branch in the main code path, but we might as well use a faster - // function pointer - return °enerateGetcpu; - } else { - auto best = Getcpu::vdsoFunc(); - return best ? best : &FallbackGetcpuType::getcpu; - } -} -} -} // namespace folly::detail +} // namespace detail +} // namespace folly diff --git a/folly/detail/CacheLocality.h b/folly/detail/CacheLocality.h index ac9de657..c58d21c5 100644 --- a/folly/detail/CacheLocality.h +++ b/folly/detail/CacheLocality.h @@ -131,15 +131,16 @@ struct CacheLocality { /// it doesn't have false sharing with anything at a smaller memory address. #define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING FOLLY_ALIGNED(128) -/// Holds a function pointer to the VDSO implementation of getcpu(2), -/// if available +/// Knows how to derive a function pointer to the VDSO implementation of +/// getcpu(2), if available struct Getcpu { /// Function pointer to a function with the same signature as getcpu(2). typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused); /// Returns a pointer to the VDSO implementation of getcpu(2), if - /// available, or nullptr otherwise - static Func vdsoFunc(); + /// available, or nullptr otherwise. This function may be quite + /// expensive, be sure to cache the result. + static Func resolveVdsoFunc(); }; #ifdef FOLLY_TLS @@ -197,23 +198,14 @@ typedef FallbackGetcpu> FallbackGetcpuType; typedef FallbackGetcpu FallbackGetcpuType; #endif -template