From: Nathan Bronson
Date: Fri, 6 Dec 2013 00:48:13 +0000 (-0800)
Subject: centralize cache-line alignment goo into CacheLocality
X-Git-Tag: v0.22.0~765
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=9b9fb1d60d65ddbdf538851494f3899474b1dd17;p=folly.git

centralize cache-line alignment goo into CacheLocality

Summary: The alignment requirements and details needed to avoid false
sharing belong in CacheLocality, so this diff moves them there.

@override-unit-failures

Test Plan: fbmake runtests

Reviewed By: davejwatson@fb.com

FB internal diff: D1086090
---

diff --git a/folly/MPMCQueue.h b/folly/MPMCQueue.h
index defbdfe0..339be20c 100644
--- a/folly/MPMCQueue.h
+++ b/folly/MPMCQueue.h
@@ -25,9 +25,11 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
+#include <folly/detail/CacheLocality.h>
 #include 
 
 namespace folly {
@@ -80,11 +82,13 @@ template <typename T> class MPMCPipelineStageImpl;
 /// FOLLY_ASSUME_FBVECTOR_COMPATIBLE then your type can be put in
 /// MPMCQueue.
 template<typename T,
-         template<typename> class Atom = std::atomic,
-         typename = typename std::enable_if<
-             std::is_nothrow_constructible<T,T&&>::value ||
-             folly::IsRelocatable<T>::value>::type>
+         template<typename> class Atom = std::atomic>
 class MPMCQueue : boost::noncopyable {
+
+  static_assert(std::is_nothrow_constructible<T,T&&>::value ||
+                folly::IsRelocatable<T>::value,
+      "T must be relocatable or have a noexcept move constructor");
+
   friend class detail::MPMCPipelineStageImpl<T>;
  public:
   typedef T value_type;
@@ -100,7 +104,11 @@ class MPMCQueue : boost::noncopyable {
     , popSpinCutoff_(0)
   {
     // ideally this would be a static assert, but g++ doesn't allow it
-    assert(alignof(MPMCQueue<T,Atom>) >= kFalseSharingRange);
+    assert(alignof(MPMCQueue<T,Atom>)
+           >= detail::CacheLocality::kFalseSharingRange);
+    assert(static_cast<uint8_t*>(static_cast<void*>(&popTicket_))
+           - static_cast<uint8_t*>(static_cast<void*>(&pushTicket_))
+           >= detail::CacheLocality::kFalseSharingRange);
   }
 
   /// A default-constructed queue is useful because a usable (non-zero
@@ -324,27 +332,15 @@ class MPMCQueue : boost::noncopyable {
     /// the proper spin backoff
     kAdaptationFreq = 128,
 
-    /// Memory locations on the same cache line are subject to false
-    /// sharing, which is very bad for performance
-    kFalseSharingRange = 64,
-
     /// To avoid false sharing in slots_ with neighboring memory
     /// allocations, we pad it with this many SingleElementQueue-s at
     /// each end
-    kSlotPadding = 1 +
-        (kFalseSharingRange - 1) / sizeof(detail::SingleElementQueue<T,Atom>)
+    kSlotPadding = (detail::CacheLocality::kFalseSharingRange - 1)
+        / sizeof(detail::SingleElementQueue<T,Atom>) + 1
   };
 
-  static_assert(kFalseSharingRange == 64,
-                "FOLLY_ON_NEXT_CACHE_LINE must track kFalseSharingRange");
-
-// This literal "64' should be kFalseSharingRange,
-// but gcc-4.8.0 and 4.8.1 reject it.
-// FIXME: s/64/kFalseSharingRange/ if that ever changes.
-#define FOLLY_ON_NEXT_CACHE_LINE __attribute__((aligned(64)))
-
   /// The maximum number of items in the queue at once
-  size_t capacity_ FOLLY_ON_NEXT_CACHE_LINE;
+  size_t FOLLY_ALIGN_TO_AVOID_FALSE_SHARING capacity_;
 
   /// An array of capacity_ SingleElementQueue-s, each of which holds
   /// either 0 or 1 item.  We over-allocate by 2 * kSlotPadding and don't
@@ -357,24 +353,23 @@ class MPMCQueue : boost::noncopyable {
   int stride_;
 
   /// Enqueuers get tickets from here
-  Atom<uint64_t> pushTicket_ FOLLY_ON_NEXT_CACHE_LINE;
+  Atom<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushTicket_;
 
   /// Dequeuers get tickets from here
-  Atom<uint64_t> popTicket_ FOLLY_ON_NEXT_CACHE_LINE;
+  Atom<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING popTicket_;
 
   /// This is how many times we will spin before using FUTEX_WAIT when
   /// the queue is full on enqueue, adaptively computed by occasionally
   /// spinning for longer and smoothing with an exponential moving average
-  Atom<uint32_t> pushSpinCutoff_ FOLLY_ON_NEXT_CACHE_LINE;
+  Atom<uint32_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushSpinCutoff_;
 
   /// The adaptive spin cutoff when the queue is empty on dequeue
-  Atom<uint32_t> popSpinCutoff_ FOLLY_ON_NEXT_CACHE_LINE;
+  Atom<uint32_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING popSpinCutoff_;
 
-  /// Alignment doesn't avoid false sharing at the end of the struct,
+  /// Alignment doesn't prevent false sharing at the end of the struct,
   /// so fill out the last cache line
-  char padding_[kFalseSharingRange - sizeof(Atom<uint32_t>)];
+  char padding_[detail::CacheLocality::kFalseSharingRange - sizeof(Atom<uint32_t>)];
 
-#undef FOLLY_ON_NEXT_CACHE_LINE
 
   /// We assign tickets in increasing order, but we don't want to
   /// access neighboring elements of slots_ because that will lead to
@@ -771,7 +766,7 @@ struct SingleElementQueue {
                const bool updateSpinCutoff,
                Args&&... args) noexcept {
     sequencer_.waitForTurn(turn * 2, spinCutoff, updateSpinCutoff);
-    new (contents_) T(std::forward<Args>(args)...);
+    new (&contents_) T(std::forward<Args>(args)...);
     sequencer_.completeTurn(turn * 2);
   }
 
@@ -789,13 +784,13 @@ struct SingleElementQueue {
     if (std::is_nothrow_constructible<T,T&&>::value) {
       // this is preferred
       sequencer_.waitForTurn(turn * 2, spinCutoff, updateSpinCutoff);
-      new (contents_) T(std::move(goner));
+      new (&contents_) T(std::move(goner));
      sequencer_.completeTurn(turn * 2);
     } else {
       // simulate nothrow move with relocation, followed by default
       // construction to fill the gap we created
       sequencer_.waitForTurn(turn * 2, spinCutoff, updateSpinCutoff);
-      memcpy(contents_, &goner, sizeof(T));
+      memcpy(&contents_, &goner, sizeof(T));
       sequencer_.completeTurn(turn * 2);
       new (&goner) T();
     }
@@ -818,7 +813,7 @@ struct SingleElementQueue {
         // unlikely, but if we don't complete our turn the queue will die
       }
       sequencer_.waitForTurn(turn * 2 + 1, spinCutoff, updateSpinCutoff);
-      memcpy(&elem, contents_, sizeof(T));
+      memcpy(&elem, &contents_, sizeof(T));
       sequencer_.completeTurn(turn * 2 + 1);
     } else {
       // use nothrow move assignment
@@ -835,13 +830,13 @@ struct SingleElementQueue {
  private:
   /// Storage for a T constructed with placement new
-  char contents_[sizeof(T)] __attribute__((aligned(alignof(T))));
+  typename std::aligned_storage<sizeof(T), alignof(T)>::type contents_;
 
   /// Even turns are pushes, odd turns are pops
   TurnSequencer<Atom> sequencer_;
 
   T* ptr() noexcept {
-    return static_cast<T*>(static_cast<void*>(contents_));
+    return static_cast<T*>(static_cast<void*>(&contents_));
   }
 
   void destroyContents() noexcept {
@@ -851,7 +846,7 @@ struct SingleElementQueue {
       // g++ doesn't seem to have std::is_nothrow_destructible yet
     }
 #ifndef NDEBUG
-    memset(contents_, 'Q', sizeof(T));
+    memset(&contents_, 'Q', sizeof(T));
 #endif
   }
 };
diff --git a/folly/detail/CacheLocality.h b/folly/detail/CacheLocality.h
index 0d839bec..3645c8b6 100644
--- a/folly/detail/CacheLocality.h
+++ b/folly/detail/CacheLocality.h
@@ -108,8 +108,26 @@ struct CacheLocality {
   /// CacheLocality structure with the specified number of cpus and a
   /// single cache level that associates one cpu per cache.
   static CacheLocality uniform(size_t numCpus);
+
+  enum {
+    /// Memory locations on the same cache line are subject to false
+    /// sharing, which is very bad for performance.  Microbenchmarks
+    /// indicate that pairs of cache lines also see interference under
+    /// heavy use of atomic operations (observed for atomic increment on
+    /// Sandy Bridge).  See FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
+    kFalseSharingRange = 128
+  };
+
+  static_assert(kFalseSharingRange == 128,
+      "FOLLY_ALIGN_TO_AVOID_FALSE_SHARING should track kFalseSharingRange");
 };
 
+// TODO replace __attribute__ with alignas and 128 with kFalseSharingRange
+
+/// An attribute that will cause a variable or field to be aligned so that
+/// it doesn't have false sharing with anything at a smaller memory address.
+#define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING __attribute__((aligned(128)))
+
 /// Holds a function pointer to the VDSO implementation of getcpu(2),
 /// if available
 struct Getcpu {
@@ -329,12 +347,6 @@ struct AccessSpreaderArray {
 
  private:
 
-  /// If we align the access spreaders at the beginning of a cache line
-  /// then getcpuFunc_ and the first 56 bytes of stripeByCpu will be on
-  /// the same cache line
-  enum { kAlignment = 64 };
-
-
   // AccessSpreader uses sharedInstance
   friend AccessSpreader<Atom>;
 
@@ -343,7 +355,8 @@ struct AccessSpreaderArray {
   /// aligned_storage is uninitialized, we use placement new since there
   /// is no AccessSpreader default constructor
-  typename std::aligned_storage<sizeof(AccessSpreader<Atom>),kAlignment>::type
+  typename std::aligned_storage<sizeof(AccessSpreader<Atom>),
+                                CacheLocality::kFalseSharingRange>::type
       raw[kMaxStripe + 1];
 };
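
For readers skimming the patch, here is a minimal compile-only sketch (not
part of the diff) of how the pieces centralized here are meant to be used
together. It assumes only what the diff shows --
detail::CacheLocality::kFalseSharingRange and the
FOLLY_ALIGN_TO_AVOID_FALSE_SHARING attribute macro from
folly/detail/CacheLocality.h -- and the struct and member names (TwoCounters,
pushes, pops, tailPadding_) are purely illustrative, not folly APIs.

    #include <atomic>
    #include <cstdint>

    #include <folly/detail/CacheLocality.h>

    // Two hot counters that are bumped concurrently by different threads.
    struct TwoCounters {
      // Each counter starts at its own false-sharing boundary, so heavy
      // atomic increments on one do not contend for the line(s) holding
      // the other -- the same pattern MPMCQueue uses for its tickets.
      std::atomic<std::uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushes{0};
      std::atomic<std::uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pops{0};

      // Alignment alone doesn't protect the last member from whatever gets
      // allocated immediately after this struct, so pad out the tail,
      // mirroring MPMCQueue's padding_ member.
      char tailPadding_[folly::detail::CacheLocality::kFalseSharingRange -
                        sizeof(std::atomic<std::uint64_t>)];
    };

    // The aligned members propagate their alignment to the enclosing type,
    // which is what MPMCQueue's constructor now asserts about itself.
    static_assert(alignof(TwoCounters) >=
                      folly::detail::CacheLocality::kFalseSharingRange,
                  "field alignment should propagate to the struct");

Note that the centralized range is 128 bytes, two cache lines rather than
one, reflecting the adjacent-line interference described in the new
kFalseSharingRange comment above.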