From dc4be288a0cd5b88beb4f8c1b5d74d98791d43c9 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Wed, 22 Nov 2017 08:16:04 -0800 Subject: [PATCH] Add a fast path to folly::ThreadLocal Summary: Currently folly::ThreadLocal[Ptr] is pretty heavy-weight for a get(): 1) call instance(), take a static init guard, branch 2) call getThreadEntry, check if thread_local is not null, branch 3) check if id < threadEntry->capacity, branch 4) Finally, return threadEntry->elements[id] If we have real thread_locals, we can do better by caching the capacity directly, combining all three checks: 1) check if id < threadLocalCapacityCheck, branch. If not, do slow path. 2) return threadEntry->elements[id]. threadEntry is never null if capacity > 0, and instance() setup work is called during the first getThreadEntry call when threadLocalCapacityCheck == 0. Reviewed By: yfeldblum Differential Revision: D6379878 fbshipit-source-id: 4fc7564bbb2f319d65875124026aef28d910ef06 --- folly/ThreadLocal.h | 8 ++--- folly/detail/ThreadLocalDetail.h | 50 +++++++++++++++++++++++++---- folly/test/ThreadLocalBenchmark.cpp | 12 +++---- 3 files changed, 54 insertions(+), 16 deletions(-) diff --git a/folly/ThreadLocal.h b/folly/ThreadLocal.h index 3c050744..87ae3546 100644 --- a/folly/ThreadLocal.h +++ b/folly/ThreadLocal.h @@ -161,7 +161,7 @@ class ThreadLocalPtr { } T* get() const { - threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_); + threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_); return static_cast<T*>(w.ptr); } @@ -174,14 +174,14 @@ class ThreadLocalPtr { } T* release() { - threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_); + threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_); return static_cast<T*>(w.release()); } void reset(T* newPtr = nullptr) { auto guard = makeGuard([&] { delete newPtr; }); - threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_); + threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_); 
w.dispose(TLPDestructionMode::THIS_THREAD); guard.dismiss(); @@ -235,7 +235,7 @@ class ThreadLocalPtr { deleter(newPtr, TLPDestructionMode::THIS_THREAD); } }); - threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_); + threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_); w.dispose(TLPDestructionMode::THIS_THREAD); guard.dismiss(); w.set(newPtr, deleter); diff --git a/folly/detail/ThreadLocalDetail.h b/folly/detail/ThreadLocalDetail.h index 3d5a2aa8..735541b5 100644 --- a/folly/detail/ThreadLocalDetail.h +++ b/folly/detail/ThreadLocalDetail.h @@ -290,7 +290,7 @@ struct StaticMetaBase { */ void reserve(EntryID* id); - ElementWrapper& get(EntryID* ent); + ElementWrapper& getElement(EntryID* ent); static void initAtFork(); static void registerAtFork( @@ -335,7 +335,35 @@ struct StaticMeta : StaticMetaBase { return *instance; } - ElementWrapper& get(EntryID* ent) { +#ifdef FOLLY_TLD_USE_FOLLY_TLS + // Eliminate as many branches as possible: + // One branch on capacityCache, vs. three: + // 1) instance() static initializer + // 2) getThreadEntry null check + // 3) elementsCapacity size check. + // 3 will never be true if 1 or 2 are false. + FOLLY_ALWAYS_INLINE static ElementWrapper& get(EntryID* ent) { + uint32_t id = ent->getOrInvalid(); + if (UNLIKELY(capacityCache_ <= id)) { + return getSlow(ent); + } else { + return threadEntryCache_->elements[id]; + } + } + + static ElementWrapper& getSlow(EntryID* ent) { + ElementWrapper& res = instance().getElement(ent); + // Cache new capacity + capacityCache_ = getThreadEntry()->elementsCapacity; + return res; + } +#else + static ElementWrapper& get(EntryID* ent) { + return instance().getElement(ent); + } +#endif + + ElementWrapper& getElement(EntryID* ent) { ThreadEntry* threadEntry = getThreadEntry(); uint32_t id = ent->getOrInvalid(); // if id is invalid, it is equal to uint32_t's max value. 
@@ -369,11 +397,10 @@ struct StaticMeta : StaticMetaBase { inline static ThreadEntry* getThreadEntry() { #ifdef FOLLY_TLD_USE_FOLLY_TLS - static FOLLY_TLS ThreadEntry* threadEntryCache{nullptr}; - if (UNLIKELY(threadEntryCache == nullptr)) { - threadEntryCache = instance().threadEntry_(); + if (UNLIKELY(threadEntryCache_ == nullptr)) { + threadEntryCache_ = instance().threadEntry_(); } - return threadEntryCache; + return threadEntryCache_; #else return instance().threadEntry_(); #endif @@ -397,7 +424,18 @@ struct StaticMeta : StaticMetaBase { } instance().lock_.unlock(); } + +#ifdef FOLLY_TLD_USE_FOLLY_TLS + static FOLLY_TLS ThreadEntry* threadEntryCache_; + static FOLLY_TLS size_t capacityCache_; +#endif }; +#ifdef FOLLY_TLD_USE_FOLLY_TLS +template +FOLLY_TLS ThreadEntry* StaticMeta::threadEntryCache_{nullptr}; +template +FOLLY_TLS size_t StaticMeta::capacityCache_{0}; +#endif } // namespace threadlocal_detail } // namespace folly diff --git a/folly/test/ThreadLocalBenchmark.cpp b/folly/test/ThreadLocalBenchmark.cpp index 81f00779..666b7e31 100644 --- a/folly/test/ThreadLocalBenchmark.cpp +++ b/folly/test/ThreadLocalBenchmark.cpp @@ -132,13 +132,13 @@ int main(int argc, char** argv) { ============================================================================ folly/test/ThreadLocalBenchmark.cpp relative time/iter iters/s ============================================================================ -BM_mt_tlp 2.30ns 434.53M -BM_mt_pthread_get_specific 2.69ns 371.75M -BM_mt_boost_tsp 11.66ns 85.78M +BM_mt_tlp 1.92ns 520.02M +BM_mt_pthread_get_specific 2.69ns 372.15M +BM_mt_boost_tsp 11.81ns 84.67M ---------------------------------------------------------------------------- -BM_mt_tlp_multi 12.46ns 80.25M -BM_mt_pthread_get_specific_multi 16.58ns 60.32M -BM_mt_boost_tsp_multi 70.85ns 14.12M +BM_mt_tlp_multi 7.53ns 132.79M +BM_mt_pthread_get_specific_multi 15.80ns 63.29M +BM_mt_boost_tsp_multi 71.70ns 13.95M 
---------------------------------------------------------------------------- ============================================================================ */ -- 2.34.1