Add a fast path to folly::ThreadLocal
author Dave Watson <davejwatson@fb.com>
Wed, 22 Nov 2017 16:16:04 +0000 (08:16 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Wed, 22 Nov 2017 16:25:44 +0000 (08:25 -0800)
Summary:
Currently folly::ThreadLocal[Ptr] is pretty heavy-weight for a get():

1) call instance(), take a static init guard, branch
2) call getThreadEntry, check if thread_local is not null, branch
3) check if id < threadEntry->capacity, branch
4) Finally, return threadEntry->elements[id]

If we have real thread_locals, we can do better by caching the capacity directly,
combining all three checks:

1) check if id < the cached thread-local capacity (capacityCache_), branch.  If not, take the slow path.
2) return threadEntry->elements[id].  The thread entry is never null if the cached capacity > 0, and
    the instance() setup work is done during the first getThreadEntry call, while the cached capacity is still 0.

Reviewed By: yfeldblum

Differential Revision: D6379878

fbshipit-source-id: 4fc7564bbb2f319d65875124026aef28d910ef06

folly/ThreadLocal.h
folly/detail/ThreadLocalDetail.h
folly/test/ThreadLocalBenchmark.cpp

index 3c05074463e8ded6ff54928886b5fd748a8d2a63..87ae35461798437b0d593d2023700985bc9e83ba 100644 (file)
@@ -161,7 +161,7 @@ class ThreadLocalPtr {
   }
 
   T* get() const {
-    threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_);
+    threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_);
     return static_cast<T*>(w.ptr);
   }
 
@@ -174,14 +174,14 @@ class ThreadLocalPtr {
   }
 
   T* release() {
-    threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_);
+    threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_);
 
     return static_cast<T*>(w.release());
   }
 
   void reset(T* newPtr = nullptr) {
     auto guard = makeGuard([&] { delete newPtr; });
-    threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_);
+    threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_);
 
     w.dispose(TLPDestructionMode::THIS_THREAD);
     guard.dismiss();
@@ -235,7 +235,7 @@ class ThreadLocalPtr {
         deleter(newPtr, TLPDestructionMode::THIS_THREAD);
       }
     });
-    threadlocal_detail::ElementWrapper& w = StaticMeta::instance().get(&id_);
+    threadlocal_detail::ElementWrapper& w = StaticMeta::get(&id_);
     w.dispose(TLPDestructionMode::THIS_THREAD);
     guard.dismiss();
     w.set(newPtr, deleter);
index 3d5a2aa83c34433b0174399bbeccf3011771739d..735541b5cec999572b10b80729b2e4d80ea1dd27 100644 (file)
@@ -290,7 +290,7 @@ struct StaticMetaBase {
    */
   void reserve(EntryID* id);
 
-  ElementWrapper& get(EntryID* ent);
+  ElementWrapper& getElement(EntryID* ent);
 
   static void initAtFork();
   static void registerAtFork(
@@ -335,7 +335,35 @@ struct StaticMeta : StaticMetaBase {
     return *instance;
   }
 
-  ElementWrapper& get(EntryID* ent) {
+#ifdef FOLLY_TLD_USE_FOLLY_TLS
+  // Eliminate as many branches as possible:
+  // One branch on capacityCache, vs. three:
+  // 1) instance() static initializer
+  // 2) getThreadEntry null check
+  // 3) elementsCapacity size check.
+  // 3 will never be true if 1 or 2 are false.
+  FOLLY_ALWAYS_INLINE static ElementWrapper& get(EntryID* ent) {
+    uint32_t id = ent->getOrInvalid();
+    if (UNLIKELY(capacityCache_ <= id)) {
+      return getSlow(ent);
+    } else {
+      return threadEntryCache_->elements[id];
+    }
+  }
+
+  static ElementWrapper& getSlow(EntryID* ent) {
+    ElementWrapper& res = instance().getElement(ent);
+    // Cache new capacity
+    capacityCache_ = getThreadEntry()->elementsCapacity;
+    return res;
+  }
+#else
+  static ElementWrapper& get(EntryID* ent) {
+    return instance().getElement(ent);
+  }
+#endif
+
+  ElementWrapper& getElement(EntryID* ent) {
     ThreadEntry* threadEntry = getThreadEntry();
     uint32_t id = ent->getOrInvalid();
     // if id is invalid, it is equal to uint32_t's max value.
@@ -369,11 +397,10 @@ struct StaticMeta : StaticMetaBase {
 
   inline static ThreadEntry* getThreadEntry() {
 #ifdef FOLLY_TLD_USE_FOLLY_TLS
-    static FOLLY_TLS ThreadEntry* threadEntryCache{nullptr};
-    if (UNLIKELY(threadEntryCache == nullptr)) {
-      threadEntryCache = instance().threadEntry_();
+    if (UNLIKELY(threadEntryCache_ == nullptr)) {
+      threadEntryCache_ = instance().threadEntry_();
     }
-    return threadEntryCache;
+    return threadEntryCache_;
 #else
     return instance().threadEntry_();
 #endif
@@ -397,7 +424,18 @@ struct StaticMeta : StaticMetaBase {
     }
     instance().lock_.unlock();
   }
+
+#ifdef FOLLY_TLD_USE_FOLLY_TLS
+  static FOLLY_TLS ThreadEntry* threadEntryCache_;
+  static FOLLY_TLS size_t capacityCache_;
+#endif
 };
 
+#ifdef FOLLY_TLD_USE_FOLLY_TLS
+template <class Tag, class AccessMode>
+FOLLY_TLS ThreadEntry* StaticMeta<Tag, AccessMode>::threadEntryCache_{nullptr};
+template <class Tag, class AccessMode>
+FOLLY_TLS size_t StaticMeta<Tag, AccessMode>::capacityCache_{0};
+#endif
 } // namespace threadlocal_detail
 } // namespace folly
index 81f00779cd762b9dc03e2e3c81cb577775165603..666b7e31e28d420fe13ebb453448978b6e855f24 100644 (file)
@@ -132,13 +132,13 @@ int main(int argc, char** argv) {
 ============================================================================
 folly/test/ThreadLocalBenchmark.cpp             relative  time/iter  iters/s
 ============================================================================
-BM_mt_tlp                                                    2.30ns  434.53M
-BM_mt_pthread_get_specific                                   2.69ns  371.75M
-BM_mt_boost_tsp                                             11.66ns   85.78M
+BM_mt_tlp                                                    1.92ns  520.02M
+BM_mt_pthread_get_specific                                   2.69ns  372.15M
+BM_mt_boost_tsp                                             11.81ns   84.67M
 ----------------------------------------------------------------------------
-BM_mt_tlp_multi                                             12.46ns   80.25M
-BM_mt_pthread_get_specific_multi                            16.58ns   60.32M
-BM_mt_boost_tsp_multi                                       70.85ns   14.12M
+BM_mt_tlp_multi                                              7.53ns  132.79M
+BM_mt_pthread_get_specific_multi                            15.80ns   63.29M
+BM_mt_boost_tsp_multi                                       71.70ns   13.95M
 ----------------------------------------------------------------------------
 ============================================================================
 */