From d0a6ecc6823908af8ed78e25ed3ca0f9f14e27c5 Mon Sep 17 00:00:00 2001
From: Eric Niebler
Date: Fri, 15 Jul 2016 14:43:18 -0700
Subject: [PATCH] out-line the SharedMutexImpl members that directly reference
 tls_lastTokenlessSlot

Summary:
Recent changes to SharedMutex.h broke mcrouter's open source build. See
https://travis-ci.org/facebook/mcrouter/builds/140608809. It looks like we're
getting bitten by https://sourceware.org/bugzilla/show_bug.cgi?id=16773. We're
hitting it now because the extern template forces the instantiation of the
thread-local static member to be located in SharedMutex.o, but the inline
members that reference that thread-local are *not* located in SharedMutex.o.
binutils seems to be tripping over its own feet fixing up the references at
link time.

We can fix it by making sure the code that references the thread-local is
colocated with the thread-local itself.

Reviewed By: yfeldblum

Differential Revision: D3498477

fbshipit-source-id: 86ea86812010ff1ef7351e6f8c106bb4291d0234
---
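Note: a minimal sketch of the shape of the bug and of the fix. The names
here (Widget, tls_cache_, widget.o) are hypothetical stand-ins, not folly
code; the real change to SharedMutexImpl follows in the diff.

    // Before (the broken shape): the extern template pins the definition
    // of the thread-local into widget.o, but hot() is defined in-class,
    // so it is implicitly inline and a copy of its code, including the
    // reference to tls_cache_, can still be emitted into every other
    // translation unit that uses Widget<int>. Old binutils can then botch
    // the TLS relocation fixups across objects (sourceware bug 16773).
    template <typename T>
    struct Widget {
      int hot() { return tls_cache_; } // inline, touches the TLS member
      static FOLLY_TLS int tls_cache_;
    };
    template <typename T>
    FOLLY_TLS int Widget<T>::tls_cache_;
    extern template struct Widget<int>; // instantiated only in widget.cpp

    // After (the fix applied by this diff): declare hot() in-class and
    // define it after the class as an out-of-line template definition.
    // The extern template now suppresses implicit instantiation of hot()
    // as well, so the only copy of the code that references tls_cache_
    // lives in widget.o, colocated with the thread-local's instantiation.
    template <typename T>
    struct Widget {
      int hot(); // out-lined below, next to tls_cache_
      static FOLLY_TLS int tls_cache_;
    };
    template <typename T>
    FOLLY_TLS int Widget<T>::tls_cache_;
    template <typename T>
    int Widget<T>::hot() {
      return tls_cache_;
    }
    extern template struct Widget<int>;

The diff below performs exactly this transformation on lockSharedImpl()
and tryUnlockTokenlessSharedDeferred(), the two members that directly
reference tls_lastTokenlessSlot.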
 folly/SharedMutex.h | 287 +++++++++++++++++++++++---------------------
 1 file changed, 152 insertions(+), 135 deletions(-)

diff --git a/folly/SharedMutex.h b/folly/SharedMutex.h
index 2d71a884..e0974d44 100644
--- a/folly/SharedMutex.h
+++ b/folly/SharedMutex.h
@@ -1067,121 +1067,7 @@ class SharedMutexImpl {
   }
 
   template <class WaitContext>
-  bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
-    while (true) {
-      if (UNLIKELY((state & kHasE) != 0) &&
-          !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
-        return false;
-      }
-
-      uint32_t slot;
-      uintptr_t slotValue = 1; // any non-zero value will do
-
-      bool canAlreadyDefer = (state & kMayDefer) != 0;
-      bool aboveDeferThreshold =
-          (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
-      bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
-      if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
-        // starting point for our empty-slot search, can change after
-        // calling waitForZeroBits
-        uint32_t bestSlot =
-            (uint32_t)folly::detail::AccessSpreader<Atom>::current(
-                kMaxDeferredReaders);
-
-        // deferred readers are already enabled, or it is time to
-        // enable them if we can find a slot
-        for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
-          slot = bestSlot ^ i;
-          assert(slot < kMaxDeferredReaders);
-          slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
-          if (slotValue == 0) {
-            // found empty slot
-            break;
-          }
-        }
-      }
-
-      if (slotValue != 0) {
-        // not yet deferred, or no empty slots
-        if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
-          // successfully recorded the read lock inline
-          if (token != nullptr) {
-            token->type_ = Token::Type::INLINE_SHARED;
-          }
-          return true;
-        }
-        // state is updated, try again
-        continue;
-      }
-
-      // record that deferred readers might be in use if necessary
-      if ((state & kMayDefer) == 0) {
-        if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
-          // keep going if CAS failed because somebody else set the bit
-          // for us
-          if ((state & (kHasE | kMayDefer)) != kMayDefer) {
-            continue;
-          }
-        }
-        // state = state | kMayDefer;
-      }
-
-      // try to use the slot
-      bool gotSlot = deferredReader(slot)->compare_exchange_strong(
-          slotValue,
-          token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());
-
-      // If we got the slot, we need to verify that an exclusive lock
-      // didn't happen since we last checked.  If we didn't get the slot we
-      // need to recheck state_ anyway to make sure we don't waste too much
-      // work.  It is also possible that since we checked state_ someone
-      // has acquired and released the write lock, clearing kMayDefer.
-      // Both cases are covered by looking for the readers-possible bit,
-      // because it is off when the exclusive lock bit is set.
-      state = state_.load(std::memory_order_acquire);
-
-      if (!gotSlot) {
-        continue;
-      }
-
-      if (token == nullptr) {
-        tls_lastTokenlessSlot = slot;
-      }
-
-      if ((state & kMayDefer) != 0) {
-        assert((state & kHasE) == 0);
-        // success
-        if (token != nullptr) {
-          token->type_ = Token::Type::DEFERRED_SHARED;
-          token->slot_ = (uint16_t)slot;
-        }
-        return true;
-      }
-
-      // release the slot before retrying
-      if (token == nullptr) {
-        // We can't rely on slot.  Token-less slot values can be freed by
-        // any unlock_shared(), so we need to do the full deferredReader
-        // search during unlock.  Unlike unlock_shared(), we can't trust
-        // kPrevDefer here.  This deferred lock isn't visible to lock()
-        // (that's the whole reason we're undoing it) so there might have
-        // subsequently been an unlock() and lock() with no intervening
-        // transition to deferred mode.
-        if (!tryUnlockTokenlessSharedDeferred()) {
-          unlockSharedInline();
-        }
-      } else {
-        if (!tryUnlockSharedDeferred(slot)) {
-          unlockSharedInline();
-        }
-      }
-
-      // We got here not because the lock was unavailable, but because
-      // we lost a compare-and-swap.  Try-lock is typically allowed to
-      // have spurious failures, but there is no lock efficiency gain
-      // from exploiting that freedom here.
-    }
-  }
+  bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx);
 
   // Updates the state in/out argument as if the locks were made inline,
   // but does not update state_
@@ -1199,19 +1085,7 @@ class SharedMutexImpl {
 
   }
 
-  bool tryUnlockTokenlessSharedDeferred() {
-    auto bestSlot = tls_lastTokenlessSlot;
-    for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
-      auto slotPtr = deferredReader(bestSlot ^ i);
-      auto slotValue = slotPtr->load(std::memory_order_relaxed);
-      if (slotValue == tokenlessSlotValue() &&
-          slotPtr->compare_exchange_strong(slotValue, 0)) {
-        tls_lastTokenlessSlot = bestSlot ^ i;
-        return true;
-      }
-    }
-    return false;
-  }
+  bool tryUnlockTokenlessSharedDeferred();
 
   bool tryUnlockSharedDeferred(uint32_t slot) {
     assert(slot < kMaxDeferredReaders);
@@ -1429,6 +1303,15 @@
   }
 };
 
+typedef SharedMutexImpl<true> SharedMutexReadPriority;
+typedef SharedMutexImpl<false> SharedMutexWritePriority;
+typedef SharedMutexWritePriority SharedMutex;
+
+// Prevent the compiler from instantiating these in other translation units.
+// They are instantiated once in SharedMutex.cpp
+extern template class SharedMutexImpl<true>;
+extern template class SharedMutexImpl<false>;
+
 template <
     bool ReaderPriority,
     typename Tag_,
@@ -1449,13 +1332,147 @@
 FOLLY_TLS uint32_t SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
     tls_lastTokenlessSlot = 0;
 
-typedef SharedMutexImpl<true> SharedMutexReadPriority;
-typedef SharedMutexImpl<false> SharedMutexWritePriority;
-typedef SharedMutexWritePriority SharedMutex;
+template <
+    bool ReaderPriority,
+    typename Tag_,
+    template <typename> class Atom,
+    bool BlockImmediately>
+bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
+    tryUnlockTokenlessSharedDeferred() {
+  auto bestSlot = tls_lastTokenlessSlot;
+  for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
+    auto slotPtr = deferredReader(bestSlot ^ i);
+    auto slotValue = slotPtr->load(std::memory_order_relaxed);
+    if (slotValue == tokenlessSlotValue() &&
+        slotPtr->compare_exchange_strong(slotValue, 0)) {
+      tls_lastTokenlessSlot = bestSlot ^ i;
+      return true;
+    }
+  }
+  return false;
+}
 
-// Prevent the compiler from instantiating these in other translation units.
-// They are instantiated once in SharedMutex.cpp
-extern template class SharedMutexImpl<true>;
-extern template class SharedMutexImpl<false>;
+template <
+    bool ReaderPriority,
+    typename Tag_,
+    template <typename> class Atom,
+    bool BlockImmediately>
+template <class WaitContext>
+bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
+    lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
+  while (true) {
+    if (UNLIKELY((state & kHasE) != 0) &&
+        !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
+      return false;
+    }
+
+    uint32_t slot;
+    uintptr_t slotValue = 1; // any non-zero value will do
+
+    bool canAlreadyDefer = (state & kMayDefer) != 0;
+    bool aboveDeferThreshold =
+        (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
+    bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
+    if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
+      // starting point for our empty-slot search, can change after
+      // calling waitForZeroBits
+      uint32_t bestSlot =
+          (uint32_t)folly::detail::AccessSpreader<Atom>::current(
+              kMaxDeferredReaders);
+
+      // deferred readers are already enabled, or it is time to
+      // enable them if we can find a slot
+      for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
+        slot = bestSlot ^ i;
+        assert(slot < kMaxDeferredReaders);
+        slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
+        if (slotValue == 0) {
+          // found empty slot
+          break;
+        }
+      }
+    }
+
+    if (slotValue != 0) {
+      // not yet deferred, or no empty slots
+      if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
+        // successfully recorded the read lock inline
+        if (token != nullptr) {
+          token->type_ = Token::Type::INLINE_SHARED;
+        }
+        return true;
+      }
+      // state is updated, try again
+      continue;
+    }
+
+    // record that deferred readers might be in use if necessary
+    if ((state & kMayDefer) == 0) {
+      if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
+        // keep going if CAS failed because somebody else set the bit
+        // for us
+        if ((state & (kHasE | kMayDefer)) != kMayDefer) {
+          continue;
+        }
+      }
+      // state = state | kMayDefer;
+    }
+
+    // try to use the slot
+    bool gotSlot = deferredReader(slot)->compare_exchange_strong(
+        slotValue,
+        token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());
+
+    // If we got the slot, we need to verify that an exclusive lock
+    // didn't happen since we last checked.  If we didn't get the slot we
+    // need to recheck state_ anyway to make sure we don't waste too much
+    // work.  It is also possible that since we checked state_ someone
+    // has acquired and released the write lock, clearing kMayDefer.
+    // Both cases are covered by looking for the readers-possible bit,
+    // because it is off when the exclusive lock bit is set.
+    state = state_.load(std::memory_order_acquire);
+
+    if (!gotSlot) {
+      continue;
+    }
+
+    if (token == nullptr) {
+      tls_lastTokenlessSlot = slot;
+    }
+
+    if ((state & kMayDefer) != 0) {
+      assert((state & kHasE) == 0);
+      // success
+      if (token != nullptr) {
+        token->type_ = Token::Type::DEFERRED_SHARED;
+        token->slot_ = (uint16_t)slot;
+      }
+      return true;
+    }
+
+    // release the slot before retrying
+    if (token == nullptr) {
+      // We can't rely on slot.  Token-less slot values can be freed by
+      // any unlock_shared(), so we need to do the full deferredReader
+      // search during unlock.  Unlike unlock_shared(), we can't trust
+      // kPrevDefer here.  This deferred lock isn't visible to lock()
+      // (that's the whole reason we're undoing it) so there might have
+      // subsequently been an unlock() and lock() with no intervening
+      // transition to deferred mode.
+      if (!tryUnlockTokenlessSharedDeferred()) {
+        unlockSharedInline();
+      }
+    } else {
+      if (!tryUnlockSharedDeferred(slot)) {
+        unlockSharedInline();
+      }
+    }
+
+    // We got here not because the lock was unavailable, but because
+    // we lost a compare-and-swap.  Try-lock is typically allowed to
+    // have spurious failures, but there is no lock efficiency gain
+    // from exploiting that freedom here.
+  }
+}
 
 } // namespace folly
-- 
2.34.1