2 * Copyright 2015-present Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 // @author Nathan Bronson (ngbronson@fb.com)
25 #include <type_traits>
27 #include <folly/Likely.h>
28 #include <folly/concurrency/CacheLocality.h>
29 #include <folly/detail/Futex.h>
30 #include <folly/portability/Asm.h>
31 #include <folly/portability/SysResource.h>
33 // SharedMutex is a reader-writer lock. It is small, very fast, scalable
34 // on multi-core, and suitable for use when readers or writers may block.
35 // Unlike most other reader-writer locks, its throughput with concurrent
36 // readers scales linearly; it is able to acquire and release the lock
37 // in shared mode without cache line ping-ponging. It is suitable for
38 // a wide range of lock hold times because it starts with spinning,
39 // proceeds to using sched_yield with a preemption heuristic, and then
40 // waits using futex and precise wakeups.
42 // SharedMutex provides all of the methods of folly::RWSpinLock,
43 // boost::shared_mutex, boost::upgrade_mutex, and C++14's
44 // std::shared_timed_mutex. All operations that can block are available
45 // in try, try-for, and try-until (system_clock or steady_clock) versions.
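//
// A minimal usage sketch follows; the guarded map and the read/write/
// tryWrite helpers are hypothetical, shown only to illustrate the API:
//
//   folly::SharedMutex mutex;
//   std::map<int, int> cache; // data guarded by mutex
//
//   int read(int key) {
//     folly::SharedMutex::ReadHolder guard(&mutex); // shared (reader) lock
//     auto it = cache.find(key);
//     return it == cache.end() ? -1 : it->second;
//   }
//
//   void write(int key, int value) {
//     folly::SharedMutex::WriteHolder guard(&mutex); // exclusive (writer) lock
//     cache[key] = value;
//   }
//
//   bool tryWrite(int key, int value) {
//     if (!mutex.try_lock_for(std::chrono::milliseconds(10))) {
//       return false; // timed out waiting for exclusive access
//     }
//     cache[key] = value;
//     mutex.unlock();
//     return true;
//   }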
47 // SharedMutexReadPriority gives priority to readers,
48 // SharedMutexWritePriority gives priority to writers. SharedMutex is an
49 // alias for SharedMutexWritePriority, because writer starvation is more
50 // likely than reader starvation for the read-heavy workloads targeted
51 // by SharedMutex.
53 // In my tests SharedMutex is as good or better than the other
54 // reader-writer locks in use at Facebook for almost all use cases,
55 // sometimes by a wide margin. (If it is rare that there are actually
56 // concurrent readers then RWSpinLock can be a few nanoseconds faster.)
57 // I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
58 // boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
59 // spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
60 // (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
61 // It is generally as good or better than the rest when evaluating size,
62 // speed, scalability, or latency outliers. In the corner cases where
63 // it is not the fastest (such as single-threaded use or heavy write
64 // contention) it is never very much worse than the best. See the bottom
65 // of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
67 // Comparison to folly::RWSpinLock:
69 // * SharedMutex is faster than RWSpinLock when there are actually
70 // concurrent read accesses (sometimes much faster), and ~5 nanoseconds
71 // slower when there is not actually any contention. SharedMutex is
72 // faster in every (benchmarked) scenario where the shared mode of
73 // the lock is actually useful.
75 // * Concurrent shared access to SharedMutex scales linearly, while total
76 // RWSpinLock throughput drops as more threads try to access the lock
77 // in shared mode. Under very heavy read contention SharedMutex can
78 // be two orders of magnitude faster than RWSpinLock (or any reader
79 // writer lock that doesn't use striping or deferral).
81 // * SharedMutex can safely protect blocking calls, because after an
82 // initial period of spinning it waits using futex().
84 // * RWSpinLock prioritizes readers, SharedMutex has both reader- and
85 // writer-priority variants, but defaults to write priority.
87 // * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
88 // doesn't. Both semantics are reasonable. The boost documentation
89 // doesn't explicitly talk about this behavior (except by omitting
90 // any statement that those lock modes conflict), but the boost
91 // implementations do allow new readers while the upgradeable mode
92 // is held. See https://github.com/boostorg/thread/blob/master/
93 // include/boost/thread/pthread/shared_mutex.hpp
95 // * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
96 // (UpgradeableHolder would be even more pedantically correct).
97 // SharedMutex's holders have fewer methods (no reset) and are less
98 // tolerant (promotion and downgrade crash if the donor doesn't own
99 // the lock, and you must use the default constructor rather than
100 // passing a nullptr to the pointer constructor).
102 // Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
103 // and "shared" modes. At all times num_threads_holding_exclusive +
104 // num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
105 // 0 || num_threads_holding_shared == 0. RWSpinLock has the additional
106 // constraint that num_threads_holding_shared cannot increase while
107 // num_threads_holding_upgrade is non-zero.
109 // Comparison to the internal RWLock:
111 // * SharedMutex doesn't allow a maximum reader count to be configured,
112 // so it can't be used as a semaphore in the same way as RWLock.
114 // * SharedMutex is 4 bytes, RWLock is 256.
116 // * SharedMutex is as fast or faster than RWLock in all of my
117 // microbenchmarks, and has positive rather than negative scalability.
119 // * RWLock and SharedMutex are both writer priority locks.
121 // * SharedMutex avoids latency outliers as well as RWLock.
123 // * SharedMutex uses different names (t != 0 below):
125 // RWLock::lock(0) => SharedMutex::lock()
127 // RWLock::lock(t) => SharedMutex::try_lock_for(milliseconds(t))
129 // RWLock::tryLock() => SharedMutex::try_lock()
131 // RWLock::unlock() => SharedMutex::unlock()
133 // RWLock::enter(0) => SharedMutex::lock_shared()
135 // RWLock::enter(t) =>
136 // SharedMutex::try_lock_shared_for(milliseconds(t))
138 // RWLock::tryEnter() => SharedMutex::try_lock_shared()
140 // RWLock::leave() => SharedMutex::unlock_shared()
142 // * RWLock allows the reader count to be adjusted by a value other
143 // than 1 during enter() or leave(). SharedMutex doesn't currently
144 // implement this feature.
146 // * RWLock's methods are marked const, SharedMutex's aren't.
148 // Reader-writer locks have the potential to allow concurrent access
149 // to shared read-mostly data, but in practice they often provide no
150 // improvement over a mutex. The problem is the cache coherence protocol
151 // of modern CPUs. Coherence is provided by making sure that when a cache
152 // line is written it is present in only one core's cache. Since a memory
153 // write is required to acquire a reader-writer lock in shared mode, the
154 // cache line holding the lock is invalidated in all of the other caches.
155 // This leads to cache misses when another thread wants to acquire or
156 // release the lock concurrently. When the RWLock is colocated with the
157 // data it protects (common), cache misses can also continue to occur when
158 // a thread that already holds the lock tries to read the protected data.
160 // Ideally, a reader-writer lock would allow multiple cores to acquire
161 // and release the lock in shared mode without incurring any cache misses.
162 // This requires that each core record its shared access in a cache line
163 // that isn't read or written by other read-locking cores. (Writers will
164 // have to check all of the cache lines.) Typical server hardware when
165 // this comment was written has 16 L1 caches and cache lines of 64 bytes,
166 // so a lock striped over all L1 caches would occupy a prohibitive 1024
167 // bytes. Nothing says that we need a separate set of per-core memory
168 // locations for each lock, however. Each SharedMutex instance is only
169 // 4 bytes, but all locks together share a 2K area in which they make a
170 // core-local record of lock acquisitions.
172 // SharedMutex's strategy of using a shared set of core-local stripes has
173 // a potential downside, because it means that acquisition of any lock in
174 // write mode can conflict with acquisition of any lock in shared mode.
175 // If a lock instance doesn't actually experience concurrency then this
176 // downside will outweigh the upside of improved scalability for readers.
177 // To avoid this problem we dynamically detect concurrent accesses to
178 // SharedMutex, and don't start using the deferred mode unless we actually
179 // observe concurrency. See kNumSharedToStartDeferring.
181 // It is explicitly allowed to call unlock_shared() from a different
182 // thread than lock_shared(), so long as they are properly paired.
183 // unlock_shared() needs to find the location at which lock_shared()
184 // recorded the lock, which might be in the lock itself or in any of
185 // the shared slots. If you can conveniently pass state from lock
186 // acquisition to release then the fastest mechanism is to std::move
187 // the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
188 // lock_shared(Token&) and unlock_shared(Token&)). The guard or token
189 // will tell unlock_shared where in deferredReaders[] to look for the
190 // deferred lock. The Token-less version of unlock_shared() works in all
191 // cases, but is optimized for the common (no inter-thread handoff) case.
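//
// A sketch of that Token-based handoff; the two-thread framing here is
// hypothetical and only illustrates the pairing described above:
//
//   folly::SharedMutex mutex;
//   folly::SharedMutex::Token token;
//
//   // Thread A acquires the shared lock and records where it was noted.
//   mutex.lock_shared(token);
//
//   // ... hand the token (by value) to thread B along with the data ...
//
//   // Thread B releases the same shared lock; the token tells
//   // unlock_shared() which deferredReaders[] slot, if any, to clear.
//   mutex.unlock_shared(token);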
193 // In both read- and write-priority mode, a waiting lock() (exclusive mode)
194 // only blocks readers after it has waited for an active upgrade lock to be
195 // released; until the upgrade lock is released (or upgraded or downgraded)
196 // readers will still be able to enter. Preferences about lock acquisition
197 // are not guaranteed to be enforced perfectly (even if they were, there
198 // is theoretically the chance that a thread could be arbitrarily suspended
199 // between calling lock() and SharedMutex code actually getting executed).
201 // try_*_for methods always try at least once, even if the duration
202 // is zero or negative. The duration type must be compatible with
203 // std::chrono::steady_clock. try_*_until methods also always try at
204 // least once. std::chrono::system_clock and std::chrono::steady_clock
205 // are supported.
207 // If you have observed by profiling that your SharedMutex-s are getting
208 // cache misses on deferredReaders[] due to another SharedMutex user, then
209 // you can use the tag type to create your own instantiation of the type.
210 // The contention threshold (see kNumSharedToStartDeferring) should make
211 // this unnecessary in all but the most extreme cases. Make sure to check
212 // that the increased icache and dcache footprint of the tagged result is
213 // worth it.
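//
// A sketch of such a privately tagged instantiation; the tag struct and
// alias names are arbitrary, and the other template arguments shown are
// just the defaults (write priority, std::atomic):
//
//   struct MySubsystemTag {};
//   using MySubsystemSharedMutex =
//       folly::SharedMutexImpl<false, MySubsystemTag>;
//
// Each distinct Tag_ gets its own static deferredReaders[] array, so
// readers of the tagged type no longer share slots with users of the
// default folly::SharedMutex.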
215 // SharedMutex's use of thread local storage is an optimization, so
216 // for the case where thread local storage is not supported, define it
217 // to be empty.
218 #ifndef FOLLY_SHAREDMUTEX_TLS
219 #if !FOLLY_MOBILE
220 #define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS
221 #else
222 #define FOLLY_SHAREDMUTEX_TLS
223 #endif
224 #endif
226 namespace folly {
228 struct SharedMutexToken {
229 enum class Type : uint16_t {
230 INVALID = 0,
231 INLINE_SHARED,
232 DEFERRED_SHARED,
233 };
235 Type type_;
236 uint16_t slot_;
237 };
239 template <
240 bool ReaderPriority,
241 typename Tag_ = void,
242 template <typename> class Atom = std::atomic,
243 bool BlockImmediately = false>
244 class SharedMutexImpl {
246 static constexpr bool kReaderPriority = ReaderPriority;
249 typedef SharedMutexToken Token;
255 constexpr SharedMutexImpl() noexcept : state_(0) {}
257 SharedMutexImpl(const SharedMutexImpl&) = delete;
258 SharedMutexImpl(SharedMutexImpl&&) = delete;
259 SharedMutexImpl& operator = (const SharedMutexImpl&) = delete;
260 SharedMutexImpl& operator = (SharedMutexImpl&&) = delete;
262 // It is an error to destroy a SharedMutex that still has
263 // any outstanding locks. This is checked if NDEBUG isn't defined.
264 // SharedMutex's exclusive mode can be safely used to guard the lock's
265 // own destruction. If, for example, you acquire the lock in exclusive
266 // mode and then observe that the object containing the lock is no longer
267 // needed, you can unlock() and then immediately destroy the lock.
268 // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
269 // description about why this property needs to be explicitly mentioned.
271 auto state = state_.load(std::memory_order_relaxed);
272 if (UNLIKELY((state & kHasS) != 0)) {
273 cleanupTokenlessSharedDeferred(state);
277 // if a futexWait fails to go to sleep because the value has been
278 // changed, we don't necessarily clean up the wait bits, so it is
279 // possible they will be set here in a correct system
280 assert((state & ~(kWaitingAny | kMayDefer)) == 0);
281 if ((state & kMayDefer) != 0) {
282 for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
283 auto slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
284 assert(!slotValueIsThis(slotValue));
292 (void)lockExclusiveImpl(kHasSolo, ctx);
297 return lockExclusiveImpl(kHasSolo, ctx);
300 template <class Rep, class Period>
301 bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
302 WaitForDuration<Rep, Period> ctx(duration);
303 return lockExclusiveImpl(kHasSolo, ctx);
306 template <class Clock, class Duration>
308 const std::chrono::time_point<Clock, Duration>& absDeadline) {
309 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
310 return lockExclusiveImpl(kHasSolo, ctx);
314 // It is possible that we have a left-over kWaitingNotS if the last
315 // unlock_shared() that let our matching lock() complete finished
316 // releasing before lock()'s futexWait went to sleep. Clean it up now
317 auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE));
318 assert((state & ~kWaitingAny) == 0);
319 wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
322 // Managing the token yourself makes unlock_shared a bit faster
326 (void)lockSharedImpl(nullptr, ctx);
329 void lock_shared(Token& token) {
331 (void)lockSharedImpl(&token, ctx);
334 bool try_lock_shared() {
336 return lockSharedImpl(nullptr, ctx);
339 bool try_lock_shared(Token& token) {
341 return lockSharedImpl(&token, ctx);
344 template <class Rep, class Period>
345 bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) {
346 WaitForDuration<Rep, Period> ctx(duration);
347 return lockSharedImpl(nullptr, ctx);
350 template <class Rep, class Period>
351 bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration,
353 WaitForDuration<Rep, Period> ctx(duration);
354 return lockSharedImpl(&token, ctx);
357 template <class Clock, class Duration>
358 bool try_lock_shared_until(
359 const std::chrono::time_point<Clock, Duration>& absDeadline) {
360 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
361 return lockSharedImpl(nullptr, ctx);
364 template <class Clock, class Duration>
365 bool try_lock_shared_until(
366 const std::chrono::time_point<Clock, Duration>& absDeadline,
368 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
369 return lockSharedImpl(&token, ctx);
372 void unlock_shared() {
373 auto state = state_.load(std::memory_order_acquire);
375 // kPrevDefer can only be set if HasE or BegunE is set
376 assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer);
378 // lock() strips kMayDefer immediately, but then copies it to
379 // kPrevDefer so we can tell if the pre-lock() lock_shared() might
380 // have deferred
381 if ((state & (kMayDefer | kPrevDefer)) == 0 ||
382 !tryUnlockTokenlessSharedDeferred()) {
383 // Matching lock_shared() couldn't have deferred, or the deferred
384 // lock has already been inlined by applyDeferredReaders()
385 unlockSharedInline();
389 void unlock_shared(Token& token) {
390 assert(token.type_ == Token::Type::INLINE_SHARED ||
391 token.type_ == Token::Type::DEFERRED_SHARED);
393 if (token.type_ != Token::Type::DEFERRED_SHARED ||
394 !tryUnlockSharedDeferred(token.slot_)) {
395 unlockSharedInline();
398 token.type_ = Token::Type::INVALID;
402 void unlock_and_lock_shared() {
403 // We can't use state_ -=, because we need to clear 2 bits (1 of which
404 // has an uncertain initial state) and set 1 other. We might as well
405 // clear the relevant wake bits at the same time. Note that since S
406 // doesn't block the beginning of a transition to E (writer priority
407 // can cut off new S, reader priority grabs BegunE and blocks deferred
408 // S) we need to wake E as well.
409 auto state = state_.load(std::memory_order_acquire);
411 assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
412 } while (!state_.compare_exchange_strong(
413 state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS));
414 if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) {
415 futexWakeAll(kWaitingE | kWaitingU | kWaitingS);
419 void unlock_and_lock_shared(Token& token) {
420 unlock_and_lock_shared();
421 token.type_ = Token::Type::INLINE_SHARED;
424 void lock_upgrade() {
426 (void)lockUpgradeImpl(ctx);
429 bool try_lock_upgrade() {
431 return lockUpgradeImpl(ctx);
434 template <class Rep, class Period>
435 bool try_lock_upgrade_for(
436 const std::chrono::duration<Rep, Period>& duration) {
437 WaitForDuration<Rep, Period> ctx(duration);
438 return lockUpgradeImpl(ctx);
441 template <class Clock, class Duration>
442 bool try_lock_upgrade_until(
443 const std::chrono::time_point<Clock, Duration>& absDeadline) {
444 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
445 return lockUpgradeImpl(ctx);
448 void unlock_upgrade() {
449 auto state = (state_ -= kHasU);
450 assert((state & (kWaitingNotS | kHasSolo)) == 0);
451 wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
454 void unlock_upgrade_and_lock() {
455 // no waiting necessary, so waitMask is empty
457 (void)lockExclusiveImpl(0, ctx);
460 void unlock_upgrade_and_lock_shared() {
461 auto state = (state_ -= kHasU - kIncrHasS);
462 assert((state & (kWaitingNotS | kHasSolo)) == 0);
463 wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
466 void unlock_upgrade_and_lock_shared(Token& token) {
467 unlock_upgrade_and_lock_shared();
468 token.type_ = Token::Type::INLINE_SHARED;
471 void unlock_and_lock_upgrade() {
472 // We can't use state_ -=, because we need to clear 2 bits (1 of
473 // which has an uncertain initial state) and set 1 other. We might
474 // as well clear the relevant wake bits at the same time.
475 auto state = state_.load(std::memory_order_acquire);
477 assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
478 uint32_t after =
479 (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU;
480 if (state_.compare_exchange_strong(state, after)) {
481 if ((state & kWaitingS) != 0) {
482 futexWakeAll(kWaitingS);
490 typedef typename folly::detail::Futex<Atom> Futex;
492 // Internally we use four kinds of wait contexts. These are structs
493 // that provide a doWait method that returns true if a futex wake
494 // was issued that intersects with the waitMask, false if there was a
495 // timeout and no more waiting should be performed. Spinning occurs
496 // before the wait context is invoked.
499 bool canBlock() { return true; }
500 bool canTimeOut() { return false; }
501 bool shouldTimeOut() { return false; }
503 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
504 futex.futexWait(expected, waitMask);
510 bool canBlock() { return false; }
511 bool canTimeOut() { return true; }
512 bool shouldTimeOut() { return true; }
514 bool doWait(Futex& /* futex */,
515 uint32_t /* expected */,
516 uint32_t /* waitMask */) {
521 template <class Rep, class Period>
522 struct WaitForDuration {
523 std::chrono::duration<Rep, Period> duration_;
524 bool deadlineComputed_;
525 std::chrono::steady_clock::time_point deadline_;
527 explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration)
528 : duration_(duration), deadlineComputed_(false) {}
530 std::chrono::steady_clock::time_point deadline() {
531 if (!deadlineComputed_) {
532 deadline_ = std::chrono::steady_clock::now() + duration_;
533 deadlineComputed_ = true;
538 bool canBlock() { return duration_.count() > 0; }
539 bool canTimeOut() { return true; }
541 bool shouldTimeOut() {
542 return std::chrono::steady_clock::now() > deadline();
545 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
546 auto result = futex.futexWaitUntil(expected, deadline(), waitMask);
547 return result != folly::detail::FutexResult::TIMEDOUT;
551 template <class Clock, class Duration>
552 struct WaitUntilDeadline {
553 std::chrono::time_point<Clock, Duration> absDeadline_;
555 bool canBlock() { return true; }
556 bool canTimeOut() { return true; }
557 bool shouldTimeOut() { return Clock::now() > absDeadline_; }
559 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
560 auto result = futex.futexWaitUntil(expected, absDeadline_, waitMask);
561 return result != folly::detail::FutexResult::TIMEDOUT;
568 // S count needs to be on the end, because we explicitly allow it to
569 // underflow. This can occur while we are in the middle of applying
570 // deferred locks (we remove them from deferredReaders[] before
571 // inlining them), or during token-less unlock_shared() if a racing
572 // lock_shared();unlock_shared() moves the deferredReaders slot while
573 // the first unlock_shared() is scanning. The former case is cleaned
574 // up before we finish applying the locks. The latter case can persist
575 // until destruction, when it is cleaned up.
576 static constexpr uint32_t kIncrHasS = 1 << 10;
577 static constexpr uint32_t kHasS = ~(kIncrHasS - 1);
579 // If false, then there are definitely no deferred read locks for this
580 // instance. Cleared after initialization and when exclusively locked.
581 static constexpr uint32_t kMayDefer = 1 << 9;
583 // lock() clears kMayDefer as soon as it starts draining readers (so
584 // that it doesn't have to do a second CAS once drain completes), but
585 // unlock_shared() still needs to know whether to scan deferredReaders[]
586 // or not. We copy kMayDefer to kPrevDefer when setting kHasE or
587 // kBegunE, and clear it when clearing those bits.
588 static constexpr uint32_t kPrevDefer = 1 << 8;
590 // Exclusive-locked blocks all read locks and write locks. This bit
591 // may be set before all readers have finished, but in that case the
592 // thread that sets it won't return to the caller until all read locks
593 // have been released.
594 static constexpr uint32_t kHasE = 1 << 7;
596 // Exclusive-draining means that lock() is waiting for existing readers
597 // to leave, but that new readers may still acquire shared access.
598 // This is only used in reader priority mode. New readers during
599 // drain must be inline. The difference between this and kHasU is that
600 // kBegunE prevents kMayDefer from being set.
601 static constexpr uint32_t kBegunE = 1 << 6;
603 // At most one thread may have either exclusive or upgrade lock
604 // ownership. Unlike exclusive mode, ownership of the lock in upgrade
605 // mode doesn't preclude other threads holding the lock in shared mode.
606 // boost's concept for this doesn't explicitly say whether new shared
607 // locks can be acquired once lock_upgrade has succeeded, but doesn't
608 // list that as disallowed. RWSpinLock disallows new read locks after
609 // lock_upgrade has been acquired, but the boost implementation doesn't.
610 // We choose the latter.
611 static constexpr uint32_t kHasU = 1 << 5;
613 // There are three states that we consider to be "solo", in that they
614 // cannot coexist with other solo states. These are kHasE, kBegunE,
615 // and kHasU. Note that S doesn't conflict with any of these, because
616 // setting the kHasE is only one of the two steps needed to actually
617 // acquire the lock in exclusive mode (the other is draining the existing
618 // S holders).
619 static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU;
621 // Once a thread sets kHasE it needs to wait for the current readers
622 // to exit the lock. We give this a separate wait identity from the
623 // waiting to set kHasE so that we can perform partial wakeups (wake
624 // one instead of wake all).
625 static constexpr uint32_t kWaitingNotS = 1 << 4;
627 // When waking writers we can either wake them all, in which case we
628 // can clear kWaitingE, or we can call futexWake(1). futexWake tells
629 // us if anybody woke up, but even if we detect that nobody woke up we
630 // can't clear the bit after the fact without issuing another wakeup.
631 // To avoid thundering herds when there are lots of pending lock()
632 // without needing to call futexWake twice when there is only one
633 // waiter, kWaitingE actually encodes if we have observed multiple
634 // concurrent waiters. Tricky: ABA issues on futexWait mean that when
635 // we see kWaitingESingle we can't assume that there is only one.
636 static constexpr uint32_t kWaitingESingle = 1 << 2;
637 static constexpr uint32_t kWaitingEMultiple = 1 << 3;
638 static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple;
640 // kWaitingU is essentially a 1 bit saturating counter. It always
641 // requires a wakeAll.
642 static constexpr uint32_t kWaitingU = 1 << 1;
644 // All blocked lock_shared() should be awoken, so it is correct (not
645 // suboptimal) to wakeAll if there are any shared readers.
646 static constexpr uint32_t kWaitingS = 1 << 0;
648 // kWaitingAny is a mask of all of the bits that record the state of
649 // threads, rather than the state of the lock. It is convenient to be
650 // able to mask them off during asserts.
651 static constexpr uint32_t kWaitingAny =
652 kWaitingNotS | kWaitingE | kWaitingU | kWaitingS;
654 // The reader count at which a reader will attempt to use the lock
655 // in deferred mode. If this value is 2, then the second concurrent
656 // reader will set kMayDefer and use deferredReaders[]. kMayDefer is
657 // cleared during exclusive access, so this threshold must be reached
658 // each time a lock is held in exclusive mode.
659 static constexpr uint32_t kNumSharedToStartDeferring = 2;
661 // The typical number of spins that a thread will wait for a state
662 // transition. There is no bound on the number of threads that can wait
663 // for a writer, so we are pretty conservative here to limit the chance
664 // that we are starving the writer of CPU. Each spin is 6 or 7 nanos,
665 // almost all of which is in the pause instruction.
666 static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2;
668 // The maximum number of soft yields before falling back to futex.
669 // If the preemption heuristic is activated we will fall back before
670 // this. A soft yield takes ~900 nanos (two sched_yield plus a call
671 // to getrusage, with checks of the goal at each step). Soft yields
672 // aren't compatible with deterministic execution under test (unlike
673 // futexWaitUntil, which has a capricious but deterministic back end).
674 static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0;
676 // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some
677 // level of the memory hierarchy is symmetrically divided into k pieces
678 // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes
679 // that are the same after integer division by k share that resource.
680 // Our strategy for deferred readers is to probe up to numSlots/4 slots,
681 // using the full granularity of AccessSpreader for the start slot
682 // and then search outward. We can use AccessSpreader::current(n)
683 // without managing our own spreader if kMaxDeferredReaders <=
684 // AccessSpreader::kMaxCpus, which is currently 128.
686 // Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
687 // with 64 byte cache lines. That means we need 64*16 bytes of
688 // deferredReaders[] to give each L1 its own playground. On x86_64
689 // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
690 // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If
691 // kDeferredSearchDistance * kDeferredSeparationFactor <=
692 // 64 / 8 then we will search only within a single cache line, which
693 // guarantees we won't have inter-L1 contention. We give ourselves
694 // a factor of 2 on the core count, which should hold us for a couple
695 // processor generations. deferredReaders[] is 2048 bytes currently.
697 static constexpr uint32_t kMaxDeferredReaders = 64;
698 static constexpr uint32_t kDeferredSearchDistance = 2;
699 static constexpr uint32_t kDeferredSeparationFactor = 4;
703 static_assert(!(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
704 "kMaxDeferredReaders must be a power of 2");
705 static_assert(!(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
706 "kDeferredSearchDistance must be a power of 2");
708 // The number of deferred locks that can be simultaneously acquired
709 // by a thread via the token-less methods without performing any heap
710 // allocations. Each of these costs 3 pointers (24 bytes, probably)
711 // per thread. There's not much point in making this larger than
712 // kDeferredSearchDistance.
713 static constexpr uint32_t kTokenStackTLSCapacity = 2;
715 // We need to make sure that if there is a lock_shared()
716 // and lock_shared(token) followed by unlock_shared() and
717 // unlock_shared(token), the token-less unlock doesn't null
718 // out deferredReaders[token.slot_]. If we allowed that, then
719 // unlock_shared(token) wouldn't be able to assume that its lock
720 // had been inlined by applyDeferredReaders when it finds that
721 // deferredReaders[token.slot_] no longer points to this. We accomplish
722 // this by stealing bit 0 from the pointer to record that the slot's
723 // element has no token, hence our use of uintptr_t in deferredReaders[].
724 static constexpr uintptr_t kTokenless = 0x1;
726 // This is the starting location for Token-less unlock_shared().
727 static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot;
729 // Last deferred reader slot used.
730 static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastDeferredReaderSlot;
733 // Only indexes divisible by kDeferredSeparationFactor are used.
734 // If any of those elements points to a SharedMutexImpl, then it
735 // should be considered that there is a shared lock on that instance.
738 typedef Atom<uintptr_t> DeferredReaderSlot;
741 alignas(hardware_destructive_interference_size) static DeferredReaderSlot
742 deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor];
744 // Performs an exclusive lock, waiting for state_ & waitMask to be
745 // 0 first
746 template <class WaitContext>
747 bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) {
748 uint32_t state = state_.load(std::memory_order_acquire);
750 (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 &&
751 state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) {
754 return lockExclusiveImpl(state, preconditionGoalMask, ctx);
758 template <class WaitContext>
759 bool lockExclusiveImpl(uint32_t& state,
760 uint32_t preconditionGoalMask,
763 if (UNLIKELY((state & preconditionGoalMask) != 0) &&
764 !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) &&
769 uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
770 if (!kReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
771 // Block readers immediately, either because we are in write
772 // priority mode or because we can acquire the lock in one
773 // step. Note that if state has kHasU, then we are doing an
774 // unlock_upgrade_and_lock() and we should clear it (reader
775 // priority branch also does this).
776 after |= (state | kHasE) & ~(kHasU | kMayDefer);
778 after |= (state | kBegunE) & ~(kHasU | kMayDefer);
780 if (state_.compare_exchange_strong(state, after)) {
784 // If we set kHasE (writer priority) then no new readers can
785 // arrive. If we set kBegunE then they can still enter, but
786 // they must be inline. Either way we need to either spin on
787 // deferredReaders[] slots, or inline them so that we can wait on
788 // kHasS to zero itself. deferredReaders[] is pointers, which on
789 // x86_64 are bigger than futex() can handle, so we inline the
790 // deferred locks instead of trying to futexWait on each slot.
791 // Readers are responsible for rechecking state_ after recording
792 // a deferred read to avoid atomicity problems between the state_
793 // CAS and applyDeferredReader's reads of deferredReaders[].
794 if (UNLIKELY((before & kMayDefer) != 0)) {
795 applyDeferredReaders(state, ctx);
798 assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0);
799 if (UNLIKELY((state & kHasS) != 0) &&
800 !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) &&
802 // Ugh. We blocked new readers and other writers for a while,
803 // but were unable to complete. Move on. On the plus side
804 // we can clear kWaitingNotS because nobody else can piggyback
805 // on it.
806 state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS));
807 wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
811 if (kReaderPriority && (state & kHasE) == 0) {
812 assert((state & kBegunE) != 0);
813 if (!state_.compare_exchange_strong(state,
814 (state & ~kBegunE) | kHasE)) {
825 template <class WaitContext>
826 bool waitForZeroBits(uint32_t& state,
830 uint32_t spinCount = 0;
832 state = state_.load(std::memory_order_acquire);
833 if ((state & goal) == 0) {
836 asm_volatile_pause();
838 if (UNLIKELY(spinCount >= kMaxSpinCount)) {
839 return ctx.canBlock() &&
840 yieldWaitForZeroBits(state, goal, waitMask, ctx);
845 template <class WaitContext>
846 bool yieldWaitForZeroBits(uint32_t& state,
852 std::memset(&usage, 0, sizeof(usage));
855 for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
857 for (int softState = 0; softState < 3; ++softState) {
859 std::this_thread::yield();
862 getrusage(RUSAGE_THREAD, &usage);
865 if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) {
868 if (ctx.shouldTimeOut()) {
873 if (before >= 0 && usage.ru_nivcsw >= before + 2) {
874 // One involuntary csw might just be occasional background work,
875 // but if we get two in a row then we guess that there is someone
876 // else who can profitably use this CPU. Fall back to futex
879 before = usage.ru_nivcsw;
882 return futexWaitForZeroBits(state, goal, waitMask, ctx);
885 template <class WaitContext>
886 bool futexWaitForZeroBits(uint32_t& state,
890 assert(waitMask == kWaitingNotS || waitMask == kWaitingE ||
891 waitMask == kWaitingU || waitMask == kWaitingS);
894 state = state_.load(std::memory_order_acquire);
895 if ((state & goal) == 0) {
900 if (waitMask == kWaitingE) {
901 if ((state & kWaitingESingle) != 0) {
902 after |= kWaitingEMultiple;
904 after |= kWaitingESingle;
910 // CAS is better than atomic |= here, because it lets us avoid
911 // setting the wait flag when the goal is concurrently achieved
912 if (after != state && !state_.compare_exchange_strong(state, after)) {
916 if (!ctx.doWait(state_, after, waitMask)) {
923 // Wakes up waiters registered in state_ as appropriate, clearing the
924 // awaiting bits for anybody that was awoken. Tries to perform direct
925 // single wakeup of an exclusive waiter if appropriate
926 void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) {
927 if (UNLIKELY((state & wakeMask) != 0)) {
928 wakeRegisteredWaitersImpl(state, wakeMask);
932 void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) {
933 // If there are multiple lock() pending only one of them will actually
934 // get to wake up, so issuing futexWakeAll will make a thundering herd.
935 // There's nothing stopping us from issuing futexWake(1) instead,
936 // so long as the wait bits are still an accurate reflection of
937 // the waiters. If we notice (via futexWake's return value) that
938 // nobody woke up then we can try again with the normal wake-all path.
939 // Note that we can't just clear the bits at that point; we need to
940 // clear the bits and then issue another wakeup.
942 // It is possible that we wake an E waiter but an outside S grabs the
943 // lock instead, at which point we should wake pending U and S waiters.
944 // Rather than tracking state to make the failing E regenerate the
945 // wakeup, we just disable the optimization in the case that there
946 // are waiting U or S that we are eligible to wake.
947 if ((wakeMask & kWaitingE) == kWaitingE &&
948 (state & wakeMask) == kWaitingE &&
949 state_.futexWake(1, kWaitingE) > 0) {
950 // somebody woke up, so leave state_ as is and clear it later
954 if ((state & wakeMask) != 0) {
955 auto prev = state_.fetch_and(~wakeMask);
956 if ((prev & wakeMask) != 0) {
957 futexWakeAll(wakeMask);
959 state = prev & ~wakeMask;
963 void futexWakeAll(uint32_t wakeMask) {
964 state_.futexWake(std::numeric_limits<int>::max(), wakeMask);
967 DeferredReaderSlot* deferredReader(uint32_t slot) {
968 return &deferredReaders[slot * kDeferredSeparationFactor];
971 uintptr_t tokenfulSlotValue() { return reinterpret_cast<uintptr_t>(this); }
973 uintptr_t tokenlessSlotValue() { return tokenfulSlotValue() | kTokenless; }
975 bool slotValueIsThis(uintptr_t slotValue) {
976 return (slotValue & ~kTokenless) == tokenfulSlotValue();
979 // Clears any deferredReaders[] that point to this, adjusting the inline
980 // shared lock count to compensate. Does some spinning and yielding
981 // to avoid the work. Always finishes the application, even if ctx
982 // times out.
983 template <class WaitContext>
984 void applyDeferredReaders(uint32_t& state, WaitContext& ctx) {
987 uint32_t spinCount = 0;
989 while (!slotValueIsThis(
990 deferredReader(slot)->load(std::memory_order_acquire))) {
991 if (++slot == kMaxDeferredReaders) {
995 asm_volatile_pause();
996 if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
997 applyDeferredReaders(state, ctx, slot);
1003 template <class WaitContext>
1004 void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) {
1006 #ifdef RUSAGE_THREAD
1007 struct rusage usage;
1008 std::memset(&usage, 0, sizeof(usage));
1011 for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
1013 for (int softState = 0; softState < 3; ++softState) {
1014 if (softState < 2) {
1015 std::this_thread::yield();
1017 #ifdef RUSAGE_THREAD
1018 getrusage(RUSAGE_THREAD, &usage);
1021 while (!slotValueIsThis(
1022 deferredReader(slot)->load(std::memory_order_acquire))) {
1023 if (++slot == kMaxDeferredReaders) {
1027 if (ctx.shouldTimeOut()) {
1028 // finish applying immediately on timeout
1032 #ifdef RUSAGE_THREAD
1033 if (before >= 0 && usage.ru_nivcsw >= before + 2) {
1034 // heuristic says run queue is not empty
1037 before = usage.ru_nivcsw;
1041 uint32_t movedSlotCount = 0;
1042 for (; slot < kMaxDeferredReaders; ++slot) {
1043 auto slotPtr = deferredReader(slot);
1044 auto slotValue = slotPtr->load(std::memory_order_acquire);
1045 if (slotValueIsThis(slotValue) &&
1046 slotPtr->compare_exchange_strong(slotValue, 0)) {
1051 if (movedSlotCount > 0) {
1052 state = (state_ += movedSlotCount * kIncrHasS);
1054 assert((state & (kHasE | kBegunE)) != 0);
1056 // if state + kIncrHasS overflows (off the end of state) then either
1057 // we have 2^(32-10) readers (almost certainly an application bug)
1058 // or we had an underflow (also a bug)
1059 assert(state < state + kIncrHasS);
1062 // It is straightforward to make a token-less lock_shared() and
1063 // unlock_shared() either by making the token-less version always use
1064 // INLINE_SHARED mode or by removing the token version. Supporting
1065 // deferred operation for both types is trickier than it appears, because
1066 // the purpose of the token is so that unlock_shared doesn't have to
1067 // look in other slots for its deferred lock. Token-less unlock_shared
1068 // might place a deferred lock in one place and then release a different
1069 // slot that was originally used by the token-ful version. If this was
1070 // important we could solve the problem by differentiating the deferred
1071 // locks so that cross-variety release wouldn't occur. The best way
1072 // is probably to steal a bit from the pointer, making deferredReaders[]
1073 // an array of Atom<uintptr_t>.
1075 template <class WaitContext>
1076 bool lockSharedImpl(Token* token, WaitContext& ctx) {
1077 uint32_t state = state_.load(std::memory_order_relaxed);
1078 if ((state & (kHasS | kMayDefer | kHasE)) == 0 &&
1079 state_.compare_exchange_strong(state, state + kIncrHasS)) {
1080 if (token != nullptr) {
1081 token->type_ = Token::Type::INLINE_SHARED;
1085 return lockSharedImpl(state, token, ctx);
1088 template <class WaitContext>
1089 bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx);
1091 // Updates the state in/out argument as if the locks were made inline,
1092 // but does not update state_
1093 void cleanupTokenlessSharedDeferred(uint32_t& state) {
1094 for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
1095 auto slotPtr = deferredReader(i);
1096 auto slotValue = slotPtr->load(std::memory_order_relaxed);
1097 if (slotValue == tokenlessSlotValue()) {
1098 slotPtr->store(0, std::memory_order_relaxed);
1099 state += kIncrHasS;
1100 if ((state & kHasS) == 0) {
1107 bool tryUnlockTokenlessSharedDeferred();
1109 bool tryUnlockSharedDeferred(uint32_t slot) {
1110 assert(slot < kMaxDeferredReaders);
1111 auto slotValue = tokenfulSlotValue();
1112 return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
1115 uint32_t unlockSharedInline() {
1116 uint32_t state = (state_ -= kIncrHasS);
1117 assert((state & (kHasE | kBegunE | kMayDefer)) != 0 ||
1118 state < state + kIncrHasS);
1119 if ((state & kHasS) == 0) {
1120 // Only the second half of lock() can be blocked by a non-zero
1121 // reader count, so that's the only thing we need to wake
1122 wakeRegisteredWaiters(state, kWaitingNotS);
1127 template <class WaitContext>
1128 bool lockUpgradeImpl(WaitContext& ctx) {
1131 if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
1134 } while (!state_.compare_exchange_strong(state, state | kHasU));
1140 ReadHolder() : lock_(nullptr) {}
1143 explicit ReadHolder(const SharedMutexImpl* lock)
1144 : lock_(const_cast<SharedMutexImpl*>(lock)) {
1146 lock_->lock_shared(token_);
1150 explicit ReadHolder(const SharedMutexImpl& lock)
1151 : lock_(const_cast<SharedMutexImpl*>(&lock)) {
1152 lock_->lock_shared(token_);
1155 ReadHolder(ReadHolder&& rhs) noexcept : lock_(rhs.lock_),
1156 token_(rhs.token_) {
1157 rhs.lock_ = nullptr;
1160 // Downgrade from upgrade mode
1161 explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
1162 assert(upgraded.lock_ != nullptr);
1163 upgraded.lock_ = nullptr;
1164 lock_->unlock_upgrade_and_lock_shared(token_);
1167 // Downgrade from exclusive mode
1168 explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1169 assert(writer.lock_ != nullptr);
1170 writer.lock_ = nullptr;
1171 lock_->unlock_and_lock_shared(token_);
1174 ReadHolder& operator=(ReadHolder&& rhs) noexcept {
1175 std::swap(lock_, rhs.lock_);
1176 std::swap(token_, rhs.token_);
1180 ReadHolder(const ReadHolder& rhs) = delete;
1181 ReadHolder& operator=(const ReadHolder& rhs) = delete;
1189 lock_->unlock_shared(token_);
1195 friend class UpgradeHolder;
1196 friend class WriteHolder;
1197 SharedMutexImpl* lock_;
1198 SharedMutexToken token_;
1201 class UpgradeHolder {
1202 UpgradeHolder() : lock_(nullptr) {}
1205 explicit UpgradeHolder(SharedMutexImpl* lock) : lock_(lock) {
1207 lock_->lock_upgrade();
1211 explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
1212 lock_->lock_upgrade();
1215 // Downgrade from exclusive mode
1216 explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1217 assert(writer.lock_ != nullptr);
1218 writer.lock_ = nullptr;
1219 lock_->unlock_and_lock_upgrade();
1222 UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
1223 rhs.lock_ = nullptr;
1226 UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
1227 std::swap(lock_, rhs.lock_);
1231 UpgradeHolder(const UpgradeHolder& rhs) = delete;
1232 UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;
1240 lock_->unlock_upgrade();
1246 friend class WriteHolder;
1247 friend class ReadHolder;
1248 SharedMutexImpl* lock_;
1252 WriteHolder() : lock_(nullptr) {}
1255 explicit WriteHolder(SharedMutexImpl* lock) : lock_(lock) {
1261 explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
1265 // Promotion from upgrade mode
1266 explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
1267 assert(upgrade.lock_ != nullptr);
1268 upgrade.lock_ = nullptr;
1269 lock_->unlock_upgrade_and_lock();
1274 // It is intended that WriteHolder(ReadHolder&& rhs) does not exist.
1276 // Shared locks (read) cannot safely upgrade to unique locks (write).
1277 // That upgrade path is a well-known recipe for deadlock, so we explicitly
1278 // disallow it.
1280 // If you need to do a conditional mutation, you have a few options:
1281 // 1. Check the condition under a shared lock and release it.
1282 // Then maybe check the condition again under a unique lock and maybe do
1283 // the mutation.
1284 // 2. Check the condition once under an upgradeable lock.
1285 // Then maybe upgrade the lock to a unique lock and do the mutation (see the sketch below).
1286 // 3. Check the condition and maybe perform the mutation under a unique
1287 // lock.
1289 // Relevant upgradeable lock notes:
1290 // * At most one upgradeable lock can be held at a time for a given shared
1291 // mutex, just like a unique lock.
1292 // * An upgradeable lock may be held concurrently with any number of shared
1293 // locks.
1294 // * An upgradeable lock may be upgraded atomically to a unique lock.
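//
// A sketch of option 2 from the list above, using the holders defined in
// this class; the checkAndMutate framing and the condition are hypothetical:
//
//   void checkAndMutate(folly::SharedMutex& mutex) {
//     folly::SharedMutex::UpgradeHolder ulock(mutex); // readers may continue
//     if (/* condition that requires the mutation */ true) {
//       // atomically exchange the upgrade lock for the exclusive lock
//       folly::SharedMutex::WriteHolder wlock(std::move(ulock));
//       /* ... perform the mutation ... */
//     }
//   }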
1296 WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
1297 rhs.lock_ = nullptr;
1300 WriteHolder& operator=(WriteHolder&& rhs) noexcept {
1301 std::swap(lock_, rhs.lock_);
1305 WriteHolder(const WriteHolder& rhs) = delete;
1306 WriteHolder& operator=(const WriteHolder& rhs) = delete;
1320 friend class ReadHolder;
1321 friend class UpgradeHolder;
1322 SharedMutexImpl* lock_;
1325 // Adapters for Synchronized<>
1326 friend void acquireRead(SharedMutexImpl& lock) { lock.lock_shared(); }
1327 friend void acquireReadWrite(SharedMutexImpl& lock) { lock.lock(); }
1328 friend void releaseRead(SharedMutexImpl& lock) { lock.unlock_shared(); }
1329 friend void releaseReadWrite(SharedMutexImpl& lock) { lock.unlock(); }
1330 friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) {
1331 return lock.try_lock_shared_for(std::chrono::milliseconds(ms));
1333 friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) {
1334 return lock.try_lock_for(std::chrono::milliseconds(ms));
1338 typedef SharedMutexImpl<true> SharedMutexReadPriority;
1339 typedef SharedMutexImpl<false> SharedMutexWritePriority;
1340 typedef SharedMutexWritePriority SharedMutex;
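// Because SharedMutex provides the std::shared_timed_mutex interface, it
// should also work with the standard lock adapters (<mutex> and
// <shared_mutex> are required); a minimal sketch:
//
//   folly::SharedMutex mutex;
//   {
//     std::shared_lock<folly::SharedMutex> rlock(mutex); // lock_shared()
//     /* ... any number of concurrent readers ... */
//   }
//   {
//     std::unique_lock<folly::SharedMutex> wlock(mutex); // lock()
//     /* ... single exclusive writer ... */
//   }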
1342 // Prevent the compiler from instantiating these in other translation units.
1343 // They are instantiated once in SharedMutex.cpp
1344 extern template class SharedMutexImpl<true>;
1345 extern template class SharedMutexImpl<false>;
1348 bool ReaderPriority,
1350 template <typename> class Atom,
1351 bool BlockImmediately>
1352 alignas(hardware_destructive_interference_size)
1353 typename SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1355 SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1356 deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor] = {};
1359 bool ReaderPriority,
1361 template <typename> class Atom,
1362 bool BlockImmediately>
1363 FOLLY_SHAREDMUTEX_TLS uint32_t
1364 SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1365 tls_lastTokenlessSlot = 0;
1368 bool ReaderPriority,
1370 template <typename> class Atom,
1371 bool BlockImmediately>
1372 FOLLY_SHAREDMUTEX_TLS uint32_t
1373 SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1374 tls_lastDeferredReaderSlot = 0;
1377 bool ReaderPriority,
1379 template <typename> class Atom,
1380 bool BlockImmediately>
1381 bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1382 tryUnlockTokenlessSharedDeferred() {
1383 auto bestSlot = tls_lastTokenlessSlot;
1384 for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
1385 auto slotPtr = deferredReader(bestSlot ^ i);
1386 auto slotValue = slotPtr->load(std::memory_order_relaxed);
1387 if (slotValue == tokenlessSlotValue() &&
1388 slotPtr->compare_exchange_strong(slotValue, 0)) {
1389 tls_lastTokenlessSlot = bestSlot ^ i;
1397 bool ReaderPriority,
1399 template <typename> class Atom,
1400 bool BlockImmediately>
1401 template <class WaitContext>
1402 bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1403 lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
1405 if (UNLIKELY((state & kHasE) != 0) &&
1406 !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
1410 uint32_t slot = tls_lastDeferredReaderSlot;
1411 uintptr_t slotValue = 1; // any non-zero value will do
1413 bool canAlreadyDefer = (state & kMayDefer) != 0;
1414 bool aboveDeferThreshold =
1415 (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
1416 bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
1417 if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
1418 /* Try using the most recent slot first. */
1419 slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
1420 if (slotValue != 0) {
1421 // starting point for our empty-slot search, can change after
1422 // calling waitForZeroBits
1423 uint32_t bestSlot =
1424 (uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders);
1426 // deferred readers are already enabled, or it is time to
1427 // enable them if we can find a slot
1428 for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
1429 slot = bestSlot ^ i;
1430 assert(slot < kMaxDeferredReaders);
1431 slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
1432 if (slotValue == 0) {
1434 tls_lastDeferredReaderSlot = slot;
1441 if (slotValue != 0) {
1442 // not yet deferred, or no empty slots
1443 if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
1444 // successfully recorded the read lock inline
1445 if (token != nullptr) {
1446 token->type_ = Token::Type::INLINE_SHARED;
1450 // state is updated, try again
1454 // record that deferred readers might be in use if necessary
1455 if ((state & kMayDefer) == 0) {
1456 if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
1457 // keep going if CAS failed because somebody else set the bit
1459 if ((state & (kHasE | kMayDefer)) != kMayDefer) {
1463 // state = state | kMayDefer;
1466 // try to use the slot
1467 bool gotSlot = deferredReader(slot)->compare_exchange_strong(
1469 token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());
1471 // If we got the slot, we need to verify that an exclusive lock
1472 // didn't happen since we last checked. If we didn't get the slot we
1473 // need to recheck state_ anyway to make sure we don't waste too much
1474 // work. It is also possible that since we checked state_ someone
1475 // has acquired and released the write lock, clearing kMayDefer.
1476 // Both cases are covered by looking for the readers-possible bit,
1477 // because it is off when the exclusive lock bit is set.
1478 state = state_.load(std::memory_order_acquire);
1484 if (token == nullptr) {
1485 tls_lastTokenlessSlot = slot;
1488 if ((state & kMayDefer) != 0) {
1489 assert((state & kHasE) == 0);
1491 if (token != nullptr) {
1492 token->type_ = Token::Type::DEFERRED_SHARED;
1493 token->slot_ = (uint16_t)slot;
1498 // release the slot before retrying
1499 if (token == nullptr) {
1500 // We can't rely on slot. Token-less slot values can be freed by
1501 // any unlock_shared(), so we need to do the full deferredReader
1502 // search during unlock. Unlike unlock_shared(), we can't trust
1503 // kPrevDefer here. This deferred lock isn't visible to lock()
1504 // (that's the whole reason we're undoing it) so there might have
1505 // subsequently been an unlock() and lock() with no intervening
1506 // transition to deferred mode.
1507 if (!tryUnlockTokenlessSharedDeferred()) {
1508 unlockSharedInline();
1511 if (!tryUnlockSharedDeferred(slot)) {
1512 unlockSharedInline();
1516 // We got here not because the lock was unavailable, but because
1517 // we lost a compare-and-swap. Try-lock is typically allowed to
1518 // have spurious failures, but there is no lock efficiency gain
1519 // from exploiting that freedom here.
1523 } // namespace folly