2 * Copyright 2015-present Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 // @author Nathan Bronson (ngbronson@fb.com)
25 #include <type_traits>
27 #include <folly/Likely.h>
28 #include <folly/concurrency/CacheLocality.h>
29 #include <folly/detail/Futex.h>
30 #include <folly/portability/Asm.h>
31 #include <folly/portability/SysResource.h>
33 // SharedMutex is a reader-writer lock. It is small, very fast, scalable
34 // on multi-core, and suitable for use when readers or writers may block.
35 // Unlike most other reader-writer locks, its throughput with concurrent
36 // readers scales linearly; it is able to acquire and release the lock
37 // in shared mode without cache line ping-ponging. It is suitable for
38 // a wide range of lock hold times because it starts with spinning,
39 // proceeds to using sched_yield with a preemption heuristic, and then
40 // waits using futex and precise wakeups.
42 // SharedMutex provides all of the methods of folly::RWSpinLock,
43 // boost::shared_mutex, boost::upgrade_mutex, and C++14's
44 // std::shared_timed_mutex. All operations that can block are available
45 // in try, try-for, and try-until (system_clock or steady_clock) versions.
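//
// A minimal usage sketch follows; the guarded map and the read/write/
// tryWrite helpers are hypothetical, shown only to illustrate the API:
//
//   folly::SharedMutex mutex;
//   std::map<int, int> cache; // data guarded by mutex
//
//   int read(int key) {
//     folly::SharedMutex::ReadHolder guard(&mutex); // shared (reader) lock
//     auto it = cache.find(key);
//     return it == cache.end() ? -1 : it->second;
//   }
//
//   void write(int key, int value) {
//     folly::SharedMutex::WriteHolder guard(&mutex); // exclusive (writer) lock
//     cache[key] = value;
//   }
//
//   bool tryWrite(int key, int value) {
//     if (!mutex.try_lock_for(std::chrono::milliseconds(10))) {
//       return false; // timed out waiting for exclusive access
//     }
//     cache[key] = value;
//     mutex.unlock();
//     return true;
//   }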
47 // SharedMutexReadPriority gives priority to readers,
48 // SharedMutexWritePriority gives priority to writers. SharedMutex is an
49 // alias for SharedMutexWritePriority, because writer starvation is more
50 // likely than reader starvation for the read-heavy workloads targeted
51 // by SharedMutex.
53 // In my tests SharedMutex is as good or better than the other
54 // reader-writer locks in use at Facebook for almost all use cases,
55 // sometimes by a wide margin. (If it is rare that there are actually
56 // concurrent readers then RWSpinLock can be a few nanoseconds faster.)
57 // I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
58 // boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
59 // spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
60 // (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
61 // It is generally as good or better than the rest when evaluating size,
62 // speed, scalability, or latency outliers. In the corner cases where
63 // it is not the fastest (such as single-threaded use or heavy write
64 // contention) it is never very much worse than the best. See the bottom
65 // of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
67 // Comparison to folly::RWSpinLock:
69 // * SharedMutex is faster than RWSpinLock when there are actually
70 // concurrent read accesses (sometimes much faster), and ~5 nanoseconds
71 // slower when there is not actually any contention. SharedMutex is
72 // faster in every (benchmarked) scenario where the shared mode of
73 // the lock is actually useful.
75 // * Concurrent shared access to SharedMutex scales linearly, while total
76 // RWSpinLock throughput drops as more threads try to access the lock
77 // in shared mode. Under very heavy read contention SharedMutex can
78 // be two orders of magnitude faster than RWSpinLock (or any reader
79 // writer lock that doesn't use striping or deferral).
81 // * SharedMutex can safely protect blocking calls, because after an
82 // initial period of spinning it waits using futex().
84 // * RWSpinLock prioritizes readers, SharedMutex has both reader- and
85 // writer-priority variants, but defaults to write priority.
87 // * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
88 // doesn't. Both semantics are reasonable. The boost documentation
89 // doesn't explicitly talk about this behavior (except by omitting
90 // any statement that those lock modes conflict), but the boost
91 // implementations do allow new readers while the upgradeable mode
92 // is held. See https://github.com/boostorg/thread/blob/master/
93 // include/boost/thread/pthread/shared_mutex.hpp
95 // * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
96 // (UpgradeableHolder would be even more pedantically correct).
97 // SharedMutex's holders have fewer methods (no reset) and are less
98 // tolerant (promotion and downgrade crash if the donor doesn't own
99 // the lock, and you must use the default constructor rather than
100 // passing a nullptr to the pointer constructor).
102 // Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
103 // and "shared" modes. At all times num_threads_holding_exclusive +
104 // num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
105 // 0 || num_threads_holding_shared == 0. RWSpinLock has the additional
106 // constraint that num_threads_holding_shared cannot increase while
107 // num_threads_holding_upgrade is non-zero.
109 // Comparison to the internal RWLock:
111 // * SharedMutex doesn't allow a maximum reader count to be configured,
112 // so it can't be used as a semaphore in the same way as RWLock.
114 // * SharedMutex is 4 bytes, RWLock is 256.
116 // * SharedMutex is as fast or faster than RWLock in all of my
117 // microbenchmarks, and has positive rather than negative scalability.
119 // * RWLock and SharedMutex are both writer priority locks.
121 // * SharedMutex avoids latency outliers as well as RWLock.
123 // * SharedMutex uses different names (t != 0 below):
125 // RWLock::lock(0) => SharedMutex::lock()
127 // RWLock::lock(t) => SharedMutex::try_lock_for(milliseconds(t))
129 // RWLock::tryLock() => SharedMutex::try_lock()
131 // RWLock::unlock() => SharedMutex::unlock()
133 // RWLock::enter(0) => SharedMutex::lock_shared()
135 // RWLock::enter(t) =>
136 // SharedMutex::try_lock_shared_for(milliseconds(t))
138 // RWLock::tryEnter() => SharedMutex::try_lock_shared()
140 // RWLock::leave() => SharedMutex::unlock_shared()
142 // * RWLock allows the reader count to be adjusted by a value other
143 // than 1 during enter() or leave(). SharedMutex doesn't currently
144 // implement this feature.
146 // * RWLock's methods are marked const, SharedMutex's aren't.
148 // Reader-writer locks have the potential to allow concurrent access
149 // to shared read-mostly data, but in practice they often provide no
150 // improvement over a mutex. The problem is the cache coherence protocol
151 // of modern CPUs. Coherence is provided by making sure that when a cache
152 // line is written it is present in only one core's cache. Since a memory
153 // write is required to acquire a reader-writer lock in shared mode, the
154 // cache line holding the lock is invalidated in all of the other caches.
155 // This leads to cache misses when another thread wants to acquire or
156 // release the lock concurrently. When the RWLock is colocated with the
157 // data it protects (common), cache misses can also continue to occur when
158 // a thread that already holds the lock tries to read the protected data.
160 // Ideally, a reader-writer lock would allow multiple cores to acquire
161 // and release the lock in shared mode without incurring any cache misses.
162 // This requires that each core record its shared access in a cache line
163 // that isn't read or written by other read-locking cores. (Writers will
164 // have to check all of the cache lines.) Typical server hardware when
165 // this comment was written has 16 L1 caches and cache lines of 64 bytes,
166 // so a lock striped over all L1 caches would occupy a prohibitive 1024
167 // bytes. Nothing says that we need a separate set of per-core memory
168 // locations for each lock, however. Each SharedMutex instance is only
169 // 4 bytes, but all locks together share a 2K area in which they make a
170 // core-local record of lock acquisitions.
172 // SharedMutex's strategy of using a shared set of core-local stripes has
173 // a potential downside, because it means that acquisition of any lock in
174 // write mode can conflict with acquisition of any lock in shared mode.
175 // If a lock instance doesn't actually experience concurrency then this
176 // downside will outweigh the upside of improved scalability for readers.
177 // To avoid this problem we dynamically detect concurrent accesses to
178 // SharedMutex, and don't start using the deferred mode unless we actually
179 // observe concurrency. See kNumSharedToStartDeferring.
181 // It is explicitly allowed to call unlock_shared() from a different
182 // thread than lock_shared(), so long as they are properly paired.
183 // unlock_shared() needs to find the location at which lock_shared()
184 // recorded the lock, which might be in the lock itself or in any of
185 // the shared slots. If you can conveniently pass state from lock
186 // acquisition to release then the fastest mechanism is to std::move
187 // the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
188 // lock_shared(Token&) and unlock_shared(Token&)). The guard or token
189 // will tell unlock_shared where in deferredReaders[] to look for the
190 // deferred lock. The Token-less version of unlock_shared() works in all
191 // cases, but is optimized for the common (no inter-thread handoff) case.
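//
// A sketch of that Token-based handoff; the two-thread framing here is
// hypothetical and only illustrates the pairing described above:
//
//   folly::SharedMutex mutex;
//   folly::SharedMutex::Token token;
//
//   // Thread A acquires the shared lock and records where it was noted.
//   mutex.lock_shared(token);
//
//   // ... hand the token (by value) to thread B along with the data ...
//
//   // Thread B releases the same shared lock; the token tells
//   // unlock_shared() which deferredReaders[] slot, if any, to clear.
//   mutex.unlock_shared(token);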
193 // In both read- and write-priority mode, a waiting lock() (exclusive mode)
194 // only blocks readers after it has waited for an active upgrade lock to be
195 // released; until the upgrade lock is released (or upgraded or downgraded)
196 // readers will still be able to enter. Preferences about lock acquisition
197 // are not guaranteed to be enforced perfectly (even if they were, there
198 // is theoretically the chance that a thread could be arbitrarily suspended
199 // between calling lock() and SharedMutex code actually getting executed).
201 // try_*_for methods always try at least once, even if the duration
202 // is zero or negative. The duration type must be compatible with
203 // std::chrono::steady_clock. try_*_until methods also always try at
204 // least once. std::chrono::system_clock and std::chrono::steady_clock
205 // are supported.
207 // If you have observed by profiling that your SharedMutex-s are getting
208 // cache misses on deferredReaders[] due to another SharedMutex user, then
209 // you can use the tag type to create your own instantiation of the type.
210 // The contention threshold (see kNumSharedToStartDeferring) should make
211 // this unnecessary in all but the most extreme cases. Make sure to check
212 // that the increased icache and dcache footprint of the tagged result is
213 // worth it.
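//
// A sketch of such a privately tagged instantiation; the tag struct and
// alias names are arbitrary, and the other template arguments shown are
// just the defaults (write priority, std::atomic):
//
//   struct MySubsystemTag {};
//   using MySubsystemSharedMutex =
//       folly::SharedMutexImpl<false, MySubsystemTag>;
//
// Each distinct Tag_ gets its own static deferredReaders[] array, so
// readers of the tagged type no longer share slots with users of the
// default folly::SharedMutex.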
215 // SharedMutex's use of thread local storage is an optimization, so
216 // for the case where thread local storage is not supported, define it
217 // to be empty.
218 #ifndef FOLLY_SHAREDMUTEX_TLS
219 #if !FOLLY_MOBILE
220 #define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS
221 #else
222 #define FOLLY_SHAREDMUTEX_TLS
223 #endif
224 #endif
226 namespace folly {
228 struct SharedMutexToken {
229 enum class Type : uint16_t {
230 INVALID = 0,
231 INLINE_SHARED,
232 DEFERRED_SHARED,
233 };
235 Type type_;
236 uint16_t slot_;
237 };
239 template <
240 bool ReaderPriority,
241 typename Tag_ = void,
242 template <typename> class Atom = std::atomic,
243 bool BlockImmediately = false>
244 class SharedMutexImpl {
246 static constexpr bool kReaderPriority = ReaderPriority;
249 typedef SharedMutexToken Token;
255 constexpr SharedMutexImpl() noexcept : state_(0) {}
257 SharedMutexImpl(const SharedMutexImpl&) = delete;
258 SharedMutexImpl(SharedMutexImpl&&) = delete;
259 SharedMutexImpl& operator = (const SharedMutexImpl&) = delete;
260 SharedMutexImpl& operator = (SharedMutexImpl&&) = delete;
262 // It is an error to destroy a SharedMutex that still has
263 // any outstanding locks. This is checked if NDEBUG isn't defined.
264 // SharedMutex's exclusive mode can be safely used to guard the lock's
265 // own destruction. If, for example, you acquire the lock in exclusive
266 // mode and then observe that the object containing the lock is no longer
267 // needed, you can unlock() and then immediately destroy the lock.
268 // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
269 // description about why this property needs to be explicitly mentioned.
271 auto state = state_.load(std::memory_order_relaxed);
272 if (UNLIKELY((state & kHasS) != 0)) {
273 cleanupTokenlessSharedDeferred(state);
277 // if a futexWait fails to go to sleep because the value has been
278 // changed, we don't necessarily clean up the wait bits, so it is
279 // possible they will be set here in a correct system
280 assert((state & ~(kWaitingAny | kMayDefer)) == 0);
281 if ((state & kMayDefer) != 0) {
282 for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
283 auto slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
284 assert(!slotValueIsThis(slotValue));
292 (void)lockExclusiveImpl(kHasSolo, ctx);
297 return lockExclusiveImpl(kHasSolo, ctx);
300 template <class Rep, class Period>
301 bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
302 WaitForDuration<Rep, Period> ctx(duration);
303 return lockExclusiveImpl(kHasSolo, ctx);
306 template <class Clock, class Duration>
308 const std::chrono::time_point<Clock, Duration>& absDeadline) {
309 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
310 return lockExclusiveImpl(kHasSolo, ctx);
314 // It is possible that we have a left-over kWaitingNotS if the last
315 // unlock_shared() that let our matching lock() complete finished
316 // releasing before lock()'s futexWait went to sleep. Clean it up now
317 auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE));
318 assert((state & ~kWaitingAny) == 0);
319 wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
322 // Managing the token yourself makes unlock_shared a bit faster
326 (void)lockSharedImpl(nullptr, ctx);
329 void lock_shared(Token& token) {
331 (void)lockSharedImpl(&token, ctx);
334 bool try_lock_shared() {
336 return lockSharedImpl(nullptr, ctx);
339 bool try_lock_shared(Token& token) {
341 return lockSharedImpl(&token, ctx);
344 template <class Rep, class Period>
345 bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) {
346 WaitForDuration<Rep, Period> ctx(duration);
347 return lockSharedImpl(nullptr, ctx);
350 template <class Rep, class Period>
351 bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration,
353 WaitForDuration<Rep, Period> ctx(duration);
354 return lockSharedImpl(&token, ctx);
357 template <class Clock, class Duration>
358 bool try_lock_shared_until(
359 const std::chrono::time_point<Clock, Duration>& absDeadline) {
360 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
361 return lockSharedImpl(nullptr, ctx);
364 template <class Clock, class Duration>
365 bool try_lock_shared_until(
366 const std::chrono::time_point<Clock, Duration>& absDeadline,
368 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
369 return lockSharedImpl(&token, ctx);
372 void unlock_shared() {
373 auto state = state_.load(std::memory_order_acquire);
375 // kPrevDefer can only be set if HasE or BegunE is set
376 assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer);
378 // lock() strips kMayDefer immediately, but then copies it to
379 // kPrevDefer so we can tell if the pre-lock() lock_shared() might
380 // have deferred
381 if ((state & (kMayDefer | kPrevDefer)) == 0 ||
382 !tryUnlockTokenlessSharedDeferred()) {
383 // Matching lock_shared() couldn't have deferred, or the deferred
384 // lock has already been inlined by applyDeferredReaders()
385 unlockSharedInline();
389 void unlock_shared(Token& token) {
390 assert(token.type_ == Token::Type::INLINE_SHARED ||
391 token.type_ == Token::Type::DEFERRED_SHARED);
393 if (token.type_ != Token::Type::DEFERRED_SHARED ||
394 !tryUnlockSharedDeferred(token.slot_)) {
395 unlockSharedInline();
398 token.type_ = Token::Type::INVALID;
402 void unlock_and_lock_shared() {
403 // We can't use state_ -=, because we need to clear 2 bits (1 of which
404 // has an uncertain initial state) and set 1 other. We might as well
405 // clear the relevant wake bits at the same time. Note that since S
406 // doesn't block the beginning of a transition to E (writer priority
407 // can cut off new S, reader priority grabs BegunE and blocks deferred
408 // S) we need to wake E as well.
409 auto state = state_.load(std::memory_order_acquire);
411 assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
412 } while (!state_.compare_exchange_strong(
413 state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS));
414 if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) {
415 futexWakeAll(kWaitingE | kWaitingU | kWaitingS);
419 void unlock_and_lock_shared(Token& token) {
420 unlock_and_lock_shared();
421 token.type_ = Token::Type::INLINE_SHARED;
424 void lock_upgrade() {
426 (void)lockUpgradeImpl(ctx);
429 bool try_lock_upgrade() {
431 return lockUpgradeImpl(ctx);
434 template <class Rep, class Period>
435 bool try_lock_upgrade_for(
436 const std::chrono::duration<Rep, Period>& duration) {
437 WaitForDuration<Rep, Period> ctx(duration);
438 return lockUpgradeImpl(ctx);
441 template <class Clock, class Duration>
442 bool try_lock_upgrade_until(
443 const std::chrono::time_point<Clock, Duration>& absDeadline) {
444 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
445 return lockUpgradeImpl(ctx);
448 void unlock_upgrade() {
449 auto state = (state_ -= kHasU);
450 assert((state & (kWaitingNotS | kHasSolo)) == 0);
451 wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
454 void unlock_upgrade_and_lock() {
455 // no waiting necessary, so waitMask is empty
457 (void)lockExclusiveImpl(0, ctx);
460 void unlock_upgrade_and_lock_shared() {
461 auto state = (state_ -= kHasU - kIncrHasS);
462 assert((state & (kWaitingNotS | kHasSolo)) == 0);
463 wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
466 void unlock_upgrade_and_lock_shared(Token& token) {
467 unlock_upgrade_and_lock_shared();
468 token.type_ = Token::Type::INLINE_SHARED;
471 void unlock_and_lock_upgrade() {
472 // We can't use state_ -=, because we need to clear 2 bits (1 of
473 // which has an uncertain initial state) and set 1 other. We might
474 // as well clear the relevant wake bits at the same time.
475 auto state = state_.load(std::memory_order_acquire);
477 assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
478 uint32_t after =
479 (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU;
480 if (state_.compare_exchange_strong(state, after)) {
481 if ((state & kWaitingS) != 0) {
482 futexWakeAll(kWaitingS);
490 typedef typename folly::detail::Futex<Atom> Futex;
492 // Internally we use four kinds of wait contexts. These are structs
493 // that provide a doWait method that returns true if a futex wake
494 // was issued that intersects with the waitMask, false if there was a
495 // timeout and no more waiting should be performed. Spinning occurs
496 // before the wait context is invoked.
499 bool canBlock() { return true; }
500 bool canTimeOut() { return false; }
501 bool shouldTimeOut() { return false; }
503 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
504 futex.futexWait(expected, waitMask);
510 bool canBlock() { return false; }
511 bool canTimeOut() { return true; }
512 bool shouldTimeOut() { return true; }
514 bool doWait(Futex& /* futex */,
515 uint32_t /* expected */,
516 uint32_t /* waitMask */) {
521 template <class Rep, class Period>
522 struct WaitForDuration {
523 std::chrono::duration<Rep, Period> duration_;
524 bool deadlineComputed_;
525 std::chrono::steady_clock::time_point deadline_;
527 explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration)
528 : duration_(duration), deadlineComputed_(false) {}
530 std::chrono::steady_clock::time_point deadline() {
531 if (!deadlineComputed_) {
532 deadline_ = std::chrono::steady_clock::now() + duration_;
533 deadlineComputed_ = true;
538 bool canBlock() { return duration_.count() > 0; }
539 bool canTimeOut() { return true; }
541 bool shouldTimeOut() {
542 return std::chrono::steady_clock::now() > deadline();
545 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
546 auto result = futex.futexWaitUntil(expected, deadline(), waitMask);
547 return result != folly::detail::FutexResult::TIMEDOUT;
551 template <class Clock, class Duration>
552 struct WaitUntilDeadline {
553 std::chrono::time_point<Clock, Duration> absDeadline_;
555 bool canBlock() { return true; }
556 bool canTimeOut() { return true; }
557 bool shouldTimeOut() { return Clock::now() > absDeadline_; }
559 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
560 auto result = futex.futexWaitUntil(expected, absDeadline_, waitMask);
561 return result != folly::detail::FutexResult::TIMEDOUT;
568 // S count needs to be on the end, because we explicitly allow it to
569 // underflow. This can occur while we are in the middle of applying
570 // deferred locks (we remove them from deferredReaders[] before
571 // inlining them), or during token-less unlock_shared() if a racing
572 // lock_shared();unlock_shared() moves the deferredReaders slot while
573 // the first unlock_shared() is scanning. The former case is cleaned
574 // up before we finish applying the locks. The latter case can persist
575 // until destruction, when it is cleaned up.
576 static constexpr uint32_t kIncrHasS = 1 << 10;
577 static constexpr uint32_t kHasS = ~(kIncrHasS - 1);
579 // If false, then there are definitely no deferred read locks for this
580 // instance. Cleared after initialization and when exclusively locked.
581 static constexpr uint32_t kMayDefer = 1 << 9;
583 // lock() clears kMayDefer as soon as it starts draining readers (so
584 // that it doesn't have to do a second CAS once drain completes), but
585 // unlock_shared() still needs to know whether to scan deferredReaders[]
586 // or not. We copy kMayDefer to kPrevDefer when setting kHasE or
587 // kBegunE, and clear it when clearing those bits.
588 static constexpr uint32_t kPrevDefer = 1 << 8;
590 // Exclusive-locked blocks all read locks and write locks. This bit
591 // may be set before all readers have finished, but in that case the
592 // thread that sets it won't return to the caller until all read locks
593 // have been released.
594 static constexpr uint32_t kHasE = 1 << 7;
596 // Exclusive-draining means that lock() is waiting for existing readers
597 // to leave, but that new readers may still acquire shared access.
598 // This is only used in reader priority mode. New readers during
599 // drain must be inline. The difference between this and kHasU is that
600 // kBegunE prevents kMayDefer from being set.
601 static constexpr uint32_t kBegunE = 1 << 6;
603 // At most one thread may have either exclusive or upgrade lock
604 // ownership. Unlike exclusive mode, ownership of the lock in upgrade
605 // mode doesn't preclude other threads holding the lock in shared mode.
606 // boost's concept for this doesn't explicitly say whether new shared
607 // locks can be acquired once lock_upgrade has succeeded, but doesn't
608 // list that as disallowed. RWSpinLock disallows new read locks after
609 // lock_upgrade has been acquired, but the boost implementation doesn't.
610 // We choose the latter.
611 static constexpr uint32_t kHasU = 1 << 5;
613 // There are three states that we consider to be "solo", in that they
614 // cannot coexist with other solo states. These are kHasE, kBegunE,
615 // and kHasU. Note that S doesn't conflict with any of these, because
616 // setting the kHasE is only one of the two steps needed to actually
617 // acquire the lock in exclusive mode (the other is draining the existing
618 // S holders).
619 static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU;
621 // Once a thread sets kHasE it needs to wait for the current readers
622 // to exit the lock. We give this a separate wait identity from the
623 // waiting to set kHasE so that we can perform partial wakeups (wake
624 // one instead of wake all).
625 static constexpr uint32_t kWaitingNotS = 1 << 4;
627 // When waking writers we can either wake them all, in which case we
628 // can clear kWaitingE, or we can call futexWake(1). futexWake tells
629 // us if anybody woke up, but even if we detect that nobody woke up we
630 // can't clear the bit after the fact without issuing another wakeup.
631 // To avoid thundering herds when there are lots of pending lock()
632 // without needing to call futexWake twice when there is only one
633 // waiter, kWaitingE actually encodes if we have observed multiple
634 // concurrent waiters. Tricky: ABA issues on futexWait mean that when
635 // we see kWaitingESingle we can't assume that there is only one.
636 static constexpr uint32_t kWaitingESingle = 1 << 2;
637 static constexpr uint32_t kWaitingEMultiple = 1 << 3;
638 static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple;
640 // kWaitingU is essentially a 1 bit saturating counter. It always
641 // requires a wakeAll.
642 static constexpr uint32_t kWaitingU = 1 << 1;
644 // All blocked lock_shared() should be awoken, so it is correct (not
645 // suboptimal) to wakeAll if there are any shared readers.
646 static constexpr uint32_t kWaitingS = 1 << 0;
648 // kWaitingAny is a mask of all of the bits that record the state of
649 // threads, rather than the state of the lock. It is convenient to be
650 // able to mask them off during asserts.
651 static constexpr uint32_t kWaitingAny =
652 kWaitingNotS | kWaitingE | kWaitingU | kWaitingS;
654 // The reader count at which a reader will attempt to use the lock
655 // in deferred mode. If this value is 2, then the second concurrent
656 // reader will set kMayDefer and use deferredReaders[]. kMayDefer is
657 // cleared during exclusive access, so this threshold must be reached
658 // each time a lock is held in exclusive mode.
659 static constexpr uint32_t kNumSharedToStartDeferring = 2;
661 // The typical number of spins that a thread will wait for a state
662 // transition. There is no bound on the number of threads that can wait
663 // for a writer, so we are pretty conservative here to limit the chance
664 // that we are starving the writer of CPU. Each spin is 6 or 7 nanos,
665 // almost all of which is in the pause instruction.
666 static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2;
668 // The maximum number of soft yields before falling back to futex.
669 // If the preemption heuristic is activated we will fall back before
670 // this. A soft yield takes ~900 nanos (two sched_yield plus a call
671 // to getrusage, with checks of the goal at each step). Soft yields
672 // aren't compatible with deterministic execution under test (unlike
673 // futexWaitUntil, which has a capricious but deterministic back end).
674 static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0;
676 // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some
677 // level of the memory hierarchy is symmetrically divided into k pieces
678 // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes
679 // that are the same after integer division by k share that resource.
680 // Our strategy for deferred readers is to probe up to numSlots/4 slots,
681 // using the full granularity of AccessSpreader for the start slot
682 // and then search outward. We can use AccessSpreader::current(n)
683 // without managing our own spreader if kMaxDeferredReaders <=
684 // AccessSpreader::kMaxCpus, which is currently 128.
686 // Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
687 // with 64 byte cache lines. That means we need 64*16 bytes of
688 // deferredReaders[] to give each L1 its own playground. On x86_64
689 // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
690 // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If
691 // kDeferredSearchDistance * kDeferredSeparationFactor <=
692 // 64 / 8 then we will search only within a single cache line, which
693 // guarantees we won't have inter-L1 contention. We give ourselves
694 // a factor of 2 on the core count, which should hold us for a couple
695 // processor generations. deferredReaders[] is 2048 bytes currently.
697 static constexpr uint32_t kMaxDeferredReaders = 64;
698 static constexpr uint32_t kDeferredSearchDistance = 2;
699 static constexpr uint32_t kDeferredSeparationFactor = 4;
703 static_assert(!(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
704 "kMaxDeferredReaders must be a power of 2");
705 static_assert(!(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
706 "kDeferredSearchDistance must be a power of 2");
708 // The number of deferred locks that can be simultaneously acquired
709 // by a thread via the token-less methods without performing any heap
710 // allocations. Each of these costs 3 pointers (24 bytes, probably)
711 // per thread. There's not much point in making this larger than
712 // kDeferredSearchDistance.
713 static constexpr uint32_t kTokenStackTLSCapacity = 2;
715 // We need to make sure that if there is a lock_shared()
716 // and lock_shared(token) followed by unlock_shared() and
717 // unlock_shared(token), the token-less unlock doesn't null
718 // out deferredReaders[token.slot_]. If we allowed that, then
719 // unlock_shared(token) wouldn't be able to assume that its lock
720 // had been inlined by applyDeferredReaders when it finds that
721 // deferredReaders[token.slot_] no longer points to this. We accomplish
722 // this by stealing bit 0 from the pointer to record that the slot's
723 // element has no token, hence our use of uintptr_t in deferredReaders[].
724 static constexpr uintptr_t kTokenless = 0x1;
726 // This is the starting location for Token-less unlock_shared().
727 static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot;
729 // Last deferred reader slot used.
730 static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastDeferredReaderSlot;
733 // Only indexes divisible by kDeferredSeparationFactor are used.
734 // If any of those elements points to a SharedMutexImpl, then it
735 // should be considered that there is a shared lock on that instance.
738 typedef Atom<uintptr_t> DeferredReaderSlot;
741 alignas(hardware_destructive_interference_size) static DeferredReaderSlot
742 deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor];
744 // Performs an exclusive lock, waiting for state_ & waitMask to be
745 // 0 first
746 template <class WaitContext>
747 bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) {
748 uint32_t state = state_.load(std::memory_order_acquire);
750 (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 &&
751 state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) {
754 return lockExclusiveImpl(state, preconditionGoalMask, ctx);
758 template <class WaitContext>
759 bool lockExclusiveImpl(uint32_t& state,
760 uint32_t preconditionGoalMask,
763 if (UNLIKELY((state & preconditionGoalMask) != 0) &&
764 !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) &&
769 uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
770 if (!kReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
771 // Block readers immediately, either because we are in write
772 // priority mode or because we can acquire the lock in one
773 // step. Note that if state has kHasU, then we are doing an
774 // unlock_upgrade_and_lock() and we should clear it (reader
775 // priority branch also does this).
776 after |= (state | kHasE) & ~(kHasU | kMayDefer);
778 after |= (state | kBegunE) & ~(kHasU | kMayDefer);
780 if (state_.compare_exchange_strong(state, after)) {
784 // If we set kHasE (writer priority) then no new readers can
785 // arrive. If we set kBegunE then they can still enter, but
786 // they must be inline. Either way we need to either spin on
787 // deferredReaders[] slots, or inline them so that we can wait on
788 // kHasS to zero itself. deferredReaders[] is pointers, which on
789 // x86_64 are bigger than futex() can handle, so we inline the
790 // deferred locks instead of trying to futexWait on each slot.
791 // Readers are responsible for rechecking state_ after recording
792 // a deferred read to avoid atomicity problems between the state_
793 // CAS and applyDeferredReader's reads of deferredReaders[].
794 if (UNLIKELY((before & kMayDefer) != 0)) {
795 applyDeferredReaders(state, ctx);
798 assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0);
799 if (UNLIKELY((state & kHasS) != 0) &&
800 !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) &&
802 // Ugh. We blocked new readers and other writers for a while,
803 // but were unable to complete. Move on. On the plus side
804 // we can clear kWaitingNotS because nobody else can piggyback
805 // on it.
806 state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS));
807 wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
811 if (kReaderPriority && (state & kHasE) == 0) {
812 assert((state & kBegunE) != 0);
813 if (!state_.compare_exchange_strong(state,
814 (state & ~kBegunE) | kHasE)) {
825 template <class WaitContext>
826 bool waitForZeroBits(uint32_t& state,
830 uint32_t spinCount = 0;
832 state = state_.load(std::memory_order_acquire);
833 if ((state & goal) == 0) {
836 asm_volatile_pause();
838 if (UNLIKELY(spinCount >= kMaxSpinCount)) {
839 return ctx.canBlock() &&
840 yieldWaitForZeroBits(state, goal, waitMask, ctx);
845 template <class WaitContext>
846 bool yieldWaitForZeroBits(uint32_t& state,
852 std::memset(&usage, 0, sizeof(usage));
855 for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
857 for (int softState = 0; softState < 3; ++softState) {
859 std::this_thread::yield();
862 getrusage(RUSAGE_THREAD, &usage);
865 if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) {
868 if (ctx.shouldTimeOut()) {
873 if (before >= 0 && usage.ru_nivcsw >= before + 2) {
874 // One involuntary csw might just be occasional background work,
875 // but if we get two in a row then we guess that there is someone
876 // else who can profitably use this CPU. Fall back to futex
879 before = usage.ru_nivcsw;
882 return futexWaitForZeroBits(state, goal, waitMask, ctx);
885 template <class WaitContext>
886 bool futexWaitForZeroBits(uint32_t& state,
890 assert(waitMask == kWaitingNotS || waitMask == kWaitingE ||
891 waitMask == kWaitingU || waitMask == kWaitingS);
894 state = state_.load(std::memory_order_acquire);
895 if ((state & goal) == 0) {
900 if (waitMask == kWaitingE) {
901 if ((state & kWaitingESingle) != 0) {
902 after |= kWaitingEMultiple;
904 after |= kWaitingESingle;
910 // CAS is better than atomic |= here, because it lets us avoid
911 // setting the wait flag when the goal is concurrently achieved
912 if (after != state && !state_.compare_exchange_strong(state, after)) {
916 if (!ctx.doWait(state_, after, waitMask)) {
923 // Wakes up waiters registered in state_ as appropriate, clearing the
924 // awaiting bits for anybody that was awoken. Tries to perform direct
925 // single wakeup of an exclusive waiter if appropriate
926 void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) {
927 if (UNLIKELY((state & wakeMask) != 0)) {
928 wakeRegisteredWaitersImpl(state, wakeMask);
932 void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) {
933 // If there are multiple lock() pending only one of them will actually
934 // get to wake up, so issuing futexWakeAll will make a thundering herd.
935 // There's nothing stopping us from issuing futexWake(1) instead,
936 // so long as the wait bits are still an accurate reflection of
937 // the waiters. If we notice (via futexWake's return value) that
938 // nobody woke up then we can try again with the normal wake-all path.
939 // Note that we can't just clear the bits at that point; we need to
940 // clear the bits and then issue another wakeup.
942 // It is possible that we wake an E waiter but an outside S grabs the
943 // lock instead, at which point we should wake pending U and S waiters.
944 // Rather than tracking state to make the failing E regenerate the
945 // wakeup, we just disable the optimization in the case that there
946 // are waiting U or S that we are eligible to wake.
947 if ((wakeMask & kWaitingE) == kWaitingE &&
948 (state & wakeMask) == kWaitingE &&
949 state_.futexWake(1, kWaitingE) > 0) {
950 // somebody woke up, so leave state_ as is and clear it later
954 if ((state & wakeMask) != 0) {
955 auto prev = state_.fetch_and(~wakeMask);
956 if ((prev & wakeMask) != 0) {
957 futexWakeAll(wakeMask);
959 state = prev & ~wakeMask;
963 void futexWakeAll(uint32_t wakeMask) {
964 state_.futexWake(std::numeric_limits<int>::max(), wakeMask);
967 DeferredReaderSlot* deferredReader(uint32_t slot) {
968 return &deferredReaders[slot * kDeferredSeparationFactor];
971 uintptr_t tokenfulSlotValue() { return reinterpret_cast<uintptr_t>(this); }
973 uintptr_t tokenlessSlotValue() { return tokenfulSlotValue() | kTokenless; }
975 bool slotValueIsThis(uintptr_t slotValue) {
976 return (slotValue & ~kTokenless) == tokenfulSlotValue();
979 // Clears any deferredReaders[] that point to this, adjusting the inline
980 // shared lock count to compensate. Does some spinning and yielding
981 // to avoid the work. Always finishes the application, even if ctx
982 // times out.
983 template <class WaitContext>
984 void applyDeferredReaders(uint32_t& state, WaitContext& ctx) {
987 uint32_t spinCount = 0;
989 while (!slotValueIsThis(
990 deferredReader(slot)->load(std::memory_order_acquire))) {
991 if (++slot == kMaxDeferredReaders) {
995 asm_volatile_pause();
996 if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
997 applyDeferredReaders(state, ctx, slot);
1003 template <class WaitContext>
1004 void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) {
1006 #ifdef RUSAGE_THREAD
1007 struct rusage usage;
1008 std::memset(&usage, 0, sizeof(usage));
1011 for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
1013 for (int softState = 0; softState < 3; ++softState) {
1014 if (softState < 2) {
1015 std::this_thread::yield();
1017 #ifdef RUSAGE_THREAD
1018 getrusage(RUSAGE_THREAD, &usage);
1021 while (!slotValueIsThis(
1022 deferredReader(slot)->load(std::memory_order_acquire))) {
1023 if (++slot == kMaxDeferredReaders) {
1027 if (ctx.shouldTimeOut()) {
1028 // finish applying immediately on timeout
1032 #ifdef RUSAGE_THREAD
1033 if (before >= 0 && usage.ru_nivcsw >= before + 2) {
1034 // heuristic says run queue is not empty
1037 before = usage.ru_nivcsw;
1041 uint32_t movedSlotCount = 0;
1042 for (; slot < kMaxDeferredReaders; ++slot) {
1043 auto slotPtr = deferredReader(slot);
1044 auto slotValue = slotPtr->load(std::memory_order_acquire);
1045 if (slotValueIsThis(slotValue) &&
1046 slotPtr->compare_exchange_strong(slotValue, 0)) {
1051 if (movedSlotCount > 0) {
1052 state = (state_ += movedSlotCount * kIncrHasS);
1054 assert((state & (kHasE | kBegunE)) != 0);
1056 // if state + kIncrHasS overflows (off the end of state) then either
1057 // we have 2^(32-10) readers (almost certainly an application bug)
1058 // or we had an underflow (also a bug)
1059 assert(state < state + kIncrHasS);
1062 // It is straightforward to make a token-less lock_shared() and
1063 // unlock_shared() either by making the token-less version always use
1064 // INLINE_SHARED mode or by removing the token version. Supporting
1065 // deferred operation for both types is trickier than it appears, because
1066 // the purpose of the token is so that unlock_shared doesn't have to
1067 // look in other slots for its deferred lock. Token-less unlock_shared
1068 // might place a deferred lock in one place and then release a different
1069 // slot that was originally used by the token-ful version. If this was
1070 // important we could solve the problem by differentiating the deferred
1071 // locks so that cross-variety release wouldn't occur. The best way
1072 // is probably to steal a bit from the pointer, making deferredReaders[]
1073 // an array of Atom<uintptr_t>.
1075 template <class WaitContext>
1076 bool lockSharedImpl(Token* token, WaitContext& ctx) {
1077 uint32_t state = state_.load(std::memory_order_relaxed);
1078 if ((state & (kHasS | kMayDefer | kHasE)) == 0 &&
1079 state_.compare_exchange_strong(state, state + kIncrHasS)) {
1080 if (token != nullptr) {
1081 token->type_ = Token::Type::INLINE_SHARED;
1085 return lockSharedImpl(state, token, ctx);
1088 template <class WaitContext>
1089 bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx);
1091 // Updates the state in/out argument as if the locks were made inline,
1092 // but does not update state_
1093 void cleanupTokenlessSharedDeferred(uint32_t& state) {
1094 for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
1095 auto slotPtr = deferredReader(i);
1096 auto slotValue = slotPtr->load(std::memory_order_relaxed);
1097 if (slotValue == tokenlessSlotValue()) {
1098 slotPtr->store(0, std::memory_order_relaxed);
1099 state += kIncrHasS;
1100 if ((state & kHasS) == 0) {
1107 bool tryUnlockTokenlessSharedDeferred();
1109 bool tryUnlockSharedDeferred(uint32_t slot) {
1110 assert(slot < kMaxDeferredReaders);
1111 auto slotValue = tokenfulSlotValue();
1112 return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
1115 uint32_t unlockSharedInline() {
1116 uint32_t state = (state_ -= kIncrHasS);
1117 assert((state & (kHasE | kBegunE | kMayDefer)) != 0 ||
1118 state < state + kIncrHasS);
1119 if ((state & kHasS) == 0) {
1120 // Only the second half of lock() can be blocked by a non-zero
1121 // reader count, so that's the only thing we need to wake
1122 wakeRegisteredWaiters(state, kWaitingNotS);
1127 template <class WaitContext>
1128 bool lockUpgradeImpl(WaitContext& ctx) {
1131 if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
1134 } while (!state_.compare_exchange_strong(state, state | kHasU));
1140 ReadHolder() : lock_(nullptr) {}
1143 explicit ReadHolder(const SharedMutexImpl* lock)
1144 : lock_(const_cast<SharedMutexImpl*>(lock)) {
1146 lock_->lock_shared(token_);
1150 explicit ReadHolder(const SharedMutexImpl& lock)
1151 : lock_(const_cast<SharedMutexImpl*>(&lock)) {
1152 lock_->lock_shared(token_);
1155 ReadHolder(ReadHolder&& rhs) noexcept : lock_(rhs.lock_),
1156 token_(rhs.token_) {
1157 rhs.lock_ = nullptr;
1160 // Downgrade from upgrade mode
1161 explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
1162 assert(upgraded.lock_ != nullptr);
1163 upgraded.lock_ = nullptr;
1164 lock_->unlock_upgrade_and_lock_shared(token_);
1167 // Downgrade from exclusive mode
1168 explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1169 assert(writer.lock_ != nullptr);
1170 writer.lock_ = nullptr;
1171 lock_->unlock_and_lock_shared(token_);
1174 ReadHolder& operator=(ReadHolder&& rhs) noexcept {
1175 std::swap(lock_, rhs.lock_);
1176 std::swap(token_, rhs.token_);
1180 ReadHolder(const ReadHolder& rhs) = delete;
1181 ReadHolder& operator=(const ReadHolder& rhs) = delete;
1189 lock_->unlock_shared(token_);
1195 friend class UpgradeHolder;
1196 friend class WriteHolder;
1197 SharedMutexImpl* lock_;
1198 SharedMutexToken token_;
1201 class UpgradeHolder {
1202 UpgradeHolder() : lock_(nullptr) {}
1205 explicit UpgradeHolder(SharedMutexImpl* lock) : lock_(lock) {
1207 lock_->lock_upgrade();
1211 explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
1212 lock_->lock_upgrade();
1215 // Downgrade from exclusive mode
1216 explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1217 assert(writer.lock_ != nullptr);
1218 writer.lock_ = nullptr;
1219 lock_->unlock_and_lock_upgrade();
1222 UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
1223 rhs.lock_ = nullptr;
1226 UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
1227 std::swap(lock_, rhs.lock_);
1231 UpgradeHolder(const UpgradeHolder& rhs) = delete;
1232 UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;
1240 lock_->unlock_upgrade();
1246 friend class WriteHolder;
1247 friend class ReadHolder;
1248 SharedMutexImpl* lock_;
1252 WriteHolder() : lock_(nullptr) {}
1255 explicit WriteHolder(SharedMutexImpl* lock) : lock_(lock) {
1261 explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
1265 // Promotion from upgrade mode
1266 explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
1267 assert(upgrade.lock_ != nullptr);
1268 upgrade.lock_ = nullptr;
1269 lock_->unlock_upgrade_and_lock();
1274 // It is intended that WriteHolder(ReadHolder&& rhs) does not exist.
1276 // Shared locks (read) cannot safely upgrade to unique locks (write).
1277 // That upgrade path is a well-known recipe for deadlock, so we explicitly
1278 // disallow it.
1280 // If you need to do a conditional mutation, you have a few options:
1281 // 1. Check the condition under a shared lock and release it.
1282 // Then maybe check the condition again under a unique lock and maybe do
1283 // the mutation.
1284 // 2. Check the condition once under an upgradeable lock.
1285 // Then maybe upgrade the lock to a unique lock and do the mutation (see the sketch below).
1286 // 3. Check the condition and maybe perform the mutation under a unique
1287 // lock.
1289 // Relevant upgradeable lock notes:
1290 // * At most one upgradeable lock can be held at a time for a given shared
1291 // mutex, just like a unique lock.
1292 // * An upgradeable lock may be held concurrently with any number of shared
1293 // locks.
1294 // * An upgradeable lock may be upgraded atomically to a unique lock.
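//
// A sketch of option 2 from the list above, using the holders defined in
// this class; the checkAndMutate framing and the condition are hypothetical:
//
//   void checkAndMutate(folly::SharedMutex& mutex) {
//     folly::SharedMutex::UpgradeHolder ulock(mutex); // readers may continue
//     if (/* condition that requires the mutation */ true) {
//       // atomically exchange the upgrade lock for the exclusive lock
//       folly::SharedMutex::WriteHolder wlock(std::move(ulock));
//       /* ... perform the mutation ... */
//     }
//   }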
1296 WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
1297 rhs.lock_ = nullptr;
1300 WriteHolder& operator=(WriteHolder&& rhs) noexcept {
1301 std::swap(lock_, rhs.lock_);
1305 WriteHolder(const WriteHolder& rhs) = delete;
1306 WriteHolder& operator=(const WriteHolder& rhs) = delete;
1320 friend class ReadHolder;
1321 friend class UpgradeHolder;
1322 SharedMutexImpl* lock_;
1325 // Adapters for Synchronized<>
1326 friend void acquireRead(SharedMutexImpl& lock) { lock.lock_shared(); }
1327 friend void acquireReadWrite(SharedMutexImpl& lock) { lock.lock(); }
1328 friend void releaseRead(SharedMutexImpl& lock) { lock.unlock_shared(); }
1329 friend void releaseReadWrite(SharedMutexImpl& lock) { lock.unlock(); }
1330 friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) {
1331 return lock.try_lock_shared_for(std::chrono::milliseconds(ms));
1333 friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) {
1334 return lock.try_lock_for(std::chrono::milliseconds(ms));
1338 typedef SharedMutexImpl<true> SharedMutexReadPriority;
1339 typedef SharedMutexImpl<false> SharedMutexWritePriority;
1340 typedef SharedMutexWritePriority SharedMutex;
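// Because SharedMutex provides the std::shared_timed_mutex interface, it
// should also work with the standard lock adapters (<mutex> and
// <shared_mutex> are required); a minimal sketch:
//
//   folly::SharedMutex mutex;
//   {
//     std::shared_lock<folly::SharedMutex> rlock(mutex); // lock_shared()
//     /* ... any number of concurrent readers ... */
//   }
//   {
//     std::unique_lock<folly::SharedMutex> wlock(mutex); // lock()
//     /* ... single exclusive writer ... */
//   }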
1342 // Prevent the compiler from instantiating these in other translation units.
1343 // They are instantiated once in SharedMutex.cpp
1344 extern template class SharedMutexImpl<true>;
1345 extern template class SharedMutexImpl<false>;
1348 bool ReaderPriority,
1350 template <typename> class Atom,
1351 bool BlockImmediately>
1352 alignas(hardware_destructive_interference_size)
1353 typename SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1355 SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1356 deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor] = {};
1359 bool ReaderPriority,
1361 template <typename> class Atom,
1362 bool BlockImmediately>
1363 FOLLY_SHAREDMUTEX_TLS uint32_t
1364 SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1365 tls_lastTokenlessSlot = 0;
1368 bool ReaderPriority,
1370 template <typename> class Atom,
1371 bool BlockImmediately>
1372 FOLLY_SHAREDMUTEX_TLS uint32_t
1373 SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1374 tls_lastDeferredReaderSlot = 0;
1377 bool ReaderPriority,
1379 template <typename> class Atom,
1380 bool BlockImmediately>
1381 bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1382 tryUnlockTokenlessSharedDeferred() {
1383 auto bestSlot = tls_lastTokenlessSlot;
1384 for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
1385 auto slotPtr = deferredReader(bestSlot ^ i);
1386 auto slotValue = slotPtr->load(std::memory_order_relaxed);
1387 if (slotValue == tokenlessSlotValue() &&
1388 slotPtr->compare_exchange_strong(slotValue, 0)) {
1389 tls_lastTokenlessSlot = bestSlot ^ i;
1397 bool ReaderPriority,
1399 template <typename> class Atom,
1400 bool BlockImmediately>
1401 template <class WaitContext>
1402 bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
1403 lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
1405 if (UNLIKELY((state & kHasE) != 0) &&
1406 !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
1410 uint32_t slot = tls_lastDeferredReaderSlot;
1411 uintptr_t slotValue = 1; // any non-zero value will do
1413 bool canAlreadyDefer = (state & kMayDefer) != 0;
1414 bool aboveDeferThreshold =
1415 (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
1416 bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
1417 if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
1418 /* Try using the most recent slot first. */
1419 slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
1420 if (slotValue != 0) {
1421 // starting point for our empty-slot search, can change after
1422 // calling waitForZeroBits
1423 uint32_t bestSlot =
1424 (uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders);
1426 // deferred readers are already enabled, or it is time to
1427 // enable them if we can find a slot
1428 for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
1429 slot = bestSlot ^ i;
1430 assert(slot < kMaxDeferredReaders);
1431 slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
1432 if (slotValue == 0) {
1434 tls_lastDeferredReaderSlot = slot;
1441 if (slotValue != 0) {
1442 // not yet deferred, or no empty slots
1443 if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
1444 // successfully recorded the read lock inline
1445 if (token != nullptr) {
1446 token->type_ = Token::Type::INLINE_SHARED;
1450 // state is updated, try again
1454 // record that deferred readers might be in use if necessary
1455 if ((state & kMayDefer) == 0) {
1456 if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
1457 // keep going if CAS failed because somebody else set the bit
1459 if ((state & (kHasE | kMayDefer)) != kMayDefer) {
1463 // state = state | kMayDefer;
1466 // try to use the slot
1467 bool gotSlot = deferredReader(slot)->compare_exchange_strong(
1469 token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());
1471 // If we got the slot, we need to verify that an exclusive lock
1472 // didn't happen since we last checked. If we didn't get the slot we
1473 // need to recheck state_ anyway to make sure we don't waste too much
1474 // work. It is also possible that since we checked state_ someone
1475 // has acquired and released the write lock, clearing kMayDefer.
1476 // Both cases are covered by looking for the readers-possible bit,
1477 // because it is off when the exclusive lock bit is set.
1478 state = state_.load(std::memory_order_acquire);
1484 if (token == nullptr) {
1485 tls_lastTokenlessSlot = slot;
1488 if ((state & kMayDefer) != 0) {
1489 assert((state & kHasE) == 0);
1491 if (token != nullptr) {
1492 token->type_ = Token::Type::DEFERRED_SHARED;
1493 token->slot_ = (uint16_t)slot;
1498 // release the slot before retrying
1499 if (token == nullptr) {
1500 // We can't rely on slot. Token-less slot values can be freed by
1501 // any unlock_shared(), so we need to do the full deferredReader
1502 // search during unlock. Unlike unlock_shared(), we can't trust
1503 // kPrevDefer here. This deferred lock isn't visible to lock()
1504 // (that's the whole reason we're undoing it) so there might have
1505 // subsequently been an unlock() and lock() with no intervening
1506 // transition to deferred mode.
1507 if (!tryUnlockTokenlessSharedDeferred()) {
1508 unlockSharedInline();
1511 if (!tryUnlockSharedDeferred(slot)) {
1512 unlockSharedInline();
1516 // We got here not because the lock was unavailable, but because
1517 // we lost a compare-and-swap. Try-lock is typically allowed to
1518 // have spurious failures, but there is no lock efficiency gain
1519 // from exploiting that freedom here.
1523 } // namespace folly