--- /dev/null
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// @author Nathan Bronson (ngbronson@fb.com)
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <chrono>
+#include <limits>
+#include <thread>
+#include <type_traits>
+#include <boost/noncopyable.hpp>
+#include <folly/Likely.h>
+#include <folly/detail/CacheLocality.h>
+#include <folly/detail/Futex.h>
+#include <sys/resource.h>
+
+// SharedMutex is a reader-writer lock. It is small, very fast, scalable
+// on multi-core, and suitable for use when readers or writers may block.
+// Unlike most other reader-writer locks, its throughput with concurrent
+// readers scales linearly; it is able to acquire and release the lock
+// in shared mode without cache line ping-ponging. It is suitable for
+// a wide range of lock hold times because it starts with spinning,
+// proceeds to using sched_yield with a preemption heuristic, and then
+// waits using futex and precise wakeups.
+//
+// SharedMutex provides all of the methods of folly::RWSpinLock,
+// boost::shared_mutex, boost::upgrade_mutex, and C++14's
+// std::shared_timed_mutex. All operations that can block are available
+// in try, try-for, and try-until (system_clock or steady_clock) versions.
+//
+// SharedMutexReadPriority gives priority to readers,
+// SharedMutexWritePriority gives priority to writers. SharedMutex is an
+// alias for SharedMutexWritePriority, because writer starvation is more
+// likely than reader starvation for the read-heavy workloads targeted
+// by SharedMutex.
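+//
+// A minimal usage sketch (nothing beyond the methods declared in this
+// header is assumed):
+//
+//   folly::SharedMutex mutex;
+//
+//   void writer() {
+//     mutex.lock();          // exclusive access
+//     // ... mutate the protected data ...
+//     mutex.unlock();
+//   }
+//
+//   void reader() {
+//     mutex.lock_shared();   // shared access, scales across cores
+//     // ... read the protected data ...
+//     mutex.unlock_shared();
+//   }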
+//
+// In my tests SharedMutex is as good or better than the other
+// reader-writer locks in use at Facebook for almost all use cases,
+// sometimes by a wide margin. (If it is rare that there are actually
+// concurrent readers then RWSpinLock can be a few nanoseconds faster.)
+// I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
+// boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
+// spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
+// (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
+// It is generally as good or better than the rest when evaluating size,
+// speed, scalability, or latency outliers. In the corner cases where
+// it is not the fastest (such as single-threaded use or heavy write
+// contention) it is never very much worse than the best. See the bottom
+// of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
+//
+// Comparison to folly::RWSpinLock:
+//
+// * SharedMutex is faster than RWSpinLock when there are actually
+// concurrent read accesses (sometimes much faster), and ~5 nanoseconds
+// slower when there is not actually any contention. SharedMutex is
+// faster in every (benchmarked) scenario where the shared mode of
+// the lock is actually useful.
+//
+// * Concurrent shared access to SharedMutex scales linearly, while total
+// RWSpinLock throughput drops as more threads try to access the lock
+// in shared mode. Under very heavy read contention SharedMutex can
+// be two orders of magnitude faster than RWSpinLock (or any reader
+// writer lock that doesn't use striping or deferral).
+//
+// * SharedMutex can safely protect blocking calls, because after an
+// initial period of spinning it waits using futex().
+//
+// * RWSpinLock prioritizes readers, SharedMutex has both reader- and
+// writer-priority variants, but defaults to write priority.
+//
+// * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
+// doesn't. Both semantics are reasonable. The boost documentation
+// doesn't explicitly talk about this behavior (except by omitting
+// any statement that those lock modes conflict), but the boost
+// implementations do allow new readers while the upgradeable mode
+// is held. See https://github.com/boostorg/thread/blob/master/
+// include/boost/thread/pthread/shared_mutex.hpp
+//
+// * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
+// (UpgradeableHolder would be even more pedantically correct).
+// SharedMutex's holders have fewer methods (no reset) and are less
+// tolerant (promotion and downgrade crash if the donor doesn't own
+// the lock, and you must use the default constructor rather than
+// passing a nullptr to the pointer constructor).
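+//
+//   A sketch of holder-based promotion and downgrade (the holder classes
+//   and their converting constructors are defined near the bottom of
+//   this header):
+//
+//     folly::SharedMutex mutex;
+//     folly::SharedMutex::UpgradeHolder ulock(mutex);          // upgrade mode
+//     folly::SharedMutex::WriteHolder wlock(std::move(ulock)); // promote
+//     folly::SharedMutex::ReadHolder rlock(std::move(wlock));  // downgrade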
+//
+// Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
+// and "shared" modes. At all times num_threads_holding_exclusive +
+// num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
+// 0 || num_threads_holding_shared == 0. RWSpinLock has the additional
+// constraint that num_threads_holding_shared cannot increase while
+// num_threads_holding_upgrade is non-zero.
+//
+// Comparison to the internal RWLock:
+//
+// * SharedMutex doesn't allow a maximum reader count to be configured,
+// so it can't be used as a semaphore in the same way as RWLock.
+//
+// * SharedMutex is 4 bytes, RWLock is 256.
+//
+// * SharedMutex is as fast or faster than RWLock in all of my
+// microbenchmarks, and has positive rather than negative scalability.
+//
+// * RWLock and SharedMutex are both writer priority locks.
+//
+// * SharedMutex avoids latency outliers as well as RWLock.
+//
+// * SharedMutex uses different names (t != 0 below):
+//
+// RWLock::lock(0) => SharedMutex::lock()
+//
+// RWLock::lock(t) => SharedMutex::try_lock_for(milliseconds(t))
+//
+// RWLock::tryLock() => SharedMutex::try_lock()
+//
+// RWLock::unlock() => SharedMutex::unlock()
+//
+// RWLock::enter(0) => SharedMutex::lock_shared()
+//
+// RWLock::enter(t) =>
+// SharedMutex::try_lock_shared_for(milliseconds(t))
+//
+// RWLock::tryEnter() => SharedMutex::try_lock_shared()
+//
+// RWLock::leave() => SharedMutex::unlock_shared()
+//
+// * RWLock allows the reader count to be adjusted by a value other
+// than 1 during enter() or leave(). SharedMutex doesn't currently
+// implement this feature.
+//
+// * RWLock's methods are marked const, SharedMutex's aren't.
+//
+// Reader-writer locks have the potential to allow concurrent access
+// to shared read-mostly data, but in practice they often provide no
+// improvement over a mutex. The problem is the cache coherence protocol
+// of modern CPUs. Coherence is provided by making sure that when a cache
+// line is written it is present in only one core's cache. Since a memory
+// write is required to acquire a reader-writer lock in shared mode, the
+// cache line holding the lock is invalidated in all of the other caches.
+// This leads to cache misses when another thread wants to acquire or
+// release the lock concurrently. When the RWLock is colocated with the
+// data it protects (common), cache misses can also continue to occur when
+// a thread that already holds the lock tries to read the protected data.
+//
+// Ideally, a reader-writer lock would allow multiple cores to acquire
+// and release the lock in shared mode without incurring any cache misses.
+// This requires that each core records its shared access in a cache line
+// that isn't read or written by other read-locking cores. (Writers will
+// have to check all of the cache lines.) Typical server hardware when
+// this comment was written has 16 L1 caches and cache lines of 64 bytes,
+// so a lock striped over all L1 caches would occupy a prohibitive 1024
+// bytes. Nothing says that we need a separate set of per-core memory
+// locations for each lock, however. Each SharedMutex instance is only
+// 4 bytes, but all locks together share a 2K area in which they make a
+// core-local record of lock acquisitions.
+//
+// SharedMutex's strategy of using a shared set of core-local stripes has
+// a potential downside, because it means that acquisition of any lock in
+// write mode can conflict with acquisition of any lock in shared mode.
+// If a lock instance doesn't actually experience concurrency then this
+// downside will outweigh the upside of improved scalability for readers.
+// To avoid this problem we dynamically detect concurrent accesses to
+// SharedMutex, and don't start using the deferred mode unless we actually
+// observe concurrency. See kNumSharedToStartDeferring.
+//
+// It is explicitly allowed to call unlock_shared() from a different
+// thread than lock_shared(), so long as they are properly paired.
+// unlock_shared() needs to find the location at which lock_shared()
+// recorded the lock, which might be in the lock itself or in any of
+// the shared slots. If you can conveniently pass state from lock
+// acquisition to release then the fastest mechanism is to std::move
+// the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
+// lock_shared(Token&) and unlock_shared(Token&)). The guard or token
+// will tell unlock_shared where in deferredReaders[] to look for the
+// deferred lock. The Token-less version of unlock_shared() works in all
+// cases, but is optimized for the common (no inter-thread handoff) case.
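+//
+// For example, a sketch of cross-thread release using a token (the token
+// records where the shared lock was noted, so the releasing thread
+// doesn't have to search):
+//
+//   folly::SharedMutex mutex;
+//   folly::SharedMutex::Token token;
+//
+//   // thread A
+//   mutex.lock_shared(token);
+//
+//   // thread B, after receiving the token from thread A
+//   mutex.unlock_shared(token);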
+//
+// In both read- and write-priority mode, a waiting lock() (exclusive mode)
+// only blocks readers after it has waited for an active upgrade lock to be
+// released; until the upgrade lock is released (or upgraded or downgraded)
+// readers will still be able to enter. Preferences about lock acquisition
+// are not guaranteed to be enforced perfectly (even if they were, there
+// is theoretically the chance that a thread could be arbitrarily suspended
+// between calling lock() and SharedMutex code actually getting executed).
+//
+// try_*_for methods always try at least once, even if the duration
+// is zero or negative. The duration type must be compatible with
+// std::chrono::steady_clock. try_*_until methods also always try at
+// least once. std::chrono::system_clock and std::chrono::steady_clock
+// are supported.
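+//
+// For instance (a sketch; the timeout values are arbitrary):
+//
+//   folly::SharedMutex mutex;
+//   if (mutex.try_lock_for(std::chrono::milliseconds(10))) {
+//     // ... exclusive access ...
+//     mutex.unlock();
+//   }
+//   if (mutex.try_lock_shared_until(
+//           std::chrono::steady_clock::now() + std::chrono::seconds(1))) {
+//     // ... shared access ...
+//     mutex.unlock_shared();
+//   }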
+//
+// If you have observed by profiling that your SharedMutex instances are
+// getting cache misses on deferredReaders[] due to another SharedMutex
+// user, then you can use the tag type plus the
+// COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE
+// macro to create your own instantiation of the type. The contention
+// threshold (see kNumSharedToStartDeferring) should make this unnecessary
+// in all but the most extreme cases. Make sure to check that the
+// increased icache and dcache footprint of the tagged result is worth it.
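+//
+// For example, a sketch of a tagged instantiation (MyTag is a
+// hypothetical tag type; the macro is defined at the bottom of this
+// header and should be expanded in a single .cpp file to provide the
+// static storage):
+//
+//   struct MyTag {};
+//   typedef folly::SharedMutexImpl<false, MyTag> MySharedMutex;
+//
+//   // in one .cpp file:
+//   COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(MySharedMutex);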
+
+namespace folly {
+
+struct SharedMutexToken {
+ enum class Type : uint16_t {
+ INVALID = 0,
+ INLINE_SHARED,
+ DEFERRED_SHARED,
+ };
+
+ Type type_;
+ uint16_t slot_;
+};
+
+template <bool ReaderPriority,
+ typename Tag_ = void,
+ template <typename> class Atom = std::atomic,
+ bool BlockImmediately = false>
+class SharedMutexImpl : boost::noncopyable {
+ public:
+ static constexpr bool kReaderPriority = ReaderPriority;
+ typedef Tag_ Tag;
+
+ typedef SharedMutexToken Token;
+
+ class ReadHolder;
+ class UpgradeHolder;
+ class WriteHolder;
+
+ SharedMutexImpl() : state_(0) {}
+
+ // It is an error to destroy a SharedMutex that still has
+ // any outstanding locks. This is checked if NDEBUG isn't defined.
+ // SharedMutex's exclusive mode can be safely used to guard the lock's
+ // own destruction. If, for example, you acquire the lock in exclusive
+ // mode and then observe that the object containing the lock is no longer
+ // needed, you can unlock() and then immediately destroy the lock.
+ // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
+ // description of why this property needs to be explicitly mentioned.
+ ~SharedMutexImpl() {
+#ifndef NDEBUG
+ auto state = state_.load(std::memory_order_acquire);
+
+ // if a futexWait fails to go to sleep because the value has been
+ // changed, we don't necessarily clean up the wait bits, so it is
+ // possible they will be set here in a correct system
+ assert((state & ~(kWaitingAny | kMayDefer)) == 0);
+ if ((state & kMayDefer) != 0) {
+ for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
+ auto slotValue = deferredReader(slot)->load(std::memory_order_acquire);
+ assert(!slotValueIsThis(slotValue));
+ }
+ }
+#endif
+ }
+
+ void lock() {
+ WaitForever ctx;
+ (void)lockExclusiveImpl(kHasSolo, ctx);
+ }
+
+ bool try_lock() {
+ WaitNever ctx;
+ return lockExclusiveImpl(kHasSolo, ctx);
+ }
+
+ template <class Rep, class Period>
+ bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
+ WaitForDuration<Rep, Period> ctx(duration);
+ return lockExclusiveImpl(kHasSolo, ctx);
+ }
+
+ template <class Clock, class Duration>
+ bool try_lock_until(
+ const std::chrono::time_point<Clock, Duration>& absDeadline) {
+ WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
+ return lockExclusiveImpl(kHasSolo, ctx);
+ }
+
+ void unlock() {
+ // It is possible that we have a left-over kWaitingNotS if the last
+ // unlock_shared() that let our matching lock() complete finished
+ // releasing before lock()'s futexWait went to sleep. Clean it up now
+ auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE));
+ assert((state & ~kWaitingAny) == 0);
+ wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
+ }
+
+ // Managing the token yourself makes unlock_shared a bit faster
+
+ void lock_shared() {
+ WaitForever ctx;
+ (void)lockSharedImpl(nullptr, ctx);
+ }
+
+ void lock_shared(Token& token) {
+ WaitForever ctx;
+ (void)lockSharedImpl(&token, ctx);
+ }
+
+ bool try_lock_shared() {
+ WaitNever ctx;
+ return lockSharedImpl(nullptr, ctx);
+ }
+
+ bool try_lock_shared(Token& token) {
+ WaitNever ctx;
+ return lockSharedImpl(&token, ctx);
+ }
+
+ template <class Rep, class Period>
+ bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) {
+ WaitForDuration<Rep, Period> ctx(duration);
+ return lockSharedImpl(nullptr, ctx);
+ }
+
+ template <class Rep, class Period>
+ bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration,
+ Token& token) {
+ WaitForDuration<Rep, Period> ctx(duration);
+ return lockSharedImpl(&token, ctx);
+ }
+
+ template <class Clock, class Duration>
+ bool try_lock_shared_until(
+ const std::chrono::time_point<Clock, Duration>& absDeadline) {
+ WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
+ return lockSharedImpl(nullptr, ctx);
+ }
+
+ template <class Clock, class Duration>
+ bool try_lock_shared_until(
+ const std::chrono::time_point<Clock, Duration>& absDeadline,
+ Token& token) {
+ WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
+ return lockSharedImpl(&token, ctx);
+ }
+
+ void unlock_shared() {
+ auto state = state_.load(std::memory_order_acquire);
+
+ // kPrevDefer can only be set if HasE or BegunE is set
+ assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer);
+
+ // lock() strips kMayDefer immediately, but then copies it to
+ // kPrevDefer so we can tell if the pre-lock() lock_shared() might
+ // have deferred
+ if ((state & (kMayDefer | kPrevDefer)) == 0 ||
+ !tryUnlockAnySharedDeferred()) {
+ // Matching lock_shared() couldn't have deferred, or the deferred
+ // lock has already been inlined by applyDeferredReaders()
+ unlockSharedInline();
+ }
+ }
+
+ void unlock_shared(Token& token) {
+ assert(token.type_ == Token::Type::INLINE_SHARED ||
+ token.type_ == Token::Type::DEFERRED_SHARED);
+
+ if (token.type_ != Token::Type::DEFERRED_SHARED ||
+ !tryUnlockSharedDeferred(token.slot_)) {
+ unlockSharedInline();
+ }
+#ifndef NDEBUG
+ token.type_ = Token::Type::INVALID;
+#endif
+ }
+
+ void unlock_and_lock_shared() {
+ // We can't use state_ -=, because we need to clear 2 bits (1 of which
+ // has an uncertain initial state) and set 1 other. We might as well
+ // clear the relevant wake bits at the same time. Note that since S
+ // doesn't block the beginning of a transition to E (writer priority
+ // can cut off new S, reader priority grabs BegunE and blocks deferred
+ // S) we need to wake E as well.
+ auto state = state_.load(std::memory_order_acquire);
+ do {
+ assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
+ } while (!state_.compare_exchange_strong(
+ state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS));
+ if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) {
+ futexWakeAll(kWaitingE | kWaitingU | kWaitingS);
+ }
+ }
+
+ void unlock_and_lock_shared(Token& token) {
+ unlock_and_lock_shared();
+ token.type_ = Token::Type::INLINE_SHARED;
+ }
+
+ void lock_upgrade() {
+ WaitForever ctx;
+ (void)lockUpgradeImpl(ctx);
+ }
+
+ bool try_lock_upgrade() {
+ WaitNever ctx;
+ return lockUpgradeImpl(ctx);
+ }
+
+ template <class Rep, class Period>
+ bool try_lock_upgrade_for(
+ const std::chrono::duration<Rep, Period>& duration) {
+ WaitForDuration<Rep, Period> ctx(duration);
+ return lockUpgradeImpl(ctx);
+ }
+
+ template <class Clock, class Duration>
+ bool try_lock_upgrade_until(
+ const std::chrono::time_point<Clock, Duration>& absDeadline) {
+ WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
+ return lockUpgradeImpl(ctx);
+ }
+
+ void unlock_upgrade() {
+ auto state = (state_ -= kHasU);
+ assert((state & (kWaitingNotS | kHasSolo)) == 0);
+ wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
+ }
+
+ void unlock_upgrade_and_lock() {
+ // no waiting necessary, so waitMask is empty
+ WaitForever ctx;
+ (void)lockExclusiveImpl(0, ctx);
+ }
+
+ void unlock_upgrade_and_lock_shared() {
+ auto state = (state_ -= kHasU - kIncrHasS);
+ assert((state & (kWaitingNotS | kHasSolo)) == 0 && (state & kHasS) != 0);
+ wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
+ }
+
+ void unlock_upgrade_and_lock_shared(Token& token) {
+ unlock_upgrade_and_lock_shared();
+ token.type_ = Token::Type::INLINE_SHARED;
+ }
+
+ void unlock_and_lock_upgrade() {
+ // We can't use state_ -=, because we need to clear 2 bits (1 of
+ // which has an uncertain initial state) and set 1 other. We might
+ // as well clear the relevant wake bits at the same time.
+ auto state = state_.load(std::memory_order_acquire);
+ while (true) {
+ assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
+ auto after =
+ (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU;
+ if (state_.compare_exchange_strong(state, after)) {
+ if ((state & kWaitingS) != 0) {
+ futexWakeAll(kWaitingS);
+ }
+ return;
+ }
+ }
+ }
+
+ private:
+ typedef typename folly::detail::Futex<Atom> Futex;
+
+ // Internally we use four kinds of wait contexts. These are structs
+ // that provide a doWait method that returns true if a futex wake
+ // was issued that intersects with the waitMask, false if there was a
+ // timeout and no more waiting should be performed. Spinning occurs
+ // before the wait context is invoked.
+
+ struct WaitForever {
+ bool canBlock() { return true; }
+ bool canTimeOut() { return false; }
+ bool shouldTimeOut() { return false; }
+
+ bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
+ futex.futexWait(expected, waitMask);
+ return true;
+ }
+ };
+
+ struct WaitNever {
+ bool canBlock() { return false; }
+ bool canTimeOut() { return true; }
+ bool shouldTimeOut() { return true; }
+
+ bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
+ return false;
+ }
+ };
+
+ template <class Rep, class Period>
+ struct WaitForDuration {
+ std::chrono::duration<Rep, Period> duration_;
+ bool deadlineComputed_;
+ std::chrono::steady_clock::time_point deadline_;
+
+ explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration)
+ : duration_(duration), deadlineComputed_(false) {}
+
+ std::chrono::steady_clock::time_point deadline() {
+ if (!deadlineComputed_) {
+ deadline_ = std::chrono::steady_clock::now() + duration_;
+ deadlineComputed_ = true;
+ }
+ return deadline_;
+ }
+
+ bool canBlock() { return duration_.count() > 0; }
+ bool canTimeOut() { return true; }
+
+ bool shouldTimeOut() {
+ return std::chrono::steady_clock::now() > deadline();
+ }
+
+ bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
+ auto result = futex.futexWaitUntil(expected, deadline(), waitMask);
+ return result != folly::detail::FutexResult::TIMEDOUT;
+ }
+ };
+
+ template <class Clock, class Duration>
+ struct WaitUntilDeadline {
+ std::chrono::time_point<Clock, Duration> absDeadline_;
+
+ bool canBlock() { return true; }
+ bool canTimeOut() { return true; }
+ bool shouldTimeOut() { return Clock::now() > absDeadline_; }
+
+ bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
+ auto result = futex.futexWaitUntil(expected, absDeadline_, waitMask);
+ return result != folly::detail::FutexResult::TIMEDOUT;
+ }
+ };
+
+ // 32 bits of state
+ Futex state_;
+
+ static constexpr uint32_t kIncrHasS = 1 << 10;
+ static constexpr uint32_t kHasS = ~(kIncrHasS - 1);
+
+ // If false, then there are definitely no deferred read locks for this
+ // instance. Cleared after initialization and when exclusively locked.
+ static constexpr uint32_t kMayDefer = 1 << 9;
+
+ // lock() clears kMayDefer as soon as it starts draining readers (so
+ // that it doesn't have to do a second CAS once drain completes), but
+ // unlock_shared() still needs to know whether to scan deferredReaders[]
+ // or not. We copy kMayDefer to kPrevDefer when setting kHasE or
+ // kBegunE, and clear it when clearing those bits.
+ static constexpr uint32_t kPrevDefer = 1 << 8;
+
+ // The exclusive bit blocks all new read locks and write locks. This bit
+ // may be set before all readers have finished, but in that case the
+ // thread that sets it won't return to the caller until all read locks
+ // have been released.
+ static constexpr uint32_t kHasE = 1 << 7;
+
+ // Exclusive-draining means that lock() is waiting for existing readers
+ // to leave, but that new readers may still acquire shared access.
+ // This is only used in reader priority mode. New readers during
+ // drain must be inline. The difference between this and kHasU is that
+ // kBegunE prevents kMayDefer from being set.
+ static constexpr uint32_t kBegunE = 1 << 6;
+
+ // At most one thread may have either exclusive or upgrade lock
+ // ownership. Unlike exclusive mode, ownership of the lock in upgrade
+ // mode doesn't preclude other threads holding the lock in shared mode.
+ // boost's concept for this doesn't explicitly say whether new shared
+ // locks can be acquired once lock_upgrade has succeeded, but doesn't
+ // list that as disallowed. RWSpinLock disallows new read locks after
+ // lock_upgrade has been acquired, but the boost implementation doesn't.
+ // We choose the latter.
+ static constexpr uint32_t kHasU = 1 << 5;
+
+ // There are three states that we consider to be "solo", in that they
+ // cannot coexist with other solo states. These are kHasE, kBegunE,
+ // and kHasU. Note that S doesn't conflict with any of these, because
+ // setting kHasE is only one of the two steps needed to actually
+ // acquire the lock in exclusive mode (the other is draining the existing
+ // S holders).
+ static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU;
+
+ // Once a thread sets kHasE it needs to wait for the current readers
+ // to exit the lock. We give this a separate wait identity from the
+ // waiting to set kHasE so that we can perform partial wakeups (wake
+ // one instead of wake all).
+ static constexpr uint32_t kWaitingNotS = 1 << 4;
+
+ // If there are multiple pending waiters, then waking them all can
+ // lead to a thundering herd on the lock. To avoid this, we keep
+ // a 2 bit saturating counter of the number of exclusive waiters
+ // (0, 1, 2, 3+), and if the value is >= 2 we perform futexWake(1)
+ // instead of futexWakeAll. See wakeRegisteredWaiters for more.
+ // It isn't actually useful to make the counter bigger, because
+ // whenever a futexWait fails with EAGAIN the counter becomes higher
+ // than the actual number of waiters, and hence effectively saturated.
+ // Bigger counters just lead to more changes in state_, which increase
+ // contention and failed futexWait-s.
+ static constexpr uint32_t kIncrWaitingE = 1 << 2;
+ static constexpr uint32_t kWaitingE = 0x3 * kIncrWaitingE;
+
+ // kWaitingU is essentially a 1 bit saturating counter. It always
+ // requires a wakeAll.
+ static constexpr uint32_t kWaitingU = 1 << 1;
+
+ // All blocked lock_shared() should be awoken, so it is correct (not
+ // suboptimal) to wakeAll if there are any shared readers.
+ static constexpr uint32_t kWaitingS = 1 << 0;
+
+ // kWaitingAny is a mask of all of the bits that record the state of
+ // threads, rather than the state of the lock. It is convenient to be
+ // able to mask them off during asserts.
+ static constexpr uint32_t kWaitingAny =
+ kWaitingNotS | kWaitingE | kWaitingU | kWaitingS;
+
+ // The reader count at which a reader will attempt to use the lock
+ // in deferred mode. If this value is 2, then the second concurrent
+ // reader will set kMayDefer and use deferredReaders[]. kMayDefer is
+ // cleared during exclusive access, so this threshold must be reached
+ // each time a lock is held in exclusive mode.
+ static constexpr uint32_t kNumSharedToStartDeferring = 2;
+
+ // The typical number of spins that a thread will wait for a state
+ // transition. There is no bound on the number of threads that can wait
+ // for a writer, so we are pretty conservative here to limit the chance
+ // that we are starving the writer of CPU. Each spin is 6 or 7 nanos,
+ // almost all of which is in the pause instruction.
+ static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2;
+
+ // The maximum number of soft yields before falling back to futex.
+ // If the preemption heuristic is activated we will fall back before
+ // this. A soft yield takes ~900 nanos (two sched_yield plus a call
+ // to getrusage, with checks of the goal at each step). Soft yields
+ // aren't compatible with deterministic execution under test (unlike
+ // futexWaitUntil, which has a capricious but deterministic back end).
+ static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0;
+
+ // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some
+ // level of the memory hierarchy is symmetrically divided into k pieces
+ // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes
+ // that are the same after integer division by k share that resource.
+ // Our strategy for deferred readers is to probe up to numSlots/4 slots,
+ // using the full granularity of AccessSpreader for the start slot
+ // and then search outward. We can use AccessSpreader::current(n)
+ // without managing our own spreader if kMaxDeferredReaders <=
+ // AccessSpreader::kMaxCpus, which is currently 128.
+ //
+ // Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
+ // with 64 byte cache lines. That means we need 64*16 bytes of
+ // deferredReaders[] to give each L1 its own playground. On x86_64
+ // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
+ // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If
+ // kDeferredSearchDistance * kDeferredSeparationFactor <=
+ // 64 / 8 then we will search only within a single cache line, which
+ // guarantees we won't have inter-L1 contention. We give ourselves
+ // a factor of 2 on the core count, which should hold us for a couple
+ // processor generations. deferredReaders[] is 2048 bytes currently.
+ static constexpr uint32_t kMaxDeferredReaders = 64;
+ static constexpr uint32_t kDeferredSearchDistance = 2;
+ static constexpr uint32_t kDeferredSeparationFactor = 4;
+
+ static_assert(!(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
+ "kMaxDeferredReaders must be a power of 2");
+ static_assert(!(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
+ "kDeferredSearchDistance must be a power of 2");
+
+ // The number of deferred locks that can be simultaneously acquired
+ // by a thread via the token-less methods without performing any heap
+ // allocations. Each of these costs 3 pointers (24 bytes, probably)
+ // per thread. There's not much point in making this larger than
+ // kDeferredSearchDistance.
+ static constexpr uint32_t kTokenStackTLSCapacity = 2;
+
+ // We need to make sure that if there is a lock_shared()
+ // and lock_shared(token) followed by unlock_shared() and
+ // unlock_shared(token), the token-less unlock doesn't null
+ // out deferredReaders[token.slot_]. If we allowed that, then
+ // unlock_shared(token) wouldn't be able to assume that its lock
+ // had been inlined by applyDeferredReaders when it finds that
+ // deferredReaders[token.slot_] no longer points to this. We accomplish
+ // this by stealing bit 0 from the pointer to record that the slot's
+ // element has no token, hence our use of uintptr_t in deferredReaders[].
+ static constexpr uintptr_t kTokenless = 0x1;
+
+ // This is the starting location for Token-less unlock_shared().
+ static FOLLY_TLS uint32_t tls_lastTokenlessSlot;
+
+ // Only indexes divisible by kDeferredSeparationFactor are used.
+ // If any of those elements points to a SharedMutexImpl, then it
+ // should be considered that there is a shared lock on that instance.
+ // See kTokenless.
+ typedef Atom<uintptr_t> DeferredReaderSlot;
+ static DeferredReaderSlot deferredReaders
+ [kMaxDeferredReaders *
+ kDeferredSeparationFactor] FOLLY_ALIGN_TO_AVOID_FALSE_SHARING;
+
+ // Performs an exclusive lock, waiting for state_ & waitMask to be
+ // zero first
+ template <class WaitContext>
+ bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) {
+ uint32_t state = state_.load(std::memory_order_acquire);
+ if (LIKELY(
+ (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 &&
+ state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) {
+ return true;
+ } else {
+ return lockExclusiveImpl(state, preconditionGoalMask, ctx);
+ }
+ }
+
+ template <class WaitContext>
+ bool lockExclusiveImpl(uint32_t& state,
+ uint32_t preconditionGoalMask,
+ WaitContext& ctx) {
+ while (true) {
+ if (UNLIKELY((state & preconditionGoalMask) != 0) &&
+ !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) &&
+ ctx.canTimeOut()) {
+ return false;
+ }
+
+ uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
+ if (!ReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
+ // Block readers immediately, either because we are in write
+ // priority mode or because we can acquire the lock in one
+ // step. Note that if state has kHasU, then we are doing an
+ // unlock_upgrade_and_lock() and we should clear it (reader
+ // priority branch also does this).
+ after |= (state | kHasE) & ~(kHasU | kMayDefer);
+ } else {
+ after |= (state | kBegunE) & ~(kHasU | kMayDefer);
+ }
+ if (state_.compare_exchange_strong(state, after)) {
+ auto before = state;
+ state = after;
+
+ // If we set kHasE (writer priority) then no new readers can
+ // arrive. If we set kBegunE then they can still enter, but
+ // they must be inline. Either way we need to either spin on
+ // deferredReaders[] slots, or inline them so that we can wait on
+ // kHasS to zero itself. deferredReaders[] is pointers, which on
+ // x86_64 are bigger than futex() can handle, so we inline the
+ // deferred locks instead of trying to futexWait on each slot.
+ // Readers are responsible for rechecking state_ after recording
+ // a deferred read to avoid atomicity problems between the state_
+ // CAS and applyDeferredReader's reads of deferredReaders[].
+ if (UNLIKELY((before & kMayDefer) != 0)) {
+ applyDeferredReaders(state, ctx);
+ }
+ while (true) {
+ assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0);
+ if (UNLIKELY((state & kHasS) != 0) &&
+ !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) &&
+ ctx.canTimeOut()) {
+ // Ugh. We blocked new readers and other writers for a while,
+ // but were unable to complete. Move on. On the plus side
+ // we can clear kWaitingNotS because nobody else can piggyback
+ // on it.
+ state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS));
+ wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
+ return false;
+ }
+
+ if (ReaderPriority && (state & kHasE) == 0) {
+ assert((state & kBegunE) != 0);
+ if (!state_.compare_exchange_strong(state,
+ (state & ~kBegunE) | kHasE)) {
+ continue;
+ }
+ }
+
+ return true;
+ }
+ }
+ }
+ }
+
+ template <class WaitContext>
+ bool waitForZeroBits(uint32_t& state,
+ uint32_t goal,
+ uint32_t waitMask,
+ WaitContext& ctx) {
+ uint32_t spinCount = 0;
+ while (true) {
+ state = state_.load(std::memory_order_acquire);
+ if ((state & goal) == 0) {
+ return true;
+ }
+#if FOLLY_X64
+ asm volatile("pause");
+#endif
+ ++spinCount;
+ if (UNLIKELY(spinCount >= kMaxSpinCount)) {
+ return ctx.canBlock() &&
+ yieldWaitForZeroBits(state, goal, waitMask, ctx);
+ }
+ }
+ }
+
+ template <class WaitContext>
+ bool yieldWaitForZeroBits(uint32_t& state,
+ uint32_t goal,
+ uint32_t waitMask,
+ WaitContext& ctx) {
+#ifdef RUSAGE_THREAD
+ struct rusage usage;
+ long before = -1;
+#endif
+ for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
+ ++yieldCount) {
+ for (int softState = 0; softState < 3; ++softState) {
+ if (softState < 2) {
+ std::this_thread::yield();
+ } else {
+#ifdef RUSAGE_THREAD
+ getrusage(RUSAGE_THREAD, &usage);
+#endif
+ }
+ if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) {
+ return true;
+ }
+ if (ctx.shouldTimeOut()) {
+ return false;
+ }
+ }
+#ifdef RUSAGE_THREAD
+ if (before >= 0 && usage.ru_nivcsw >= before + 2) {
+ // One involuntary csw might just be occasional background work,
+ // but if we get two in a row then we guess that there is someone
+ // else who can profitably use this CPU. Fall back to futex
+ break;
+ }
+ before = usage.ru_nivcsw;
+#endif
+ }
+ return futexWaitForZeroBits(state, goal, waitMask, ctx);
+ }
+
+ template <class WaitContext>
+ bool futexWaitForZeroBits(uint32_t& state,
+ uint32_t goal,
+ uint32_t waitMask,
+ WaitContext& ctx) {
+ assert(waitMask == kWaitingNotS || waitMask == kWaitingE ||
+ waitMask == kWaitingU || waitMask == kWaitingS);
+
+ while (true) {
+ state = state_.load(std::memory_order_acquire);
+ if ((state & goal) == 0) {
+ return true;
+ }
+
+ auto after = state;
+ if (waitMask == kWaitingE) {
+ if ((state & kWaitingE) != kWaitingE) {
+ after += kIncrWaitingE;
+ } // else counter is saturated
+ } else {
+ after |= waitMask;
+ }
+
+ // CAS is better than atomic |= here, because it lets us avoid
+ // setting the wait flag when the goal is concurrently achieved
+ if (after != state && !state_.compare_exchange_strong(state, after)) {
+ continue;
+ }
+
+ if (!ctx.doWait(state_, after, waitMask)) {
+ // timed out
+ return false;
+ }
+ }
+ }
+
+ // Wakes up waiters registered in state_ as appropriate, clearing the
+ // awaiting bits for anybody that was awoken. Tries to perform direct
+ // single wakeup of an exclusive waiter if appropriate
+ void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) {
+ if (UNLIKELY((state & wakeMask) != 0)) {
+ wakeRegisteredWaitersImpl(state, wakeMask);
+ }
+ }
+
+ void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) {
+ if ((wakeMask & kWaitingE) != 0) {
+ // If there are multiple lock() pending only one of them will
+ // actually get to wake up, so issuing futexWakeAll will make
+ // a thundering herd. There's nothing stopping us from issuing
+ // futexWake(1) instead, so long as the wait bits are still an
+ // accurate reflection of the waiters. If our pending lock() counter
+ // hasn't saturated we can decrement it. If it has saturated,
+ // then we can clear it by noticing that futexWake(1) returns 0
+ // (indicating no actual waiters) and then retrying via the normal
+ // clear+futexWakeAll path.
+ //
+ // It is possible that we wake an E waiter but an outside S grabs
+ // the lock instead, at which point we should wake pending U and
+ // S waiters. Rather than tracking state to make the failing E
+ // regenerate the wakeup, we just disable the optimization in the
+ // case that there are waiting U or S that we are eligible to wake.
+ //
+ // Note that in the contended scenario it is quite likely that the
+ // waiter's futexWait call will fail with EAGAIN (expected value
+ // mismatch), at which point the awaiting-exclusive count will be
+ // larger than the actual number of waiters. At this point the
+ // counter is effectively saturated. Since this is likely, it is
+ // actually less efficient to have a larger counter. 2 bits seems
+ // to be the best.
+ while ((state & kWaitingE) != 0 &&
+ (state & wakeMask & (kWaitingU | kWaitingS)) == 0) {
+ if ((state & kWaitingE) != kWaitingE) {
+ // not saturated
+ if (!state_.compare_exchange_strong(state, state - kIncrWaitingE)) {
+ continue;
+ }
+ state -= kIncrWaitingE;
+ }
+
+ if (state_.futexWake(1, kWaitingE) > 0) {
+ return;
+ }
+
+ // Despite the non-zero awaiting-exclusive count, there aren't
+ // actually any pending writers. Fall through to the logic below
+ // to wake up other classes of locks and to clear the saturated
+ // counter (if necessary).
+ break;
+ }
+ }
+
+ if ((state & wakeMask) != 0) {
+ auto prev = state_.fetch_and(~wakeMask);
+ if ((prev & wakeMask) != 0) {
+ futexWakeAll(wakeMask);
+ }
+ state = prev & ~wakeMask;
+ }
+ }
+
+ void futexWakeAll(uint32_t wakeMask) {
+ state_.futexWake(std::numeric_limits<int>::max(), wakeMask);
+ }
+
+ DeferredReaderSlot* deferredReader(uint32_t slot) {
+ return &deferredReaders[slot * kDeferredSeparationFactor];
+ }
+
+ uintptr_t tokenfulSlotValue() { return reinterpret_cast<uintptr_t>(this); }
+
+ uintptr_t tokenlessSlotValue() { return tokenfulSlotValue() | kTokenless; }
+
+ bool slotValueIsThis(uintptr_t slotValue) {
+ return (slotValue & ~kTokenless) == tokenfulSlotValue();
+ }
+
+ // Clears any deferredReaders[] that point to this, adjusting the inline
+ // shared lock count to compensate. Does some spinning and yielding
+ // to avoid the work. Always finishes the application, even if ctx
+ // times out.
+ template <class WaitContext>
+ void applyDeferredReaders(uint32_t& state, WaitContext& ctx) {
+ uint32_t slot = 0;
+
+ uint32_t spinCount = 0;
+ while (true) {
+ while (!slotValueIsThis(
+ deferredReader(slot)->load(std::memory_order_acquire))) {
+ if (++slot == kMaxDeferredReaders) {
+ return;
+ }
+ }
+#if FOLLY_X64
+ asm("pause");
+#endif
+ if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
+ applyDeferredReaders(state, ctx, slot);
+ return;
+ }
+ }
+ }
+
+ template <class WaitContext>
+ void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) {
+
+#ifdef RUSAGE_THREAD
+ struct rusage usage;
+ long before = -1;
+#endif
+ for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
+ ++yieldCount) {
+ for (int softState = 0; softState < 3; ++softState) {
+ if (softState < 2) {
+ std::this_thread::yield();
+ } else {
+#ifdef RUSAGE_THREAD
+ getrusage(RUSAGE_THREAD, &usage);
+#endif
+ }
+ while (!slotValueIsThis(
+ deferredReader(slot)->load(std::memory_order_acquire))) {
+ if (++slot == kMaxDeferredReaders) {
+ return;
+ }
+ }
+ if (ctx.shouldTimeOut()) {
+ // finish applying immediately on timeout
+ break;
+ }
+ }
+#ifdef RUSAGE_THREAD
+ if (before >= 0 && usage.ru_nivcsw >= before + 2) {
+ // heuristic says run queue is not empty
+ break;
+ }
+ before = usage.ru_nivcsw;
+#endif
+ }
+
+ uint32_t movedSlotCount = 0;
+ for (; slot < kMaxDeferredReaders; ++slot) {
+ auto slotPtr = deferredReader(slot);
+ auto slotValue = slotPtr->load(std::memory_order_acquire);
+ if (slotValueIsThis(slotValue) &&
+ slotPtr->compare_exchange_strong(slotValue, 0)) {
+ ++movedSlotCount;
+ }
+ }
+
+ if (movedSlotCount > 0) {
+ state = (state_ += movedSlotCount * kIncrHasS);
+ }
+ assert((state & (kHasE | kBegunE)) != 0);
+
+ // if state + kIncrHasS overflows (off the end of state) then either
+ // we have 2^(32-10) readers (almost certainly an application bug)
+ // or we had an underflow (also a bug)
+ assert(state < state + kIncrHasS);
+ }
+
+ // It is straightforward to make a token-less lock_shared() and
+ // unlock_shared() either by making the token-less version always use
+ // INLINE_SHARED mode or by removing the token version. Supporting
+ // deferred operation for both types is trickier than it appears, because
+ // the purpose of the token is so that unlock_shared doesn't have to
+ // look in other slots for its deferred lock. Token-less unlock_shared
+ // might place a deferred lock in one place and then release a different
+ // slot that was originally used by the token-ful version. If this was
+ // important we could solve the problem by differentiating the deferred
+ // locks so that cross-variety release wouldn't occur. The best way
+ // is probably to steal a bit from the pointer, making deferredLocks[]
+ // an array of Atom<uintptr_t>.
+
+ template <class WaitContext>
+ bool lockSharedImpl(Token* token, WaitContext& ctx) {
+ uint32_t state = state_.load(std::memory_order_relaxed);
+ if ((state & (kHasS | kMayDefer | kHasE)) == 0 &&
+ state_.compare_exchange_strong(state, state + kIncrHasS)) {
+ if (token != nullptr) {
+ token->type_ = Token::Type::INLINE_SHARED;
+ }
+ return true;
+ }
+ return lockSharedImpl(state, token, ctx);
+ }
+
+ template <class WaitContext>
+ bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
+ while (true) {
+ if (UNLIKELY((state & kHasE) != 0) &&
+ !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
+ return false;
+ }
+
+ uint32_t slot;
+ uintptr_t slotValue = 1; // any non-zero value will do
+
+ bool canAlreadyDefer = (state & kMayDefer) != 0;
+ bool aboveDeferThreshold =
+ (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
+ bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
+ if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
+ // starting point for our empty-slot search, can change after
+ // calling waitForZeroBits
+ uint32_t bestSlot =
+ (uint32_t)folly::detail::AccessSpreader<Atom>::current(
+ kMaxDeferredReaders);
+
+ // deferred readers are already enabled, or it is time to
+ // enable them if we can find a slot
+ for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
+ slot = bestSlot ^ i;
+ assert(slot < kMaxDeferredReaders);
+ slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
+ if (slotValue == 0) {
+ // found empty slot
+ break;
+ }
+ }
+ }
+
+ if (slotValue != 0) {
+ // not yet deferred, or no empty slots
+ if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
+ // successfully recorded the read lock inline
+ if (token != nullptr) {
+ token->type_ = Token::Type::INLINE_SHARED;
+ }
+ return true;
+ }
+ // state is updated, try again
+ continue;
+ }
+
+ // record that deferred readers might be in use if necessary
+ if ((state & kMayDefer) == 0) {
+ if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
+ // keep going if CAS failed because somebody else set the bit
+ // for us
+ if ((state & (kHasE | kMayDefer)) != kMayDefer) {
+ continue;
+ }
+ }
+ // state = state | kMayDefer;
+ }
+
+ // try to use the slot
+ bool gotSlot = deferredReader(slot)->compare_exchange_strong(
+ slotValue,
+ token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());
+
+ // If we got the slot, we need to verify that an exclusive lock
+ // didn't happen since we last checked. If we didn't get the slot we
+ // need to recheck state_ anyway to make sure we don't waste too much
+ // work. It is also possible that since we checked state_ someone
+ // has acquired and released the write lock, clearing kMayDefer.
+ // Both cases are covered by looking for the readers-possible bit,
+ // because it is off when the exclusive lock bit is set.
+ state = state_.load(std::memory_order_acquire);
+
+ if (!gotSlot) {
+ continue;
+ }
+
+ if (token == nullptr) {
+ tls_lastTokenlessSlot = slot;
+ }
+
+ if ((state & kMayDefer) != 0) {
+ assert((state & kHasE) == 0);
+ // success
+ if (token != nullptr) {
+ token->type_ = Token::Type::DEFERRED_SHARED;
+ token->slot_ = (uint16_t)slot;
+ }
+ return true;
+ }
+
+ // release the slot before retrying
+ if (token == nullptr) {
+ // We can't rely on slot. Token-less slot values can be freed by
+ // any unlock_shared(), so we need to do the full deferredReader
+ // search during unlock. Unlike unlock_shared(), we can't trust
+ // kPrevDefer here. This deferred lock isn't visible to lock()
+ // (that's the whole reason we're undoing it) so there might have
+ // subsequently been an unlock() and lock() with no intervening
+ // transition to deferred mode.
+ if (!tryUnlockAnySharedDeferred()) {
+ unlockSharedInline();
+ }
+ } else {
+ if (!tryUnlockSharedDeferred(slot)) {
+ unlockSharedInline();
+ }
+ }
+
+ // We got here not because the lock was unavailable, but because
+ // we lost a compare-and-swap. Try-lock is typically allowed to
+ // have spurious failures, but there is no lock efficiency gain
+ // from exploiting that freedom here.
+ }
+ }
+
+ bool tryUnlockAnySharedDeferred() {
+ auto bestSlot = tls_lastTokenlessSlot;
+ for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
+ auto slotPtr = deferredReader(bestSlot ^ i);
+ auto slotValue = slotPtr->load(std::memory_order_relaxed);
+ if (slotValue == tokenlessSlotValue() &&
+ slotPtr->compare_exchange_strong(slotValue, 0)) {
+ tls_lastTokenlessSlot = bestSlot ^ i;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool tryUnlockSharedDeferred(uint32_t slot) {
+ assert(slot < kMaxDeferredReaders);
+ auto slotValue = tokenfulSlotValue();
+ return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
+ }
+
+ uint32_t unlockSharedInline() {
+ uint32_t state = (state_ -= kIncrHasS);
+ assert((state & (kHasE | kBegunE)) != 0 || state < state + kIncrHasS);
+ if ((state & kHasS) == 0) {
+ // Only the second half of lock() can be blocked by a non-zero
+ // reader count, so that's the only thing we need to wake
+ wakeRegisteredWaiters(state, kWaitingNotS);
+ }
+ return state;
+ }
+
+ template <class WaitContext>
+ bool lockUpgradeImpl(WaitContext& ctx) {
+ uint32_t state;
+ do {
+ if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
+ return false;
+ }
+ } while (!state_.compare_exchange_strong(state, state | kHasU));
+ return true;
+ }
+
+ public:
+ class ReadHolder {
+ public:
+ ReadHolder() : lock_(nullptr) {}
+
+ explicit ReadHolder(const SharedMutexImpl* lock) : ReadHolder(*lock) {}
+
+ explicit ReadHolder(const SharedMutexImpl& lock)
+ : lock_(const_cast<SharedMutexImpl*>(&lock)) {
+ lock_->lock_shared(token_);
+ }
+
+ ReadHolder(ReadHolder&& rhs) noexcept : lock_(rhs.lock_),
+ token_(rhs.token_) {
+ rhs.lock_ = nullptr;
+ }
+
+ // Downgrade from upgrade mode
+ explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
+ assert(upgraded.lock_ != nullptr);
+ upgraded.lock_ = nullptr;
+ lock_->unlock_upgrade_and_lock_shared(token_);
+ }
+
+ // Downgrade from exclusive mode
+ explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
+ assert(writer.lock_ != nullptr);
+ writer.lock_ = nullptr;
+ lock_->unlock_and_lock_shared(token_);
+ }
+
+ ReadHolder& operator=(ReadHolder&& rhs) noexcept {
+ std::swap(lock_, rhs.lock_);
+ std::swap(token_, rhs.token_);
+ return *this;
+ }
+
+ ReadHolder(const ReadHolder& rhs) = delete;
+ ReadHolder& operator=(const ReadHolder& rhs) = delete;
+
+ ~ReadHolder() {
+ if (lock_) {
+ lock_->unlock_shared(token_);
+ }
+ }
+
+ private:
+ friend class UpgradeHolder;
+ friend class WriteHolder;
+ SharedMutexImpl* lock_;
+ SharedMutexToken token_;
+ };
+
+ class UpgradeHolder {
+ public:
+ UpgradeHolder() : lock_(nullptr) {}
+
+ explicit UpgradeHolder(SharedMutexImpl* lock) : UpgradeHolder(*lock) {}
+
+ explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
+ lock_->lock_upgrade();
+ }
+
+ // Downgrade from exclusive mode
+ explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
+ assert(writer.lock_ != nullptr);
+ writer.lock_ = nullptr;
+ lock_->unlock_and_lock_upgrade();
+ }
+
+ UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
+ rhs.lock_ = nullptr;
+ }
+
+ UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
+ std::swap(lock_, rhs.lock_);
+ return *this;
+ }
+
+ UpgradeHolder(const UpgradeHolder& rhs) = delete;
+ UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;
+
+ ~UpgradeHolder() {
+ if (lock_) {
+ lock_->unlock_upgrade();
+ }
+ }
+
+ private:
+ friend class WriteHolder;
+ friend class ReadHolder;
+ SharedMutexImpl* lock_;
+ };
+
+ class WriteHolder {
+ public:
+ WriteHolder() : lock_(nullptr) {}
+
+ explicit WriteHolder(SharedMutexImpl* lock) : WriteHolder(*lock) {}
+
+ explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
+ lock_->lock();
+ }
+
+ // Promotion from upgrade mode
+ explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
+ assert(upgrade.lock_ != nullptr);
+ upgrade.lock_ = nullptr;
+ lock_->unlock_upgrade_and_lock();
+ }
+
+ WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
+ rhs.lock_ = nullptr;
+ }
+
+ WriteHolder& operator=(WriteHolder&& rhs) noexcept {
+ std::swap(lock_, rhs.lock_);
+ return *this;
+ }
+
+ WriteHolder(const WriteHolder& rhs) = delete;
+ WriteHolder& operator=(const WriteHolder& rhs) = delete;
+
+ ~WriteHolder() {
+ if (lock_) {
+ lock_->unlock();
+ }
+ }
+
+ private:
+ friend class ReadHolder;
+ friend class UpgradeHolder;
+ SharedMutexImpl* lock_;
+ };
+
+ // Adapters for Synchronized<>
+ friend void acquireRead(SharedMutexImpl& lock) { lock.lock_shared(); }
+ friend void acquireReadWrite(SharedMutexImpl& lock) { lock.lock(); }
+ friend void releaseRead(SharedMutexImpl& lock) { lock.unlock_shared(); }
+ friend void releaseReadWrite(SharedMutexImpl& lock) { lock.unlock(); }
+};
+
+#define COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(type) \
+ template <> \
+ type::DeferredReaderSlot \
+ type::deferredReaders[type::kMaxDeferredReaders * \
+ type::kDeferredSeparationFactor] = {}; \
+ template <> \
+ FOLLY_TLS uint32_t type::tls_lastTokenlessSlot = 0;
+
+typedef SharedMutexImpl<true> SharedMutexReadPriority;
+typedef SharedMutexImpl<false> SharedMutexWritePriority;
+typedef SharedMutexWritePriority SharedMutex;
+
+} // namespace folly
--- /dev/null
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/experimental/SharedMutex.h>
+
+#include <stdlib.h>
+#include <thread>
+#include <vector>
+#include <boost/optional.hpp>
+#include <folly/Benchmark.h>
+#include <folly/MPMCQueue.h>
+#include <folly/Random.h>
+#include <folly/test/DeterministicSchedule.h>
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+#include <boost/thread/shared_mutex.hpp>
+#include <folly/RWSpinLock.h>
+
+using namespace folly;
+using namespace folly::test;
+using namespace std;
+using namespace chrono;
+
+typedef DeterministicSchedule DSched;
+typedef SharedMutexImpl<true, void, DeterministicAtomic, true>
+ DSharedMutexReadPriority;
+typedef SharedMutexImpl<false, void, DeterministicAtomic, true>
+ DSharedMutexWritePriority;
+
+COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(
+ DSharedMutexReadPriority);
+COMMON_CONCURRENCY_SHARED_MUTEX_DECLARE_STATIC_STORAGE(
+ DSharedMutexWritePriority);
+
+template <typename Lock>
+void runBasicTest() {
+ Lock lock;
+ SharedMutexToken token1;
+ SharedMutexToken token2;
+ SharedMutexToken token3;
+
+ EXPECT_TRUE(lock.try_lock());
+ EXPECT_FALSE(lock.try_lock());
+ EXPECT_FALSE(lock.try_lock_shared(token1));
+ lock.unlock();
+
+ EXPECT_TRUE(lock.try_lock_shared(token1));
+ EXPECT_FALSE(lock.try_lock());
+ EXPECT_TRUE(lock.try_lock_shared(token2));
+ lock.lock_shared(token3);
+ lock.unlock_shared(token3);
+ lock.unlock_shared(token2);
+ lock.unlock_shared(token1);
+
+ lock.lock();
+ lock.unlock();
+
+ lock.lock_shared(token1);
+ lock.lock_shared(token2);
+ lock.unlock_shared(token1);
+ lock.unlock_shared(token2);
+
+ lock.lock();
+ lock.unlock_and_lock_shared(token1);
+ lock.lock_shared(token2);
+ lock.unlock_shared(token2);
+ lock.unlock_shared(token1);
+}
+
+TEST(SharedMutex, basic) {
+ runBasicTest<SharedMutexReadPriority>();
+ runBasicTest<SharedMutexWritePriority>();
+}
+
+template <typename Lock>
+void runBasicHoldersTest() {
+ Lock lock;
+ SharedMutexToken token;
+
+ {
+ typename Lock::WriteHolder holder(lock);
+ EXPECT_FALSE(lock.try_lock());
+ EXPECT_FALSE(lock.try_lock_shared(token));
+
+ typename Lock::WriteHolder holder2(std::move(holder));
+ typename Lock::WriteHolder holder3;
+ holder3 = std::move(holder2);
+
+ typename Lock::UpgradeHolder holder4(std::move(holder3));
+ typename Lock::WriteHolder holder5(std::move(holder4));
+
+ typename Lock::ReadHolder holder6(std::move(holder5));
+
+ EXPECT_FALSE(lock.try_lock());
+ EXPECT_TRUE(lock.try_lock_shared(token));
+ lock.unlock_shared(token);
+ }
+
+ {
+ typename Lock::WriteHolder holder(lock);
+ EXPECT_FALSE(lock.try_lock());
+ }
+
+ {
+ typename Lock::ReadHolder holder(lock);
+ typename Lock::ReadHolder holder2(lock);
+ typename Lock::UpgradeHolder holder3(lock);
+ }
+
+ {
+ typename Lock::UpgradeHolder holder(lock);
+ typename Lock::ReadHolder holder2(lock);
+ typename Lock::ReadHolder holder3(std::move(holder));
+ }
+}
+
+TEST(SharedMutex, basic_holders) {
+ runBasicHoldersTest<SharedMutexReadPriority>();
+ runBasicHoldersTest<SharedMutexWritePriority>();
+}
+
+template <typename Lock>
+void runManyReadLocksTestWithTokens() {
+ Lock lock;
+
+ vector<SharedMutexToken> tokens;
+ for (int i = 0; i < 1000; ++i) {
+ tokens.emplace_back();
+ EXPECT_TRUE(lock.try_lock_shared(tokens.back()));
+ }
+ for (auto& token : tokens) {
+ lock.unlock_shared(token);
+ }
+ EXPECT_TRUE(lock.try_lock());
+ lock.unlock();
+}
+
+TEST(SharedMutex, many_read_locks_with_tokens) {
+ runManyReadLocksTestWithTokens<SharedMutexReadPriority>();
+ runManyReadLocksTestWithTokens<SharedMutexWritePriority>();
+}
+
+template <typename Lock>
+void runManyReadLocksTestWithoutTokens() {
+ Lock lock;
+
+ for (int i = 0; i < 1000; ++i) {
+ EXPECT_TRUE(lock.try_lock_shared());
+ }
+ for (int i = 0; i < 1000; ++i) {
+ lock.unlock_shared();
+ }
+ EXPECT_TRUE(lock.try_lock());
+ lock.unlock();
+}
+
+TEST(SharedMutex, many_read_locks_without_tokens) {
+ runManyReadLocksTestWithoutTokens<SharedMutexReadPriority>();
+ runManyReadLocksTestWithoutTokens<SharedMutexWritePriority>();
+}
+
+template <typename Lock>
+void runTimeoutInPastTest() {
+ Lock lock;
+
+ EXPECT_TRUE(lock.try_lock_for(milliseconds(0)));
+ lock.unlock();
+ EXPECT_TRUE(lock.try_lock_for(milliseconds(-1)));
+ lock.unlock();
+ EXPECT_TRUE(lock.try_lock_shared_for(milliseconds(0)));
+ lock.unlock_shared();
+ EXPECT_TRUE(lock.try_lock_shared_for(milliseconds(-1)));
+ lock.unlock_shared();
+ EXPECT_TRUE(lock.try_lock_until(system_clock::now() - milliseconds(1)));
+ lock.unlock();
+ EXPECT_TRUE(
+ lock.try_lock_shared_until(system_clock::now() - milliseconds(1)));
+ lock.unlock_shared();
+ EXPECT_TRUE(lock.try_lock_until(steady_clock::now() - milliseconds(1)));
+ lock.unlock();
+ EXPECT_TRUE(
+ lock.try_lock_shared_until(steady_clock::now() - milliseconds(1)));
+ lock.unlock_shared();
+}
+
+TEST(SharedMutex, timeout_in_past) {
+ runTimeoutInPastTest<SharedMutexReadPriority>();
+ runTimeoutInPastTest<SharedMutexWritePriority>();
+}
+
+template <class Func>
+bool funcHasDuration(milliseconds expectedDuration, Func func) {
+ // elapsed time should eventually fall within expectedDuration +- 25%
+ for (int tries = 0; tries < 100; ++tries) {
+ auto start = steady_clock::now();
+ func();
+ auto elapsed = steady_clock::now() - start;
+ if (elapsed > expectedDuration - expectedDuration / 4 &&
+ elapsed < expectedDuration + expectedDuration / 4) {
+ return true;
+ }
+ }
+ return false;
+}
+
+template <typename Lock>
+void runFailingTryTimeoutTest() {
+ Lock lock;
+ lock.lock();
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ EXPECT_FALSE(lock.try_lock_for(milliseconds(10)));
+ }));
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ typename Lock::Token token;
+ EXPECT_FALSE(lock.try_lock_shared_for(milliseconds(10), token));
+ }));
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ EXPECT_FALSE(lock.try_lock_upgrade_for(milliseconds(10)));
+ }));
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ EXPECT_FALSE(lock.try_lock_until(steady_clock::now() + milliseconds(10)));
+ }));
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ typename Lock::Token token;
+ EXPECT_FALSE(lock.try_lock_shared_until(
+ steady_clock::now() + milliseconds(10), token));
+ }));
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ EXPECT_FALSE(
+ lock.try_lock_upgrade_until(steady_clock::now() + milliseconds(10)));
+ }));
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ EXPECT_FALSE(lock.try_lock_until(system_clock::now() + milliseconds(10)));
+ }));
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ typename Lock::Token token;
+ EXPECT_FALSE(lock.try_lock_shared_until(
+ system_clock::now() + milliseconds(10), token));
+ }));
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ EXPECT_FALSE(
+ lock.try_lock_upgrade_until(system_clock::now() + milliseconds(10)));
+ }));
+ lock.unlock();
+
+ lock.lock_shared();
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ EXPECT_FALSE(lock.try_lock_for(milliseconds(10)));
+ }));
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ EXPECT_FALSE(lock.try_lock_until(steady_clock::now() + milliseconds(10)));
+ }));
+ EXPECT_TRUE(funcHasDuration(milliseconds(10), [&] {
+ EXPECT_FALSE(lock.try_lock_until(system_clock::now() + milliseconds(10)));
+ }));
+ lock.unlock_shared();
+
+ lock.lock();
+ for (int p = 0; p < 8; ++p) {
+ EXPECT_FALSE(lock.try_lock_for(nanoseconds(1 << p)));
+ }
+ lock.unlock();
+
+ for (int p = 0; p < 8; ++p) {
+ typename Lock::ReadHolder holder1(lock);
+ typename Lock::ReadHolder holder2(lock);
+ typename Lock::ReadHolder holder3(lock);
+ EXPECT_FALSE(lock.try_lock_for(nanoseconds(1 << p)));
+ }
+}
+
+TEST(SharedMutex, failing_try_timeout) {
+ runFailingTryTimeoutTest<SharedMutexReadPriority>();
+ runFailingTryTimeoutTest<SharedMutexWritePriority>();
+}
+
+template <typename Lock>
+void runBasicUpgradeTest() {
+ Lock lock;
+ typename Lock::Token token1;
+ typename Lock::Token token2;
+
+ lock.lock_upgrade();
+ EXPECT_FALSE(lock.try_lock());
+ EXPECT_TRUE(lock.try_lock_shared(token1));
+ lock.unlock_shared(token1);
+ lock.unlock_upgrade();
+
+ lock.lock_upgrade();
+ lock.unlock_upgrade_and_lock();
+ EXPECT_FALSE(lock.try_lock_shared(token1));
+ lock.unlock();
+
+ lock.lock_upgrade();
+ lock.unlock_upgrade_and_lock_shared(token1);
+ lock.lock_upgrade();
+ lock.unlock_upgrade_and_lock_shared(token2);
+ lock.unlock_shared(token1);
+ lock.unlock_shared(token2);
+
+ lock.lock();
+ lock.unlock_and_lock_upgrade();
+ EXPECT_TRUE(lock.try_lock_shared(token1));
+ lock.unlock_upgrade();
+ lock.unlock_shared(token1);
+}
+
+TEST(SharedMutex, basic_upgrade_tests) {
+ runBasicUpgradeTest<SharedMutexReadPriority>();
+ runBasicUpgradeTest<SharedMutexWritePriority>();
+}
+
+TEST(SharedMutex, read_has_prio) {
+ SharedMutexReadPriority lock;
+ SharedMutexToken token1;
+ SharedMutexToken token2;
+ lock.lock_shared(token1);
+ bool exclusiveAcquired = false;
+ auto writer = thread([&] {
+ lock.lock();
+ exclusiveAcquired = true;
+ lock.unlock();
+ });
+
+ // lock() can't complete until we unlock token1, but it should stake
+ // its claim with regard to other exclusive or upgrade locks. We can
+ // use try_lock_upgrade to poll for that eventuality.
+ while (lock.try_lock_upgrade()) {
+ lock.unlock_upgrade();
+ this_thread::yield();
+ }
+ EXPECT_FALSE(exclusiveAcquired);
+
+ // Even though lock() is stuck we should be able to get token2
+ EXPECT_TRUE(lock.try_lock_shared(token2));
+ lock.unlock_shared(token1);
+ lock.unlock_shared(token2);
+ writer.join();
+ EXPECT_TRUE(exclusiveAcquired);
+}
+
+TEST(SharedMutex, write_has_prio) {
+ SharedMutexWritePriority lock;
+ SharedMutexToken token1;
+ SharedMutexToken token2;
+ lock.lock_shared(token1);
+ auto writer = thread([&] {
+ lock.lock();
+ lock.unlock();
+ });
+
+ // eventually lock() should block readers
+ while (lock.try_lock_shared(token2)) {
+ lock.unlock_shared(token2);
+ this_thread::yield();
+ }
+
+ lock.unlock_shared(token1);
+ writer.join();
+}
+
+struct TokenLocker {
+ SharedMutexToken token;
+
+ template <typename T>
+ void lock(T* lock) {
+ lock->lock();
+ }
+
+ template <typename T>
+ void unlock(T* lock) {
+ lock->unlock();
+ }
+
+ template <typename T>
+ void lock_shared(T* lock) {
+ lock->lock_shared(token);
+ }
+
+ template <typename T>
+ void unlock_shared(T* lock) {
+ lock->unlock_shared(token);
+ }
+};
+
+struct Locker {
+ template <typename T>
+ void lock(T* lock) {
+ lock->lock();
+ }
+
+ template <typename T>
+ void unlock(T* lock) {
+ lock->unlock();
+ }
+
+ template <typename T>
+ void lock_shared(T* lock) {
+ lock->lock_shared();
+ }
+
+ template <typename T>
+ void unlock_shared(T* lock) {
+ lock->unlock_shared();
+ }
+};
+
+struct EnterLocker {
+ template <typename T>
+ void lock(T* lock) {
+ lock->lock(0);
+ }
+
+ template <typename T>
+ void unlock(T* lock) {
+ lock->unlock();
+ }
+
+ template <typename T>
+ void lock_shared(T* lock) {
+ lock->enter(0);
+ }
+
+ template <typename T>
+ void unlock_shared(T* lock) {
+ lock->leave();
+ }
+};
+
+struct PosixRWLock {
+ pthread_rwlock_t lock_;
+
+ PosixRWLock() { pthread_rwlock_init(&lock_, nullptr); }
+
+ ~PosixRWLock() { pthread_rwlock_destroy(&lock_); }
+
+ void lock() { pthread_rwlock_wrlock(&lock_); }
+
+ void unlock() { pthread_rwlock_unlock(&lock_); }
+
+ void lock_shared() { pthread_rwlock_rdlock(&lock_); }
+
+ void unlock_shared() { pthread_rwlock_unlock(&lock_); }
+};
+
+struct PosixMutex {
+ pthread_mutex_t lock_;
+
+ PosixMutex() { pthread_mutex_init(&lock_, nullptr); }
+
+ ~PosixMutex() { pthread_mutex_destroy(&lock_); }
+
+ void lock() { pthread_mutex_lock(&lock_); }
+
+ void unlock() { pthread_mutex_unlock(&lock_); }
+
+ void lock_shared() { pthread_mutex_lock(&lock_); }
+
+ void unlock_shared() { pthread_mutex_unlock(&lock_); }
+};
+
+template <template <typename> class Atom, typename Lock, typename Locker>
+static void runContendedReaders(size_t numOps,
+ size_t numThreads,
+ bool useSeparateLocks) {
+ char padding1[64];
+ Lock globalLock;
+ int valueProtectedByLock = 10;
+ char padding2[64];
+ Atom<bool> go(false);
+ Atom<bool>* goPtr = &go; // workaround for clang bug
+ vector<thread> threads(numThreads);
+
+ BENCHMARK_SUSPEND {
+ for (int t = 0; t < numThreads; ++t) {
+ threads[t] = DSched::thread([&, t, numThreads] {
+ Lock privateLock;
+ Lock* lock = useSeparateLocks ? &privateLock : &globalLock;
+ Locker locker;
+ while (!goPtr->load()) {
+ this_thread::yield();
+ }
+ for (size_t op = t; op < numOps; op += numThreads) {
+ locker.lock_shared(lock);
+ // note: folly::doNotOptimizeAway reads and writes its argument,
+ // so the following two lines are very different from a call
+ // to folly::doNotOptimizeAway(valueProtectedByLock);
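+ // (passing valueProtectedByLock itself would add a store back to the
+ // shared variable inside the critical section, while copying it first
+ // keeps the shared-mode critical section read-only)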
+ auto copy = valueProtectedByLock;
+ folly::doNotOptimizeAway(copy);
+ locker.unlock_shared(lock);
+ }
+ });
+ }
+ }
+
+ go.store(true);
+ for (auto& thr : threads) {
+ DSched::join(thr);
+ }
+}
+
+static void folly_rwspin_reads(size_t numOps,
+ size_t numThreads,
+ bool useSeparateLocks) {
+ runContendedReaders<atomic, RWSpinLock, Locker>(
+ numOps, numThreads, useSeparateLocks);
+}
+
+static void shmtx_wr_pri_reads(size_t numOps,
+ size_t numThreads,
+ bool useSeparateLocks) {
+ runContendedReaders<atomic, SharedMutexWritePriority, TokenLocker>(
+ numOps, numThreads, useSeparateLocks);
+}
+
+static void shmtx_w_bare_reads(size_t numOps,
+ size_t numThreads,
+ bool useSeparateLocks) {
+ runContendedReaders<atomic, SharedMutexWritePriority, Locker>(
+ numOps, numThreads, useSeparateLocks);
+}
+
+static void shmtx_rd_pri_reads(size_t numOps,
+ size_t numThreads,
+ bool useSeparateLocks) {
+ runContendedReaders<atomic, SharedMutexReadPriority, TokenLocker>(
+ numOps, numThreads, useSeparateLocks);
+}
+
+static void shmtx_r_bare_reads(size_t numOps,
+ size_t numThreads,
+ bool useSeparateLocks) {
+ runContendedReaders<atomic, SharedMutexReadPriority, Locker>(
+ numOps, numThreads, useSeparateLocks);
+}
+
+static void folly_ticket_reads(size_t numOps,
+ size_t numThreads,
+ bool useSeparateLocks) {
+ runContendedReaders<atomic, RWTicketSpinLock64, Locker>(
+ numOps, numThreads, useSeparateLocks);
+}
+
+static void boost_shared_reads(size_t numOps,
+ size_t numThreads,
+ bool useSeparateLocks) {
+ runContendedReaders<atomic, boost::shared_mutex, Locker>(
+ numOps, numThreads, useSeparateLocks);
+}
+
+static void pthrd_rwlock_reads(size_t numOps,
+ size_t numThreads,
+ bool useSeparateLocks) {
+ runContendedReaders<atomic, PosixRWLock, Locker>(
+ numOps, numThreads, useSeparateLocks);
+}
+
+template <template <typename> class Atom, typename Lock, typename Locker>
+static void runMixed(size_t numOps,
+ size_t numThreads,
+ double writeFraction,
+ bool useSeparateLocks) {
+ char padding1[64];
+ Lock globalLock;
+ int valueProtectedByLock = 0;
+ char padding2[64];
+ Atom<bool> go(false);
+ Atom<bool>* goPtr = &go; // workaround for clang bug
+ vector<thread> threads(numThreads);
+
+ BENCHMARK_SUSPEND {
+ for (int t = 0; t < numThreads; ++t) {
+ threads[t] = DSched::thread([&, t, numThreads] {
+ struct drand48_data buffer;
+ srand48_r(t, &buffer);
+ long writeThreshold = writeFraction * 0x7fffffff;
+ Lock privateLock;
+ Lock* lock = useSeparateLocks ? &privateLock : &globalLock;
+ Locker locker;
+ while (!goPtr->load()) {
+ this_thread::yield();
+ }
+ for (size_t op = t; op < numOps; op += numThreads) {
+ long randVal;
+ lrand48_r(&buffer, &randVal);
+ bool writeOp = randVal < writeThreshold;
+ SharedMutexToken token;
+ if (writeOp) {
+ locker.lock(lock);
+ if (!useSeparateLocks) {
+ ++valueProtectedByLock;
+ }
+ locker.unlock(lock);
+ } else {
+ locker.lock_shared(lock);
+ auto v = valueProtectedByLock;
+ folly::doNotOptimizeAway(v);
+ locker.unlock_shared(lock);
+ }
+ }
+ });
+ }
+ }
+
+ go.store(true);
+ for (auto& thr : threads) {
+ DSched::join(thr);
+ }
+}
+
+static void folly_rwspin(size_t numOps,
+ size_t numThreads,
+ double writeFraction,
+ bool useSeparateLocks) {
+ runMixed<atomic, RWSpinLock, Locker>(
+ numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void shmtx_wr_pri(size_t numOps,
+ size_t numThreads,
+ double writeFraction,
+ bool useSeparateLocks) {
+ runMixed<atomic, SharedMutexWritePriority, TokenLocker>(
+ numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void shmtx_w_bare(size_t numOps,
+ size_t numThreads,
+ double writeFraction,
+ bool useSeparateLocks) {
+ runMixed<atomic, SharedMutexWritePriority, Locker>(
+ numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void shmtx_rd_pri(size_t numOps,
+ size_t numThreads,
+ double writeFraction,
+ bool useSeparateLocks) {
+ runMixed<atomic, SharedMutexReadPriority, TokenLocker>(
+ numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void shmtx_r_bare(size_t numOps,
+ size_t numThreads,
+ double writeFraction,
+ bool useSeparateLocks) {
+ runMixed<atomic, SharedMutexReadPriority, Locker>(
+ numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void folly_ticket(size_t numOps,
+ size_t numThreads,
+ double writeFraction,
+ bool useSeparateLocks) {
+ runMixed<atomic, RWTicketSpinLock64, Locker>(
+ numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void boost_shared(size_t numOps,
+ size_t numThreads,
+ double writeFraction,
+ bool useSeparateLocks) {
+ runMixed<atomic, boost::shared_mutex, Locker>(
+ numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void pthrd_rwlock(size_t numOps,
+ size_t numThreads,
+ double writeFraction,
+ bool useSeparateLocks) {
+ runMixed<atomic, PosixRWLock, Locker>(
+ numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+static void pthrd_mutex_(size_t numOps,
+ size_t numThreads,
+ double writeFraction,
+ bool useSeparateLocks) {
+ runMixed<atomic, PosixMutex, Locker>(
+ numOps, numThreads, writeFraction, useSeparateLocks);
+}
+
+template <typename Lock, template <typename> class Atom>
+static void runAllAndValidate(size_t numOps, size_t numThreads) {
+ Lock globalLock;
+ Atom<int> globalExclusiveCount(0);
+ Atom<int> globalUpgradeCount(0);
+ Atom<int> globalSharedCount(0);
+
+ Atom<bool> go(false);
+
+ // clang crashes on access to Atom<> captured by ref in closure
+ Atom<int>* globalExclusiveCountPtr = &globalExclusiveCount;
+ Atom<int>* globalUpgradeCountPtr = &globalUpgradeCount;
+ Atom<int>* globalSharedCountPtr = &globalSharedCount;
+ Atom<bool>* goPtr = &go;
+
+ vector<thread> threads(numThreads);
+
+ BENCHMARK_SUSPEND {
+ for (int t = 0; t < numThreads; ++t) {
+ threads[t] = DSched::thread([&, t, numThreads] {
+ struct drand48_data buffer;
+ srand48_r(t, &buffer);
+
+ bool exclusive = false;
+ bool upgrade = false;
+ bool shared = false;
+ bool ourGlobalTokenUsed = false;
+ SharedMutexToken ourGlobalToken;
+
+ Lock privateLock;
+ vector<SharedMutexToken> privateTokens;
+
+ while (!goPtr->load()) {
+ this_thread::yield();
+ }
+ for (size_t op = t; op < numOps; op += numThreads) {
+ // randVal in [0,1000)
+ long randVal;
+ lrand48_r(&buffer, &randVal);
+ randVal = (long)((randVal * (uint64_t)1000) / 0x7fffffff);
+
+ // make as many assertions as possible about the global state
+ if (exclusive) {
+ EXPECT_EQ(1, globalExclusiveCountPtr->load(memory_order_acquire));
+ EXPECT_EQ(0, globalUpgradeCountPtr->load(memory_order_acquire));
+ EXPECT_EQ(0, globalSharedCountPtr->load(memory_order_acquire));
+ }
+ if (upgrade) {
+ EXPECT_EQ(0, globalExclusiveCountPtr->load(memory_order_acquire));
+ EXPECT_EQ(1, globalUpgradeCountPtr->load(memory_order_acquire));
+ }
+ if (shared) {
+ EXPECT_EQ(0, globalExclusiveCountPtr->load(memory_order_acquire));
+ EXPECT_TRUE(globalSharedCountPtr->load(memory_order_acquire) > 0);
+ } else {
+ EXPECT_FALSE(ourGlobalTokenUsed);
+ }
+
+ // independent 20% chance we do something to the private lock
+ if (randVal < 200) {
+ // it's okay to take multiple private shared locks because
+ // we never take an exclusive lock, so reader versus writer
+ // priority doesn't cause deadlocks
+ if (randVal < 100 && privateTokens.size() > 0) {
+ auto i = randVal % privateTokens.size();
+ privateLock.unlock_shared(privateTokens[i]);
+ privateTokens.erase(privateTokens.begin() + i);
+ } else {
+ SharedMutexToken token;
+ privateLock.lock_shared(token);
+ privateTokens.push_back(token);
+ }
+ continue;
+ }
+
+ // if we've got a lock, the only thing we can do is release it
+ // or transform it into a different kind of lock
+ if (exclusive) {
+ exclusive = false;
+ --*globalExclusiveCountPtr;
+ if (randVal < 500) {
+ globalLock.unlock();
+ } else if (randVal < 700) {
+ globalLock.unlock_and_lock_shared();
+ ++*globalSharedCountPtr;
+ shared = true;
+ } else if (randVal < 900) {
+ globalLock.unlock_and_lock_shared(ourGlobalToken);
+ ++*globalSharedCountPtr;
+ shared = true;
+ ourGlobalTokenUsed = true;
+ } else {
+ globalLock.unlock_and_lock_upgrade();
+ ++*globalUpgradeCountPtr;
+ upgrade = true;
+ }
+ } else if (upgrade) {
+ upgrade = false;
+ --*globalUpgradeCountPtr;
+ if (randVal < 500) {
+ globalLock.unlock_upgrade();
+ } else if (randVal < 700) {
+ globalLock.unlock_upgrade_and_lock_shared();
+ ++*globalSharedCountPtr;
+ shared = true;
+ } else if (randVal < 900) {
+ globalLock.unlock_upgrade_and_lock_shared(ourGlobalToken);
+ ++*globalSharedCountPtr;
+ shared = true;
+ ourGlobalTokenUsed = true;
+ } else {
+ globalLock.unlock_upgrade_and_lock();
+ ++*globalExclusiveCountPtr;
+ exclusive = true;
+ }
+ } else if (shared) {
+ shared = false;
+ --*globalSharedCountPtr;
+ if (ourGlobalTokenUsed) {
+ globalLock.unlock_shared(ourGlobalToken);
+ ourGlobalTokenUsed = false;
+ } else {
+ globalLock.unlock_shared();
+ }
+ } else if (randVal < 400) {
+ // Take a shared lock with a token. randVal is in [200,400) here, so
+ // of the acquisition paths below only the blocking lock_shared(token)
+ // branch is actually reachable.
+
+ // delta t goes from 1 milli to just under 3 millis for this range
+ auto dt = microseconds(10 * (randVal - 100));
+
+ if (randVal < 400) {
+ globalLock.lock_shared(ourGlobalToken);
+ shared = true;
+ } else if (randVal < 500) {
+ shared = globalLock.try_lock_shared(ourGlobalToken);
+ } else if (randVal < 600) {
+ shared = globalLock.try_lock_shared_for(dt, ourGlobalToken);
+ } else if (randVal < 800) {
+ shared = globalLock.try_lock_shared_until(
+ system_clock::now() + dt, ourGlobalToken);
+ }
+ if (shared) {
+ ourGlobalTokenUsed = true;
+ ++*globalSharedCountPtr;
+ }
+ } else if (randVal < 800) {
+ // 40% chance of shared lock without token
+ auto dt = microseconds(10 * (randVal - 100));
+ if (randVal < 400) {
+ globalLock.lock_shared();
+ shared = true;
+ } else if (randVal < 500) {
+ shared = globalLock.try_lock_shared();
+ } else if (randVal < 600) {
+ shared = globalLock.try_lock_shared_for(dt);
+ } else if (randVal < 800) {
+ shared = globalLock.try_lock_shared_until(
+ system_clock::now() + dt);
+ }
+ if (shared) {
+ ++*globalSharedCountPtr;
+ }
+ } else if (randVal < 900) {
+ // 10% chance of an upgrade lock
+ globalLock.lock_upgrade();
+ upgrade = true;
+ ++*globalUpgradeCountPtr;
+ } else {
+ // Take an exclusive lock (10% chance). randVal is in [900,1000) here,
+ // so of the acquisition paths below only the system_clock
+ // try_lock_until branch is actually reachable.
+
+ // delta t goes from -1 millis to 9 millis
+ auto dt = microseconds(100 * (randVal - 910));
+
+ if (randVal < 400) {
+ globalLock.lock();
+ exclusive = true;
+ } else if (randVal < 500) {
+ exclusive = globalLock.try_lock();
+ } else if (randVal < 600) {
+ exclusive = globalLock.try_lock_for(dt);
+ } else if (randVal < 700) {
+ exclusive = globalLock.try_lock_until(steady_clock::now() + dt);
+ } else {
+ exclusive = globalLock.try_lock_until(system_clock::now() + dt);
+ }
+ if (exclusive) {
+ ++*globalExclusiveCountPtr;
+ }
+ }
+ }
+
+ if (exclusive) {
+ --*globalExclusiveCountPtr;
+ globalLock.unlock();
+ }
+ if (upgrade) {
+ --*globalUpgradeCountPtr;
+ globalLock.unlock_upgrade();
+ }
+ if (shared) {
+ --*globalSharedCountPtr;
+ if (ourGlobalTokenUsed) {
+ globalLock.unlock_shared(ourGlobalToken);
+ ourGlobalTokenUsed = false;
+ } else {
+ globalLock.unlock_shared();
+ }
+ }
+ for (auto& token : privateTokens) {
+ privateLock.unlock_shared(token);
+ }
+ });
+ }
+ }
+
+ go.store(true);
+ for (auto& thr : threads) {
+ DSched::join(thr);
+ }
+}
+
+TEST(SharedMutex, deterministic_concurrent_readers_of_one_lock_read_prio) {
+ for (int pass = 0; pass < 3; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runContendedReaders<DeterministicAtomic,
+ DSharedMutexReadPriority,
+ Locker>(1000, 3, false);
+ }
+}
+
+TEST(SharedMutex, deterministic_concurrent_readers_of_one_lock_write_prio) {
+ for (int pass = 0; pass < 3; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runContendedReaders<DeterministicAtomic,
+ DSharedMutexWritePriority,
+ Locker>(1000, 3, false);
+ }
+}
+
+TEST(SharedMutex, concurrent_readers_of_one_lock_read_prio) {
+ for (int pass = 0; pass < 10; ++pass) {
+ runContendedReaders<atomic, SharedMutexReadPriority, Locker>(
+ 100000, 32, false);
+ }
+}
+
+TEST(SharedMutex, concurrent_readers_of_one_lock_write_prio) {
+ for (int pass = 0; pass < 10; ++pass) {
+ runContendedReaders<atomic, SharedMutexWritePriority, Locker>(
+ 100000, 32, false);
+ }
+}
+
+TEST(SharedMutex, deterministic_readers_of_concurrent_locks_read_prio) {
+ for (int pass = 0; pass < 3; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runContendedReaders<DeterministicAtomic,
+ DSharedMutexReadPriority,
+ Locker>(1000, 3, true);
+ }
+}
+
+TEST(SharedMutex, deterministic_readers_of_concurrent_locks_write_prio) {
+ for (int pass = 0; pass < 3; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runContendedReaders<DeterministicAtomic,
+ DSharedMutexWritePriority,
+ Locker>(1000, 3, true);
+ }
+}
+
+TEST(SharedMutex, readers_of_concurrent_locks_read_prio) {
+ for (int pass = 0; pass < 10; ++pass) {
+ runContendedReaders<atomic, SharedMutexReadPriority, TokenLocker>(
+ 100000, 32, true);
+ }
+}
+
+TEST(SharedMutex, readers_of_concurrent_locks_write_prio) {
+ for (int pass = 0; pass < 10; ++pass) {
+ runContendedReaders<atomic, SharedMutexWritePriority, TokenLocker>(
+ 100000, 32, true);
+ }
+}
+
+TEST(SharedMutex, deterministic_mixed_mostly_read_read_prio) {
+ for (int pass = 0; pass < 3; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runMixed<DeterministicAtomic, DSharedMutexReadPriority, Locker>(
+ 1000, 3, 0.1, false);
+ }
+}
+
+TEST(SharedMutex, deterministic_mixed_mostly_read_write_prio) {
+ for (int pass = 0; pass < 3; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runMixed<DeterministicAtomic, DSharedMutexWritePriority, Locker>(
+ 1000, 3, 0.1, false);
+ }
+}
+
+TEST(SharedMutex, mixed_mostly_read_read_prio) {
+ for (int pass = 0; pass < 5; ++pass) {
+ runMixed<atomic, SharedMutexReadPriority, TokenLocker>(
+ 50000, 32, 0.1, false);
+ }
+}
+
+TEST(SharedMutex, mixed_mostly_read_write_prio) {
+ for (int pass = 0; pass < 5; ++pass) {
+ runMixed<atomic, SharedMutexWritePriority, TokenLocker>(
+ 50000, 32, 0.1, false);
+ }
+}
+
+TEST(SharedMutex, deterministic_mixed_mostly_write_read_prio) {
+ for (int pass = 0; pass < 1; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runMixed<DeterministicAtomic, DSharedMutexReadPriority, TokenLocker>(
+ 1000, 10, 0.9, false);
+ }
+}
+
+TEST(SharedMutex, deterministic_mixed_mostly_write_write_prio) {
+ for (int pass = 0; pass < 1; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runMixed<DeterministicAtomic, DSharedMutexWritePriority, TokenLocker>(
+ 1000, 10, 0.9, false);
+ }
+}
+
+TEST(SharedMutex, mixed_mostly_write_read_prio) {
+ for (int pass = 0; pass < 5; ++pass) {
+ runMixed<atomic, SharedMutexReadPriority, TokenLocker>(
+ 50000, 300, 0.9, false);
+ }
+}
+
+TEST(SharedMutex, mixed_mostly_write_write_prio) {
+ for (int pass = 0; pass < 5; ++pass) {
+ runMixed<atomic, SharedMutexWritePriority, TokenLocker>(
+ 50000, 300, 0.9, false);
+ }
+}
+
+TEST(SharedMutex, deterministic_all_ops_read_prio) {
+ for (int pass = 0; pass < 5; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runAllAndValidate<DSharedMutexReadPriority, DeterministicAtomic>(1000, 8);
+ }
+}
+
+TEST(SharedMutex, deterministic_all_ops_write_prio) {
+ for (int pass = 0; pass < 5; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runAllAndValidate<DSharedMutexWritePriority, DeterministicAtomic>(1000, 8);
+ }
+}
+
+TEST(SharedMutex, all_ops_read_prio) {
+ for (int pass = 0; pass < 5; ++pass) {
+ runAllAndValidate<SharedMutexReadPriority, atomic>(100000, 32);
+ }
+}
+
+TEST(SharedMutex, all_ops_write_prio) {
+ for (int pass = 0; pass < 5; ++pass) {
+ runAllAndValidate<SharedMutexWritePriority, atomic>(100000, 32);
+ }
+}
+
+FOLLY_ASSUME_FBVECTOR_COMPATIBLE(
+ boost::optional<boost::optional<SharedMutexToken>>)
+
+// Setup is a set of threads that either grab a shared lock directly, or
+// take an exclusive lock and then downgrade it, or take an upgrade lock
+// and downgrade it (sometimes via an intermediate exclusive lock), and
+// then enqueue the shared lock to a second set of threads that just
+// performs the unlocks. Half of the shared locks use tokens, the others
+// don't.
+template <typename Lock, template <typename> class Atom>
+static void runRemoteUnlock(size_t numOps,
+ double preWriteFraction,
+ double preUpgradeFraction,
+ size_t numSendingThreads,
+ size_t numReceivingThreads) {
+ Lock globalLock;
+ MPMCQueue<boost::optional<boost::optional<SharedMutexToken>>, Atom>
+ queue(10);
+ auto queuePtr = &queue; // workaround for clang crash
+
+ Atom<bool> go(false);
+ auto goPtr = &go; // workaround for clang crash
+ Atom<int> pendingSenders(numSendingThreads);
+ auto pendingSendersPtr = &pendingSenders; // workaround for clang crash
+ vector<thread> threads(numSendingThreads + numReceivingThreads);
+
+ BENCHMARK_SUSPEND {
+ for (int t = 0; t < threads.size(); ++t) {
+ threads[t] = DSched::thread([&, t, numSendingThreads] {
+ if (t >= numSendingThreads) {
+ // we're a receiver
+ typename decltype(queue)::value_type elem;
+ while (true) {
+ queuePtr->blockingRead(elem);
+ if (!elem) {
+ // EOF, pass the EOF token
+ queuePtr->blockingWrite(std::move(elem));
+ break;
+ }
+ if (*elem) {
+ globalLock.unlock_shared(**elem);
+ } else {
+ globalLock.unlock_shared();
+ }
+ }
+ return;
+ }
+ // else we're a sender
+
+ struct drand48_data buffer;
+ srand48_r(t, &buffer);
+
+ while (!goPtr->load()) {
+ this_thread::yield();
+ }
+ for (size_t op = t; op < numOps; op += numSendingThreads) {
+ long unscaledRandVal;
+ lrand48_r(&buffer, &unscaledRandVal);
+
+ // randVal in [0,1]
+ double randVal = ((double)unscaledRandVal) / 0x7fffffff;
+
+ // extract a bit and rescale
+ bool useToken = randVal >= 0.5;
+ randVal = (randVal - (useToken ? 0.5 : 0.0)) * 2;
+
+ boost::optional<SharedMutexToken> maybeToken;
+
+ if (useToken) {
+ SharedMutexToken token;
+ if (randVal < preWriteFraction) {
+ globalLock.lock();
+ globalLock.unlock_and_lock_shared(token);
+ } else if (randVal < preWriteFraction + preUpgradeFraction / 2) {
+ globalLock.lock_upgrade();
+ globalLock.unlock_upgrade_and_lock_shared(token);
+ } else if (randVal < preWriteFraction + preUpgradeFraction) {
+ globalLock.lock_upgrade();
+ globalLock.unlock_upgrade_and_lock();
+ globalLock.unlock_and_lock_shared(token);
+ } else {
+ globalLock.lock_shared(token);
+ }
+ maybeToken = token;
+ } else {
+ if (randVal < preWriteFraction) {
+ globalLock.lock();
+ globalLock.unlock_and_lock_shared();
+ } else if (randVal < preWriteFraction + preUpgradeFraction / 2) {
+ globalLock.lock_upgrade();
+ globalLock.unlock_upgrade_and_lock_shared();
+ } else if (randVal < preWriteFraction + preUpgradeFraction) {
+ globalLock.lock_upgrade();
+ globalLock.unlock_upgrade_and_lock();
+ globalLock.unlock_and_lock_shared();
+ } else {
+ globalLock.lock_shared();
+ }
+ }
+
+ // blockingWrite is emplace-like, so this automatically adds
+ // another level of wrapping
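+ // (the queue's element type is optional<optional<Token>>: a sender's
+ // maybeToken becomes an engaged outer optional, and the boost::none
+ // written once the last sender finishes is the empty outer optional
+ // that receivers treat as EOF)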
+ queuePtr->blockingWrite(maybeToken);
+ }
+ if (--*pendingSendersPtr == 0) {
+ queuePtr->blockingWrite(boost::none);
+ }
+ });
+ }
+ }
+
+ go.store(true);
+ for (auto& thr : threads) {
+ DSched::join(thr);
+ }
+}
+
+TEST(SharedMutex, deterministic_remote_write_prio) {
+ for (int pass = 0; pass < 1; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runRemoteUnlock<DSharedMutexWritePriority, DeterministicAtomic>(
+ 500, 0.1, 0.1, 5, 5);
+ }
+}
+
+TEST(SharedMutex, deterministic_remote_read_prio) {
+ for (int pass = 0; pass < 1; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ DSched sched(DSched::uniform(pass));
+ runRemoteUnlock<DSharedMutexReadPriority, DeterministicAtomic>(
+ 500, 0.1, 0.1, 5, 5);
+ }
+}
+
+TEST(SharedMutex, remote_write_prio) {
+ for (int pass = 0; pass < 1; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ runRemoteUnlock<SharedMutexWritePriority, atomic>(100000, 0.1, 0.1, 5, 5);
+ }
+}
+
+TEST(SharedMutex, remote_read_prio) {
+ for (int pass = 0; pass < 1; ++pass) {
+ // LOG(INFO) << "pass " << pass;
+ runRemoteUnlock<SharedMutexReadPriority, atomic>(100000, 0.1, 0.1, 5, 5);
+ }
+}
+
+static void burn(size_t n) {
+ for (size_t i = 0; i < n; ++i) {
+ folly::doNotOptimizeAway(i);
+ }
+}
+
+// Two threads and three locks, arranged so that the threads have to
+// proceed in turn, with a reader/writer conflict at every hand-off
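+// (In outline: locks[0] and locks[1] start out held exclusively and
+// locks[2] starts out held in shared mode. Each round the writer thread
+// releases lock i%3 and then blocks acquiring lock (i+2)%3, while the
+// reader thread blocks acquiring lock i%3 in shared mode and then
+// releases lock (i+2)%3, so the threads chase each other around the
+// three locks and every round costs one writer wakeup and one reader
+// wakeup.)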
+template <typename Lock, template <typename> class Atom = atomic>
+static void runPingPong(size_t numRounds, size_t burnCount) {
+ char padding1[56];
+ pair<Lock, char[56]> locks[3];
+ char padding2[56];
+
+ Atom<int> avail(0);
+ auto availPtr = &avail; // workaround for clang crash
+ Atom<bool> go(false);
+ auto goPtr = &go; // workaround for clang crash
+ vector<thread> threads(2);
+
+ locks[0].first.lock();
+ locks[1].first.lock();
+ locks[2].first.lock_shared();
+
+ BENCHMARK_SUSPEND {
+ threads[0] = DSched::thread([&] {
+ ++*availPtr;
+ while (!goPtr->load()) {
+ this_thread::yield();
+ }
+ for (int i = 0; i < numRounds; ++i) {
+ locks[i % 3].first.unlock();
+ locks[(i + 2) % 3].first.lock();
+ burn(burnCount);
+ }
+ });
+ threads[1] = DSched::thread([&] {
+ ++*availPtr;
+ while (!goPtr->load()) {
+ this_thread::yield();
+ }
+ for (int i = 0; i < numRounds; ++i) {
+ locks[i % 3].first.lock_shared();
+ burn(burnCount);
+ locks[(i + 2) % 3].first.unlock_shared();
+ }
+ });
+
+ while (avail.load() < 2) {
+ this_thread::yield();
+ }
+ }
+
+ go.store(true);
+ for (auto& thr : threads) {
+ DSched::join(thr);
+ }
+ locks[numRounds % 3].first.unlock();
+ locks[(numRounds + 1) % 3].first.unlock();
+ locks[(numRounds + 2) % 3].first.unlock_shared();
+}
+
+static void folly_rwspin_ping_pong(size_t n, size_t scale, size_t burnCount) {
+ runPingPong<RWSpinLock>(n / scale, burnCount);
+}
+
+static void shmtx_w_bare_ping_pong(size_t n, size_t scale, size_t burnCount) {
+ runPingPong<SharedMutexWritePriority>(n / scale, burnCount);
+}
+
+static void shmtx_r_bare_ping_pong(size_t n, size_t scale, size_t burnCount) {
+ runPingPong<SharedMutexReadPriority>(n / scale, burnCount);
+}
+
+static void folly_ticket_ping_pong(size_t n, size_t scale, size_t burnCount) {
+ runPingPong<RWTicketSpinLock64>(n / scale, burnCount);
+}
+
+static void boost_shared_ping_pong(size_t n, size_t scale, size_t burnCount) {
+ runPingPong<boost::shared_mutex>(n / scale, burnCount);
+}
+
+static void pthrd_rwlock_ping_pong(size_t n, size_t scale, size_t burnCount) {
+ runPingPong<PosixRWLock>(n / scale, burnCount);
+}
+
+TEST(SharedMutex, deterministic_ping_pong_write_prio) {
+ for (int pass = 0; pass < 1; ++pass) {
+ DSched sched(DSched::uniform(pass));
+ runPingPong<DSharedMutexWritePriority, DeterministicAtomic>(500, 0);
+ }
+}
+
+TEST(SharedMutex, deterministic_ping_pong_read_prio) {
+ for (int pass = 0; pass < 1; ++pass) {
+ DSched sched(DSched::uniform(pass));
+ runPingPong<DSharedMutexReadPriority, DeterministicAtomic>(500, 0);
+ }
+}
+
+TEST(SharedMutex, ping_pong_write_prio) {
+ for (int pass = 0; pass < 1; ++pass) {
+ runPingPong<SharedMutexWritePriority, atomic>(50000, 0);
+ }
+}
+
+TEST(SharedMutex, ping_pong_read_prio) {
+ for (int pass = 0; pass < 1; ++pass) {
+ runPingPong<SharedMutexReadPriority, atomic>(50000, 0);
+ }
+}
+
+// This is here so you can tell how much of the runtime reported by the
+// more complex harnesses is due to the harness itself, although due to
+// the magic of compiler optimization it may also end up being slower
+BENCHMARK(single_thread_lock_shared_unlock_shared, iters) {
+ SharedMutex lock;
+ for (size_t n = 0; n < iters; ++n) {
+ SharedMutex::Token token;
+ lock.lock_shared(token);
+ folly::doNotOptimizeAway(0);
+ lock.unlock_shared(token);
+ }
+}
+
+BENCHMARK(single_thread_lock_unlock, iters) {
+ SharedMutex lock;
+ for (size_t n = 0; n < iters; ++n) {
+ lock.lock();
+ folly::doNotOptimizeAway(0);
+ lock.unlock();
+ }
+}
+
+#define BENCH_BASE(args...) BENCHMARK_NAMED_PARAM(args)
+#define BENCH_REL(args...) BENCHMARK_RELATIVE_NAMED_PARAM(args)
+
+// 100% reads. Best-case scenario for deferred locks. Lock is colocated
+// with read data, so inline lock takes cache miss every time but deferred
+// lock has only cache hits and local access.
+BENCHMARK_DRAW_LINE()
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 1thread, 1, false)
+BENCH_REL (shmtx_wr_pri_reads, 1thread, 1, false)
+BENCH_REL (shmtx_w_bare_reads, 1thread, 1, false)
+BENCH_REL (shmtx_rd_pri_reads, 1thread, 1, false)
+BENCH_REL (shmtx_r_bare_reads, 1thread, 1, false)
+BENCH_REL (folly_ticket_reads, 1thread, 1, false)
+BENCH_REL (boost_shared_reads, 1thread, 1, false)
+BENCH_REL (pthrd_rwlock_reads, 1thread, 1, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 2thread, 2, false)
+BENCH_REL (shmtx_wr_pri_reads, 2thread, 2, false)
+BENCH_REL (shmtx_w_bare_reads, 2thread, 2, false)
+BENCH_REL (shmtx_rd_pri_reads, 2thread, 2, false)
+BENCH_REL (shmtx_r_bare_reads, 2thread, 2, false)
+BENCH_REL (folly_ticket_reads, 2thread, 2, false)
+BENCH_REL (boost_shared_reads, 2thread, 2, false)
+BENCH_REL (pthrd_rwlock_reads, 2thread, 2, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 4thread, 4, false)
+BENCH_REL (shmtx_wr_pri_reads, 4thread, 4, false)
+BENCH_REL (shmtx_w_bare_reads, 4thread, 4, false)
+BENCH_REL (shmtx_rd_pri_reads, 4thread, 4, false)
+BENCH_REL (shmtx_r_bare_reads, 4thread, 4, false)
+BENCH_REL (folly_ticket_reads, 4thread, 4, false)
+BENCH_REL (boost_shared_reads, 4thread, 4, false)
+BENCH_REL (pthrd_rwlock_reads, 4thread, 4, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 8thread, 8, false)
+BENCH_REL (shmtx_wr_pri_reads, 8thread, 8, false)
+BENCH_REL (shmtx_w_bare_reads, 8thread, 8, false)
+BENCH_REL (shmtx_rd_pri_reads, 8thread, 8, false)
+BENCH_REL (shmtx_r_bare_reads, 8thread, 8, false)
+BENCH_REL (folly_ticket_reads, 8thread, 8, false)
+BENCH_REL (boost_shared_reads, 8thread, 8, false)
+BENCH_REL (pthrd_rwlock_reads, 8thread, 8, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 16thread, 16, false)
+BENCH_REL (shmtx_wr_pri_reads, 16thread, 16, false)
+BENCH_REL (shmtx_w_bare_reads, 16thread, 16, false)
+BENCH_REL (shmtx_rd_pri_reads, 16thread, 16, false)
+BENCH_REL (shmtx_r_bare_reads, 16thread, 16, false)
+BENCH_REL (folly_ticket_reads, 16thread, 16, false)
+BENCH_REL (boost_shared_reads, 16thread, 16, false)
+BENCH_REL (pthrd_rwlock_reads, 16thread, 16, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 32thread, 32, false)
+BENCH_REL (shmtx_wr_pri_reads, 32thread, 32, false)
+BENCH_REL (shmtx_w_bare_reads, 32thread, 32, false)
+BENCH_REL (shmtx_rd_pri_reads, 32thread, 32, false)
+BENCH_REL (shmtx_r_bare_reads, 32thread, 32, false)
+BENCH_REL (folly_ticket_reads, 32thread, 32, false)
+BENCH_REL (boost_shared_reads, 32thread, 32, false)
+BENCH_REL (pthrd_rwlock_reads, 32thread, 32, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_reads, 64thread, 64, false)
+BENCH_REL (shmtx_wr_pri_reads, 64thread, 64, false)
+BENCH_REL (shmtx_w_bare_reads, 64thread, 64, false)
+BENCH_REL (shmtx_rd_pri_reads, 64thread, 64, false)
+BENCH_REL (shmtx_r_bare_reads, 64thread, 64, false)
+BENCH_REL (folly_ticket_reads, 64thread, 64, false)
+BENCH_REL (boost_shared_reads, 64thread, 64, false)
+BENCH_REL (pthrd_rwlock_reads, 64thread, 64, false)
+
+// 1 lock used by everybody, 100% writes. Adding threads only hurts, but
+// it is good not to fail catastrophically. Compare to
+// single_thread_lock_unlock to see the overhead of the generic driver
+// (and its pseudo-random number generator). pthrd_mutex_ is a
+// pthread_mutex_t (default, not adaptive), which is better than any of
+// the reader-writer locks for this scenario.
+BENCHMARK_DRAW_LINE()
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (folly_ticket, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (boost_shared, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (pthrd_rwlock, 1thread_all_write, 1, 1.0, false)
+BENCH_REL (pthrd_mutex_, 1thread_all_write, 1, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (folly_ticket, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (boost_shared, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (pthrd_rwlock, 2thread_all_write, 2, 1.0, false)
+BENCH_REL (pthrd_mutex_, 2thread_all_write, 2, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (folly_ticket, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (boost_shared, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (pthrd_rwlock, 4thread_all_write, 4, 1.0, false)
+BENCH_REL (pthrd_mutex_, 4thread_all_write, 4, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (folly_ticket, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (boost_shared, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (pthrd_rwlock, 8thread_all_write, 8, 1.0, false)
+BENCH_REL (pthrd_mutex_, 8thread_all_write, 8, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (folly_ticket, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (boost_shared, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (pthrd_rwlock, 16thread_all_write, 16, 1.0, false)
+BENCH_REL (pthrd_mutex_, 16thread_all_write, 16, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (folly_ticket, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (boost_shared, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (pthrd_rwlock, 32thread_all_write, 32, 1.0, false)
+BENCH_REL (pthrd_mutex_, 32thread_all_write, 32, 1.0, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (shmtx_wr_pri, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (shmtx_rd_pri, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (folly_ticket, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (boost_shared, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (pthrd_rwlock, 64thread_all_write, 64, 1.0, false)
+BENCH_REL (pthrd_mutex_, 64thread_all_write, 64, 1.0, false)
+
+// 1 lock used by everybody, 10% writes. Not much scaling to be had.
+// Perf is best at 1 thread; once you've got multiple threads, going
+// beyond 8 threads hurts.
+BENCHMARK_DRAW_LINE()
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 1thread_10pct_write, 1, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 1thread_10pct_write, 1, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 1thread_10pct_write, 1, 0.10, false)
+BENCH_REL (folly_ticket, 1thread_10pct_write, 1, 0.10, false)
+BENCH_REL (boost_shared, 1thread_10pct_write, 1, 0.10, false)
+BENCH_REL (pthrd_rwlock, 1thread_10pct_write, 1, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thread_10pct_write, 2, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 2thread_10pct_write, 2, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 2thread_10pct_write, 2, 0.10, false)
+BENCH_REL (folly_ticket, 2thread_10pct_write, 2, 0.10, false)
+BENCH_REL (boost_shared, 2thread_10pct_write, 2, 0.10, false)
+BENCH_REL (pthrd_rwlock, 2thread_10pct_write, 2, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 4thread_10pct_write, 4, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 4thread_10pct_write, 4, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 4thread_10pct_write, 4, 0.10, false)
+BENCH_REL (folly_ticket, 4thread_10pct_write, 4, 0.10, false)
+BENCH_REL (boost_shared, 4thread_10pct_write, 4, 0.10, false)
+BENCH_REL (pthrd_rwlock, 4thread_10pct_write, 4, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 8thread_10pct_write, 8, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 8thread_10pct_write, 8, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 8thread_10pct_write, 8, 0.10, false)
+BENCH_REL (folly_ticket, 8thread_10pct_write, 8, 0.10, false)
+BENCH_REL (boost_shared, 8thread_10pct_write, 8, 0.10, false)
+BENCH_REL (pthrd_rwlock, 8thread_10pct_write, 8, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 16thread_10pct_write, 16, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 16thread_10pct_write, 16, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 16thread_10pct_write, 16, 0.10, false)
+BENCH_REL (folly_ticket, 16thread_10pct_write, 16, 0.10, false)
+BENCH_REL (boost_shared, 16thread_10pct_write, 16, 0.10, false)
+BENCH_REL (pthrd_rwlock, 16thread_10pct_write, 16, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 32thread_10pct_write, 32, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 32thread_10pct_write, 32, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 32thread_10pct_write, 32, 0.10, false)
+BENCH_REL (folly_ticket, 32thread_10pct_write, 32, 0.10, false)
+BENCH_REL (boost_shared, 32thread_10pct_write, 32, 0.10, false)
+BENCH_REL (pthrd_rwlock, 32thread_10pct_write, 32, 0.10, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 64thread_10pct_write, 64, 0.10, false)
+BENCH_REL (shmtx_wr_pri, 64thread_10pct_write, 64, 0.10, false)
+BENCH_REL (shmtx_rd_pri, 64thread_10pct_write, 64, 0.10, false)
+BENCH_REL (folly_ticket, 64thread_10pct_write, 64, 0.10, false)
+BENCH_REL (boost_shared, 64thread_10pct_write, 64, 0.10, false)
+BENCH_REL (pthrd_rwlock, 64thread_10pct_write, 64, 0.10, false)
+
+// 1 lock used by everybody, 1% writes. This is a more realistic example
+// than the concurrent_*_reads benchmark, but still shows SharedMutex locks
+// winning over all of the others
+BENCHMARK_DRAW_LINE()
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (shmtx_w_bare, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (shmtx_r_bare, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (folly_ticket, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (boost_shared, 1thread_1pct_write, 1, 0.01, false)
+BENCH_REL (pthrd_rwlock, 1thread_1pct_write, 1, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (shmtx_w_bare, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (shmtx_r_bare, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (folly_ticket, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (boost_shared, 2thread_1pct_write, 2, 0.01, false)
+BENCH_REL (pthrd_rwlock, 2thread_1pct_write, 2, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (shmtx_w_bare, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (shmtx_r_bare, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (folly_ticket, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (boost_shared, 4thread_1pct_write, 4, 0.01, false)
+BENCH_REL (pthrd_rwlock, 4thread_1pct_write, 4, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (shmtx_w_bare, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (shmtx_r_bare, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (folly_ticket, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (boost_shared, 8thread_1pct_write, 8, 0.01, false)
+BENCH_REL (pthrd_rwlock, 8thread_1pct_write, 8, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (shmtx_w_bare, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (shmtx_r_bare, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (folly_ticket, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (boost_shared, 16thread_1pct_write, 16, 0.01, false)
+BENCH_REL (pthrd_rwlock, 16thread_1pct_write, 16, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (shmtx_w_bare, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (shmtx_r_bare, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (folly_ticket, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (boost_shared, 32thread_1pct_write, 32, 0.01, false)
+BENCH_REL (pthrd_rwlock, 32thread_1pct_write, 32, 0.01, false)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (shmtx_wr_pri, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (shmtx_w_bare, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (shmtx_rd_pri, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (shmtx_r_bare, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (folly_ticket, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (boost_shared, 64thread_1pct_write, 64, 0.01, false)
+BENCH_REL (pthrd_rwlock, 64thread_1pct_write, 64, 0.01, false)
+
+// Worst case scenario for deferred locks. No actual sharing, so it is
+// likely that read operations will have to first set the
+// kDeferredReadersPossibleBit, and likely that writers will have to
+// scan deferredReaders[].
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thr_2lock_50pct_write, 2, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 2thr_2lock_50pct_write, 2, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 2thr_2lock_50pct_write, 2, 0.50, true)
+BENCH_BASE(folly_rwspin, 4thr_4lock_50pct_write, 4, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 4thr_4lock_50pct_write, 4, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 4thr_4lock_50pct_write, 4, 0.50, true)
+BENCH_BASE(folly_rwspin, 8thr_8lock_50pct_write, 8, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 8thr_8lock_50pct_write, 8, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 8thr_8lock_50pct_write, 8, 0.50, true)
+BENCH_BASE(folly_rwspin, 16thr_16lock_50pct_write, 16, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 16thr_16lock_50pct_write, 16, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 16thr_16lock_50pct_write, 16, 0.50, true)
+BENCH_BASE(folly_rwspin, 32thr_32lock_50pct_write, 32, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 32thr_32lock_50pct_write, 32, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 32thr_32lock_50pct_write, 32, 0.50, true)
+BENCH_BASE(folly_rwspin, 64thr_64lock_50pct_write, 64, 0.50, true)
+BENCH_REL (shmtx_wr_pri, 64thr_64lock_50pct_write, 64, 0.50, true)
+BENCH_REL (shmtx_rd_pri, 64thr_64lock_50pct_write, 64, 0.50, true)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thr_2lock_10pct_write, 2, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 2thr_2lock_10pct_write, 2, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 2thr_2lock_10pct_write, 2, 0.10, true)
+BENCH_BASE(folly_rwspin, 4thr_4lock_10pct_write, 4, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 4thr_4lock_10pct_write, 4, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 4thr_4lock_10pct_write, 4, 0.10, true)
+BENCH_BASE(folly_rwspin, 8thr_8lock_10pct_write, 8, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 8thr_8lock_10pct_write, 8, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 8thr_8lock_10pct_write, 8, 0.10, true)
+BENCH_BASE(folly_rwspin, 16thr_16lock_10pct_write, 16, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 16thr_16lock_10pct_write, 16, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 16thr_16lock_10pct_write, 16, 0.10, true)
+BENCH_BASE(folly_rwspin, 32thr_32lock_10pct_write, 32, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 32thr_32lock_10pct_write, 32, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 32thr_32lock_10pct_write, 32, 0.10, true)
+BENCH_BASE(folly_rwspin, 64thr_64lock_10pct_write, 64, 0.10, true)
+BENCH_REL (shmtx_wr_pri, 64thr_64lock_10pct_write, 64, 0.10, true)
+BENCH_REL (shmtx_rd_pri, 64thr_64lock_10pct_write, 64, 0.10, true)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin, 2thr_2lock_1pct_write, 2, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 2thr_2lock_1pct_write, 2, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 2thr_2lock_1pct_write, 2, 0.01, true)
+BENCH_BASE(folly_rwspin, 4thr_4lock_1pct_write, 4, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 4thr_4lock_1pct_write, 4, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 4thr_4lock_1pct_write, 4, 0.01, true)
+BENCH_BASE(folly_rwspin, 8thr_8lock_1pct_write, 8, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 8thr_8lock_1pct_write, 8, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 8thr_8lock_1pct_write, 8, 0.01, true)
+BENCH_BASE(folly_rwspin, 16thr_16lock_1pct_write, 16, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 16thr_16lock_1pct_write, 16, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 16thr_16lock_1pct_write, 16, 0.01, true)
+BENCH_BASE(folly_rwspin, 32thr_32lock_1pct_write, 32, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 32thr_32lock_1pct_write, 32, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 32thr_32lock_1pct_write, 32, 0.01, true)
+BENCH_BASE(folly_rwspin, 64thr_64lock_1pct_write, 64, 0.01, true)
+BENCH_REL (shmtx_wr_pri, 64thr_64lock_1pct_write, 64, 0.01, true)
+BENCH_REL (shmtx_rd_pri, 64thr_64lock_1pct_write, 64, 0.01, true)
+
+// Ping-pong tests have a scaled number of iterations, because their
+// burn loop would make them too slow otherwise. Ping-pong with a burn
+// count of 100k or 300k shows the advantage of soft-spin, reducing the
+// cost of each wakeup by about 20 usec. (Take the benchmark-reported
+// difference, ~400 nanos, multiply by the scale of 100, then divide by
+// 2 because each round has two wakeups.)
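+// For example: a reported difference of ~400 ns/iter * 100 (the
+// iteration scale) ~= 40 usec per round, and with two wakeups per round
+// that is ~= 20 usec saved per wakeup.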
+BENCHMARK_DRAW_LINE()
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_ping_pong, burn0, 1, 0)
+BENCH_REL (shmtx_w_bare_ping_pong, burn0, 1, 0)
+BENCH_REL (shmtx_r_bare_ping_pong, burn0, 1, 0)
+BENCH_REL (folly_ticket_ping_pong, burn0, 1, 0)
+BENCH_REL (boost_shared_ping_pong, burn0, 1, 0)
+BENCH_REL (pthrd_rwlock_ping_pong, burn0, 1, 0)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_ping_pong, burn100k, 100, 100000)
+BENCH_REL (shmtx_w_bare_ping_pong, burn100k, 100, 100000)
+BENCH_REL (shmtx_r_bare_ping_pong, burn100k, 100, 100000)
+BENCH_REL (folly_ticket_ping_pong, burn100k, 100, 100000)
+BENCH_REL (boost_shared_ping_pong, burn100k, 100, 100000)
+BENCH_REL (pthrd_rwlock_ping_pong, burn100k, 100, 100000)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_ping_pong, burn300k, 100, 300000)
+BENCH_REL (shmtx_w_bare_ping_pong, burn300k, 100, 300000)
+BENCH_REL (shmtx_r_bare_ping_pong, burn300k, 100, 300000)
+BENCH_REL (folly_ticket_ping_pong, burn300k, 100, 300000)
+BENCH_REL (boost_shared_ping_pong, burn300k, 100, 300000)
+BENCH_REL (pthrd_rwlock_ping_pong, burn300k, 100, 300000)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(folly_rwspin_ping_pong, burn1M, 1000, 1000000)
+BENCH_REL (shmtx_w_bare_ping_pong, burn1M, 1000, 1000000)
+BENCH_REL (shmtx_r_bare_ping_pong, burn1M, 1000, 1000000)
+BENCH_REL (folly_ticket_ping_pong, burn1M, 1000, 1000000)
+BENCH_REL (boost_shared_ping_pong, burn1M, 1000, 1000000)
+BENCH_REL (pthrd_rwlock_ping_pong, burn1M, 1000, 1000000)
+
+// Reproduce by allowing about 10 minutes and running
+// sudo nice -n -20 \
+// shared_mutex_test --benchmark --bm_min_iters=1000000
+//
+// The comparison uses folly::RWSpinLock as the baseline, with the
+// following row being the default SharedMutex (using *Holder or
+// Token-ful methods).
+// ============================================================================
+// folly/experimental/test/SharedMutexTest.cpp relative time/iter iters/s
+// ============================================================================
+// single_thread_lock_shared_unlock_shared 23.01ns 43.47M
+// single_thread_lock_unlock 25.42ns 39.34M
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(1thread) 15.13ns 66.10M
+// shmtx_wr_pri_reads(1thread) 73.76% 20.51ns 48.75M
+// shmtx_w_bare_reads(1thread) 59.49% 25.43ns 39.32M
+// shmtx_rd_pri_reads(1thread) 72.60% 20.84ns 47.99M
+// shmtx_r_bare_reads(1thread) 59.62% 25.37ns 39.41M
+// folly_ticket_reads(1thread) 55.40% 27.31ns 36.62M
+// boost_shared_reads(1thread) 10.88% 139.01ns 7.19M
+// pthrd_rwlock_reads(1thread) 40.70% 37.17ns 26.90M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(2thread) 47.51ns 21.05M
+// shmtx_wr_pri_reads(2thread) 237.28% 20.02ns 49.94M
+// shmtx_w_bare_reads(2thread) 222.10% 21.39ns 46.74M
+// shmtx_rd_pri_reads(2thread) 251.68% 18.88ns 52.97M
+// shmtx_r_bare_reads(2thread) 222.29% 21.37ns 46.78M
+// folly_ticket_reads(2thread) 55.00% 86.39ns 11.58M
+// boost_shared_reads(2thread) 22.86% 207.81ns 4.81M
+// pthrd_rwlock_reads(2thread) 61.36% 77.43ns 12.92M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(4thread) 69.29ns 14.43M
+// shmtx_wr_pri_reads(4thread) 694.46% 9.98ns 100.23M
+// shmtx_w_bare_reads(4thread) 650.25% 10.66ns 93.85M
+// shmtx_rd_pri_reads(4thread) 738.08% 9.39ns 106.53M
+// shmtx_r_bare_reads(4thread) 650.71% 10.65ns 93.92M
+// folly_ticket_reads(4thread) 63.86% 108.49ns 9.22M
+// boost_shared_reads(4thread) 19.53% 354.79ns 2.82M
+// pthrd_rwlock_reads(4thread) 33.86% 204.61ns 4.89M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(8thread) 75.34ns 13.27M
+// shmtx_wr_pri_reads(8thread) 1500.46% 5.02ns 199.16M
+// shmtx_w_bare_reads(8thread) 1397.84% 5.39ns 185.54M
+// shmtx_rd_pri_reads(8thread) 1589.99% 4.74ns 211.05M
+// shmtx_r_bare_reads(8thread) 1398.83% 5.39ns 185.67M
+// folly_ticket_reads(8thread) 53.26% 141.45ns 7.07M
+// boost_shared_reads(8thread) 26.24% 287.11ns 3.48M
+// pthrd_rwlock_reads(8thread) 43.40% 173.57ns 5.76M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(16thread) 80.81ns 12.38M
+// shmtx_wr_pri_reads(16thread) 3119.49% 2.59ns 386.05M
+// shmtx_w_bare_reads(16thread) 2916.06% 2.77ns 360.87M
+// shmtx_rd_pri_reads(16thread) 3330.06% 2.43ns 412.11M
+// shmtx_r_bare_reads(16thread) 2909.05% 2.78ns 360.01M
+// folly_ticket_reads(16thread) 44.59% 181.21ns 5.52M
+// boost_shared_reads(16thread) 29.56% 273.40ns 3.66M
+// pthrd_rwlock_reads(16thread) 48.39% 166.99ns 5.99M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(32thread) 73.29ns 13.64M
+// shmtx_wr_pri_reads(32thread) 4417.58% 1.66ns 602.77M
+// shmtx_w_bare_reads(32thread) 4463.71% 1.64ns 609.06M
+// shmtx_rd_pri_reads(32thread) 4777.84% 1.53ns 651.92M
+// shmtx_r_bare_reads(32thread) 4312.45% 1.70ns 588.42M
+// folly_ticket_reads(32thread) 25.56% 286.75ns 3.49M
+// boost_shared_reads(32thread) 22.08% 331.86ns 3.01M
+// pthrd_rwlock_reads(32thread) 46.72% 156.87ns 6.37M
+// ----------------------------------------------------------------------------
+// folly_rwspin_reads(64thread) 74.92ns 13.35M
+// shmtx_wr_pri_reads(64thread) 4171.71% 1.80ns 556.83M
+// shmtx_w_bare_reads(64thread) 3973.49% 1.89ns 530.37M
+// shmtx_rd_pri_reads(64thread) 4404.73% 1.70ns 587.94M
+// shmtx_r_bare_reads(64thread) 3985.48% 1.88ns 531.98M
+// folly_ticket_reads(64thread) 26.07% 287.39ns 3.48M
+// boost_shared_reads(64thread) 23.59% 317.64ns 3.15M
+// pthrd_rwlock_reads(64thread) 49.54% 151.24ns 6.61M
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// folly_rwspin(1thread_all_write) 25.29ns 39.53M
+// shmtx_wr_pri(1thread_all_write) 96.76% 26.14ns 38.25M
+// shmtx_rd_pri(1thread_all_write) 96.60% 26.18ns 38.19M
+// folly_ticket(1thread_all_write) 89.58% 28.24ns 35.42M
+// boost_shared(1thread_all_write) 17.06% 148.29ns 6.74M
+// pthrd_rwlock(1thread_all_write) 63.32% 39.95ns 25.03M
+// pthrd_mutex_(1thread_all_write) 81.38% 31.08ns 32.17M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thread_all_write) 104.60ns 9.56M
+// shmtx_wr_pri(2thread_all_write) 48.87% 214.06ns 4.67M
+// shmtx_rd_pri(2thread_all_write) 42.47% 246.31ns 4.06M
+// folly_ticket(2thread_all_write) 73.12% 143.05ns 6.99M
+// boost_shared(2thread_all_write) 24.59% 425.41ns 2.35M
+// pthrd_rwlock(2thread_all_write) 38.69% 270.37ns 3.70M
+// pthrd_mutex_(2thread_all_write) 155.45% 67.29ns 14.86M
+// ----------------------------------------------------------------------------
+// folly_rwspin(4thread_all_write) 166.17ns 6.02M
+// shmtx_wr_pri(4thread_all_write) 45.40% 366.00ns 2.73M
+// shmtx_rd_pri(4thread_all_write) 62.81% 264.56ns 3.78M
+// folly_ticket(4thread_all_write) 118.11% 140.69ns 7.11M
+// boost_shared(4thread_all_write) 8.78% 1.89us 528.22K
+// pthrd_rwlock(4thread_all_write) 27.30% 608.59ns 1.64M
+// pthrd_mutex_(4thread_all_write) 92.18% 180.27ns 5.55M
+// ----------------------------------------------------------------------------
+// folly_rwspin(8thread_all_write) 363.10ns 2.75M
+// shmtx_wr_pri(8thread_all_write) 163.18% 222.51ns 4.49M
+// shmtx_rd_pri(8thread_all_write) 91.20% 398.11ns 2.51M
+// folly_ticket(8thread_all_write) 150.11% 241.89ns 4.13M
+// boost_shared(8thread_all_write) 7.53% 4.82us 207.48K
+// pthrd_rwlock(8thread_all_write) 57.06% 636.32ns 1.57M
+// pthrd_mutex_(8thread_all_write) 218.78% 165.96ns 6.03M
+// ----------------------------------------------------------------------------
+// folly_rwspin(16thread_all_write) 762.75ns 1.31M
+// shmtx_wr_pri(16thread_all_write) 131.04% 582.08ns 1.72M
+// shmtx_rd_pri(16thread_all_write) 130.26% 585.57ns 1.71M
+// folly_ticket(16thread_all_write) 253.39% 301.01ns 3.32M
+// boost_shared(16thread_all_write) 10.33% 7.38us 135.43K
+// pthrd_rwlock(16thread_all_write) 141.66% 538.43ns 1.86M
+// pthrd_mutex_(16thread_all_write) 471.34% 161.83ns 6.18M
+// ----------------------------------------------------------------------------
+// folly_rwspin(32thread_all_write) 1.42us 705.40K
+// shmtx_wr_pri(32thread_all_write) 229.36% 618.09ns 1.62M
+// shmtx_rd_pri(32thread_all_write) 228.78% 619.65ns 1.61M
+// folly_ticket(32thread_all_write) 326.61% 434.04ns 2.30M
+// boost_shared(32thread_all_write) 18.65% 7.60us 131.59K
+// pthrd_rwlock(32thread_all_write) 261.56% 542.00ns 1.85M
+// pthrd_mutex_(32thread_all_write) 946.65% 149.75ns 6.68M
+// ----------------------------------------------------------------------------
+// folly_rwspin(64thread_all_write) 1.83us 545.94K
+// shmtx_wr_pri(64thread_all_write) 248.08% 738.34ns 1.35M
+// shmtx_rd_pri(64thread_all_write) 249.47% 734.23ns 1.36M
+// folly_ticket(64thread_all_write) 342.38% 535.00ns 1.87M
+// boost_shared(64thread_all_write) 23.95% 7.65us 130.75K
+// pthrd_rwlock(64thread_all_write) 318.32% 575.42ns 1.74M
+// pthrd_mutex_(64thread_all_write) 1288.43% 142.16ns 7.03M
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// folly_rwspin(1thread_10pct_write) 19.13ns 52.28M
+// shmtx_wr_pri(1thread_10pct_write) 80.47% 23.77ns 42.07M
+// shmtx_rd_pri(1thread_10pct_write) 80.63% 23.72ns 42.15M
+// folly_ticket(1thread_10pct_write) 69.33% 27.59ns 36.25M
+// boost_shared(1thread_10pct_write) 12.46% 153.53ns 6.51M
+// pthrd_rwlock(1thread_10pct_write) 46.35% 41.27ns 24.23M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thread_10pct_write) 142.93ns 7.00M
+// shmtx_wr_pri(2thread_10pct_write) 165.37% 86.43ns 11.57M
+// shmtx_rd_pri(2thread_10pct_write) 159.35% 89.70ns 11.15M
+// folly_ticket(2thread_10pct_write) 129.31% 110.53ns 9.05M
+// boost_shared(2thread_10pct_write) 39.42% 362.54ns 2.76M
+// pthrd_rwlock(2thread_10pct_write) 87.87% 162.65ns 6.15M
+// ----------------------------------------------------------------------------
+// folly_rwspin(4thread_10pct_write) 197.39ns 5.07M
+// shmtx_wr_pri(4thread_10pct_write) 171.06% 115.39ns 8.67M
+// shmtx_rd_pri(4thread_10pct_write) 139.86% 141.13ns 7.09M
+// folly_ticket(4thread_10pct_write) 129.34% 152.62ns 6.55M
+// boost_shared(4thread_10pct_write) 16.99% 1.16us 860.70K
+// pthrd_rwlock(4thread_10pct_write) 47.65% 414.28ns 2.41M
+// ----------------------------------------------------------------------------
+// folly_rwspin(8thread_10pct_write) 392.62ns 2.55M
+// shmtx_wr_pri(8thread_10pct_write) 273.40% 143.61ns 6.96M
+// shmtx_rd_pri(8thread_10pct_write) 194.52% 201.84ns 4.95M
+// folly_ticket(8thread_10pct_write) 189.91% 206.75ns 4.84M
+// boost_shared(8thread_10pct_write) 16.84% 2.33us 429.03K
+// pthrd_rwlock(8thread_10pct_write) 87.03% 451.14ns 2.22M
+// ----------------------------------------------------------------------------
+// folly_rwspin(16thread_10pct_write) 794.93ns 1.26M
+// shmtx_wr_pri(16thread_10pct_write) 352.64% 225.43ns 4.44M
+// shmtx_rd_pri(16thread_10pct_write) 295.42% 269.09ns 3.72M
+// folly_ticket(16thread_10pct_write) 296.11% 268.46ns 3.72M
+// boost_shared(16thread_10pct_write) 17.04% 4.66us 214.39K
+// pthrd_rwlock(16thread_10pct_write) 176.40% 450.64ns 2.22M
+// ----------------------------------------------------------------------------
+// folly_rwspin(32thread_10pct_write) 821.14ns 1.22M
+// shmtx_wr_pri(32thread_10pct_write) 355.74% 230.82ns 4.33M
+// shmtx_rd_pri(32thread_10pct_write) 320.09% 256.53ns 3.90M
+// folly_ticket(32thread_10pct_write) 262.01% 313.41ns 3.19M
+// boost_shared(32thread_10pct_write) 8.15% 10.08us 99.20K
+// pthrd_rwlock(32thread_10pct_write) 175.15% 468.83ns 2.13M
+// ----------------------------------------------------------------------------
+// folly_rwspin(64thread_10pct_write) 1.20us 836.33K
+// shmtx_wr_pri(64thread_10pct_write) 437.20% 273.49ns 3.66M
+// shmtx_rd_pri(64thread_10pct_write) 438.80% 272.49ns 3.67M
+// folly_ticket(64thread_10pct_write) 254.51% 469.82ns 2.13M
+// boost_shared(64thread_10pct_write) 6.05% 19.78us 50.56K
+// pthrd_rwlock(64thread_10pct_write) 254.24% 470.30ns 2.13M
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// folly_rwspin(1thread_1pct_write) 18.60ns 53.76M
+// shmtx_wr_pri(1thread_1pct_write) 79.07% 23.52ns 42.51M
+// shmtx_w_bare(1thread_1pct_write) 66.09% 28.15ns 35.53M
+// shmtx_rd_pri(1thread_1pct_write) 79.21% 23.48ns 42.58M
+// shmtx_r_bare(1thread_1pct_write) 65.98% 28.19ns 35.47M
+// folly_ticket(1thread_1pct_write) 67.69% 27.48ns 36.39M
+// boost_shared(1thread_1pct_write) 12.17% 152.88ns 6.54M
+// pthrd_rwlock(1thread_1pct_write) 45.04% 41.30ns 24.22M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thread_1pct_write) 128.42ns 7.79M
+// shmtx_wr_pri(2thread_1pct_write) 347.63% 36.94ns 27.07M
+// shmtx_w_bare(2thread_1pct_write) 475.37% 27.02ns 37.02M
+// shmtx_rd_pri(2thread_1pct_write) 312.94% 41.04ns 24.37M
+// shmtx_r_bare(2thread_1pct_write) 149.38% 85.97ns 11.63M
+// folly_ticket(2thread_1pct_write) 147.88% 86.84ns 11.52M
+// boost_shared(2thread_1pct_write) 45.50% 282.24ns 3.54M
+// pthrd_rwlock(2thread_1pct_write) 129.88% 98.88ns 10.11M
+// ----------------------------------------------------------------------------
+// folly_rwspin(4thread_1pct_write) 148.88ns 6.72M
+// shmtx_wr_pri(4thread_1pct_write) 504.03% 29.54ns 33.86M
+// shmtx_w_bare(4thread_1pct_write) 471.63% 31.57ns 31.68M
+// shmtx_rd_pri(4thread_1pct_write) 291.84% 51.01ns 19.60M
+// shmtx_r_bare(4thread_1pct_write) 81.41% 182.86ns 5.47M
+// folly_ticket(4thread_1pct_write) 114.59% 129.92ns 7.70M
+// boost_shared(4thread_1pct_write) 26.70% 557.56ns 1.79M
+// pthrd_rwlock(4thread_1pct_write) 64.46% 230.97ns 4.33M
+// ----------------------------------------------------------------------------
+// folly_rwspin(8thread_1pct_write) 213.06ns 4.69M
+// shmtx_wr_pri(8thread_1pct_write) 734.88% 28.99ns 34.49M
+// shmtx_w_bare(8thread_1pct_write) 676.88% 31.48ns 31.77M
+// shmtx_rd_pri(8thread_1pct_write) 196.93% 108.19ns 9.24M
+// shmtx_r_bare(8thread_1pct_write) 99.35% 214.46ns 4.66M
+// folly_ticket(8thread_1pct_write) 120.84% 176.31ns 5.67M
+// boost_shared(8thread_1pct_write) 28.51% 747.36ns 1.34M
+// pthrd_rwlock(8thread_1pct_write) 88.85% 239.81ns 4.17M
+// ----------------------------------------------------------------------------
+// folly_rwspin(16thread_1pct_write) 481.61ns 2.08M
+// shmtx_wr_pri(16thread_1pct_write) 1204.17% 40.00ns 25.00M
+// shmtx_w_bare(16thread_1pct_write) 1241.61% 38.79ns 25.78M
+// shmtx_rd_pri(16thread_1pct_write) 315.61% 152.60ns 6.55M
+// shmtx_r_bare(16thread_1pct_write) 211.23% 228.00ns 4.39M
+// folly_ticket(16thread_1pct_write) 227.88% 211.35ns 4.73M
+// boost_shared(16thread_1pct_write) 34.17% 1.41us 709.47K
+// pthrd_rwlock(16thread_1pct_write) 210.97% 228.28ns 4.38M
+// ----------------------------------------------------------------------------
+// folly_rwspin(32thread_1pct_write) 382.40ns 2.62M
+// shmtx_wr_pri(32thread_1pct_write) 984.99% 38.82ns 25.76M
+// shmtx_w_bare(32thread_1pct_write) 957.41% 39.94ns 25.04M
+// shmtx_rd_pri(32thread_1pct_write) 248.87% 153.65ns 6.51M
+// shmtx_r_bare(32thread_1pct_write) 175.33% 218.11ns 4.58M
+// folly_ticket(32thread_1pct_write) 140.50% 272.18ns 3.67M
+// boost_shared(32thread_1pct_write) 12.67% 3.02us 331.22K
+// pthrd_rwlock(32thread_1pct_write) 172.70% 221.42ns 4.52M
+// ----------------------------------------------------------------------------
+// folly_rwspin(64thread_1pct_write) 448.64ns 2.23M
+// shmtx_wr_pri(64thread_1pct_write) 1136.53% 39.47ns 25.33M
+// shmtx_w_bare(64thread_1pct_write) 1037.84% 43.23ns 23.13M
+// shmtx_rd_pri(64thread_1pct_write) 284.52% 157.68ns 6.34M
+// shmtx_r_bare(64thread_1pct_write) 216.51% 207.21ns 4.83M
+// folly_ticket(64thread_1pct_write) 114.00% 393.54ns 2.54M
+// boost_shared(64thread_1pct_write) 8.29% 5.41us 184.85K
+// pthrd_rwlock(64thread_1pct_write) 207.19% 216.53ns 4.62M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thr_2lock_50pct_write) 10.84ns 92.23M
+// shmtx_wr_pri(2thr_2lock_50pct_write) 85.21% 12.72ns 78.59M
+// shmtx_rd_pri(2thr_2lock_50pct_write) 84.80% 12.79ns 78.21M
+// folly_rwspin(4thr_4lock_50pct_write) 5.33ns 187.76M
+// shmtx_wr_pri(4thr_4lock_50pct_write) 84.84% 6.28ns 159.30M
+// shmtx_rd_pri(4thr_4lock_50pct_write) 84.38% 6.31ns 158.42M
+// folly_rwspin(8thr_8lock_50pct_write) 2.63ns 379.54M
+// shmtx_wr_pri(8thr_8lock_50pct_write) 84.30% 3.13ns 319.97M
+// shmtx_rd_pri(8thr_8lock_50pct_write) 84.35% 3.12ns 320.16M
+// folly_rwspin(16thr_16lock_50pct_write) 1.31ns 760.73M
+// shmtx_wr_pri(16thr_16lock_50pct_write) 83.58% 1.57ns 635.80M
+// shmtx_rd_pri(16thr_16lock_50pct_write) 83.72% 1.57ns 636.89M
+// folly_rwspin(32thr_32lock_50pct_write) 1.19ns 838.77M
+// shmtx_wr_pri(32thr_32lock_50pct_write) 89.84% 1.33ns 753.55M
+// shmtx_rd_pri(32thr_32lock_50pct_write) 89.39% 1.33ns 749.82M
+// folly_rwspin(64thr_64lock_50pct_write) 1.39ns 718.11M
+// shmtx_wr_pri(64thr_64lock_50pct_write) 91.89% 1.52ns 659.90M
+// shmtx_rd_pri(64thr_64lock_50pct_write) 91.08% 1.53ns 654.04M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thr_2lock_10pct_write) 10.25ns 97.53M
+// shmtx_wr_pri(2thr_2lock_10pct_write) 84.23% 12.17ns 82.14M
+// shmtx_rd_pri(2thr_2lock_10pct_write) 84.03% 12.20ns 81.96M
+// folly_rwspin(4thr_4lock_10pct_write) 5.05ns 197.98M
+// shmtx_wr_pri(4thr_4lock_10pct_write) 84.01% 6.01ns 166.31M
+// shmtx_rd_pri(4thr_4lock_10pct_write) 83.98% 6.01ns 166.27M
+// folly_rwspin(8thr_8lock_10pct_write) 2.46ns 405.97M
+// shmtx_wr_pri(8thr_8lock_10pct_write) 82.52% 2.98ns 335.03M
+// shmtx_rd_pri(8thr_8lock_10pct_write) 82.47% 2.99ns 334.82M
+// folly_rwspin(16thr_16lock_10pct_write) 1.23ns 813.48M
+// shmtx_wr_pri(16thr_16lock_10pct_write) 82.08% 1.50ns 667.72M
+// shmtx_rd_pri(16thr_16lock_10pct_write) 81.53% 1.51ns 663.23M
+// folly_rwspin(32thr_32lock_10pct_write) 1.20ns 836.43M
+// shmtx_wr_pri(32thr_32lock_10pct_write) 91.52% 1.31ns 765.47M
+// shmtx_rd_pri(32thr_32lock_10pct_write) 91.87% 1.30ns 768.45M
+// folly_rwspin(64thr_64lock_10pct_write) 1.39ns 721.74M
+// shmtx_wr_pri(64thr_64lock_10pct_write) 92.04% 1.51ns 664.28M
+// shmtx_rd_pri(64thr_64lock_10pct_write) 92.57% 1.50ns 668.15M
+// ----------------------------------------------------------------------------
+// folly_rwspin(2thr_2lock_1pct_write) 10.13ns 98.71M
+// shmtx_wr_pri(2thr_2lock_1pct_write) 83.59% 12.12ns 82.51M
+// shmtx_rd_pri(2thr_2lock_1pct_write) 83.59% 12.12ns 82.51M
+// folly_rwspin(4thr_4lock_1pct_write) 4.96ns 201.67M
+// shmtx_wr_pri(4thr_4lock_1pct_write) 82.87% 5.98ns 167.13M
+// shmtx_rd_pri(4thr_4lock_1pct_write) 83.05% 5.97ns 167.48M
+// folly_rwspin(8thr_8lock_1pct_write) 2.44ns 409.64M
+// shmtx_wr_pri(8thr_8lock_1pct_write) 82.46% 2.96ns 337.79M
+// shmtx_rd_pri(8thr_8lock_1pct_write) 82.40% 2.96ns 337.55M
+// folly_rwspin(16thr_16lock_1pct_write) 1.22ns 821.15M
+// shmtx_wr_pri(16thr_16lock_1pct_write) 81.63% 1.49ns 670.29M
+// shmtx_rd_pri(16thr_16lock_1pct_write) 81.65% 1.49ns 670.50M
+// folly_rwspin(32thr_32lock_1pct_write) 1.20ns 832.88M
+// shmtx_wr_pri(32thr_32lock_1pct_write) 92.22% 1.30ns 768.06M
+// shmtx_rd_pri(32thr_32lock_1pct_write) 92.21% 1.30ns 768.01M
+// folly_rwspin(64thr_64lock_1pct_write) 1.38ns 726.10M
+// shmtx_wr_pri(64thr_64lock_1pct_write) 92.24% 1.49ns 669.75M
+// shmtx_rd_pri(64thr_64lock_1pct_write) 92.13% 1.49ns 668.95M
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// folly_rwspin_ping_pong(burn0) 503.21ns 1.99M
+// shmtx_w_bare_ping_pong(burn0) 79.13% 635.96ns 1.57M
+// shmtx_r_bare_ping_pong(burn0) 59.08% 851.81ns 1.17M
+// folly_ticket_ping_pong(burn0) 60.50% 831.77ns 1.20M
+// boost_shared_ping_pong(burn0) 4.46% 11.28us 88.65K
+// pthrd_rwlock_ping_pong(burn0) 6.86% 7.34us 136.27K
+// ----------------------------------------------------------------------------
+// folly_rwspin_ping_pong(burn100k) 685.00ns 1.46M
+// shmtx_w_bare_ping_pong(burn100k) 100.05% 684.65ns 1.46M
+// shmtx_r_bare_ping_pong(burn100k) 99.93% 685.51ns 1.46M
+// folly_ticket_ping_pong(burn100k) 99.32% 689.72ns 1.45M
+// boost_shared_ping_pong(burn100k) 56.59% 1.21us 826.06K
+// pthrd_rwlock_ping_pong(burn100k) 58.32% 1.17us 851.41K
+// ----------------------------------------------------------------------------
+// folly_rwspin_ping_pong(burn300k) 2.15us 464.20K
+// shmtx_w_bare_ping_pong(burn300k) 101.02% 2.13us 468.93K
+// shmtx_r_bare_ping_pong(burn300k) 103.95% 2.07us 482.55K
+// folly_ticket_ping_pong(burn300k) 104.06% 2.07us 483.05K
+// boost_shared_ping_pong(burn300k) 86.36% 2.49us 400.86K
+// pthrd_rwlock_ping_pong(burn300k) 87.30% 2.47us 405.25K
+// ----------------------------------------------------------------------------
+// folly_rwspin_ping_pong(burn1M) 675.20ns 1.48M
+// shmtx_w_bare_ping_pong(burn1M) 99.73% 677.02ns 1.48M
+// shmtx_r_bare_ping_pong(burn1M) 99.23% 680.45ns 1.47M
+// folly_ticket_ping_pong(burn1M) 97.85% 690.01ns 1.45M
+// boost_shared_ping_pong(burn1M) 93.17% 724.67ns 1.38M
+// pthrd_rwlock_ping_pong(burn1M) 91.84% 735.22ns 1.36M
+// ============================================================================
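+//
+// To reproduce numbers like those above, this binary presumably needs to be
+// run with folly's --benchmark flag, since folly::runBenchmarksOnFlag() in
+// main() below only executes the registered benchmarks when that flag is
+// set; the gtest cases run first via RUN_ALL_TESTS().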
+
+int main(int argc, char** argv) {
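+ // Reference each benchmark function once so that builds which never run
+ // the benchmarks do not emit unused-function/variable warnings for them.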
+ (void)folly_rwspin_reads;
+ (void)shmtx_wr_pri_reads;
+ (void)shmtx_w_bare_reads;
+ (void)shmtx_rd_pri_reads;
+ (void)shmtx_r_bare_reads;
+ (void)folly_ticket_reads;
+ (void)boost_shared_reads;
+ (void)pthrd_rwlock_reads;
+ (void)folly_rwspin;
+ (void)shmtx_wr_pri;
+ (void)shmtx_w_bare;
+ (void)shmtx_rd_pri;
+ (void)shmtx_r_bare;
+ (void)folly_ticket;
+ (void)boost_shared;
+ (void)pthrd_rwlock;
+ (void)pthrd_mutex_;
+ (void)folly_rwspin_ping_pong;
+ (void)shmtx_w_bare_ping_pong;
+ (void)shmtx_r_bare_ping_pong;
+ (void)folly_ticket_ping_pong;
+ (void)boost_shared_ping_pong;
+ (void)pthrd_rwlock_ping_pong;
+
+ testing::InitGoogleTest(&argc, argv);
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ int rv = RUN_ALL_TESTS();
+ folly::runBenchmarksOnFlag();
+ return rv;
+}