+/*
+ * Copyright 2017-present Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * N.B. You most likely do _not_ want to use RWSpinLock or any other
+ * kind of spinlock. Use SharedMutex instead.
+ *
+ * In short, spinlocks in preemptive multi-tasking operating systems
+ * have serious problems and fast mutexes like SharedMutex are almost
+ * certainly the better choice, because letting the OS scheduler put a
+ * thread to sleep is better for system responsiveness and throughput
+ * than wasting a timeslice repeatedly polling a lock held by a
+ * thread that has been descheduled, and you can't prevent userspace
+ * threads from being descheduled while they hold the lock.
+ *
+ * Spinlocks in an operating system kernel make much more sense than
+ * they do in userspace.
+ *
+ * -------------------------------------------------------------------
+ *
+ * Two Read-Write spin lock implementations.
+ *
+ * Ref: http://locklessinc.com/articles/locks
+ *
+ * Both locks here are faster than pthread_rwlock and have very low
+ * overhead (usually 20-30ns). They don't use any system mutexes and
+ * are very compact (4/8 bytes), so they are suitable for per-instance
+ * locking, particularly when contention is not expected.
+ *
+ * For a spinlock, RWSpinLock is a reasonable choice. (See the note
+ * above for why a spinlock is frequently a bad idea in general.)
+ * RWSpinLock has minimal overhead, and comparable contention
+ * performance when the number of competing threads is less than or
+ * equal to the number of logical CPUs. Even as the number of
+ * threads gets larger, RWSpinLock can still be very competitive in
+ * READ, although it is slower on WRITE, and also inherently unfair
+ * to writers.
+ *
+ * RWTicketSpinLock shows more balanced READ/WRITE performance. If
+ * your application really needs a lot more threads, and a
+ * higher-priority writer, prefer one of the RWTicketSpinLock locks.
+ *
+ * Caveats:
+ *
+ * RWTicketSpinLock locks can only be used with GCC on x86/x86-64
+ * based systems.
+ *
+ * RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
+ * readers and writers.
+ *
+ * RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
+ * readers and writers.
+ *
+ * RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
+ * writer priority) is NOT reentrant, even for lock_shared().
+ *
+ * The lock will not grant any new shared (read) accesses while a thread
+ * attempting to acquire the lock in write mode is blocked. (That is,
+ * if the lock is held in shared mode by N threads, and a thread attempts
+ * to acquire it in write mode, no one else can acquire it in shared mode
+ * until these N threads release the lock and then the blocked thread
+ * acquires and releases the exclusive lock.) This also applies for
+ * attempts to reacquire the lock in shared mode by threads that already
+ * hold it in shared mode, making the lock non-reentrant.
+ *
+ * RWSpinLock handles 2^30 - 1 concurrent readers.
+ *
+ * @author Xin Liu <xliux@fb.com>
+ */
+
+#pragma once
+
+/*
+========================================================================
+Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
+========================================================================
+
+------------------------------------------------------------------------------
+1. Single thread benchmark (read/write lock + unlock overhead)
+Benchmark Iters Total t t/iter iter/sec
+-------------------------------------------------------------------------------
+* BM_RWSpinLockRead 100000 1.786 ms 17.86 ns 53.4M
++30.5% BM_RWSpinLockWrite 100000 2.331 ms 23.31 ns 40.91M
++85.7% BM_RWTicketSpinLock32Read 100000 3.317 ms 33.17 ns 28.75M
++96.0% BM_RWTicketSpinLock32Write 100000 3.5 ms 35 ns 27.25M
++85.6% BM_RWTicketSpinLock64Read 100000 3.315 ms 33.15 ns 28.77M
++96.0% BM_RWTicketSpinLock64Write 100000 3.5 ms 35 ns 27.25M
++85.7% BM_RWTicketSpinLock32FavorWriterRead 100000 3.317 ms 33.17 ns 28.75M
++29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000 2.316 ms 23.16 ns 41.18M
++85.3% BM_RWTicketSpinLock64FavorWriterRead 100000 3.309 ms 33.09 ns 28.82M
++30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000 2.325 ms 23.25 ns 41.02M
++ 175% BM_PThreadRWMutexRead 100000 4.917 ms 49.17 ns 19.4M
++ 166% BM_PThreadRWMutexWrite 100000 4.757 ms 47.57 ns 20.05M
+
+------------------------------------------------------------------------------
+2. Contention Benchmark 90% read 10% write
+Benchmark hits average min max sigma
+------------------------------------------------------------------------------
+---------- 8 threads ------------
+RWSpinLock Write 142666 220ns 78ns 40.8us 269ns
+RWSpinLock Read 1282297 222ns 80ns 37.7us 248ns
+RWTicketSpinLock Write 85692 209ns 71ns 17.9us 252ns
+RWTicketSpinLock Read 769571 215ns 78ns 33.4us 251ns
+pthread_rwlock_t Write 84248 2.48us 99ns 269us 8.19us
+pthread_rwlock_t Read 761646 933ns 101ns 374us 3.25us
+
+---------- 16 threads ------------
+RWSpinLock Write 124236 237ns 78ns 261us 801ns
+RWSpinLock Read 1115807 236ns 78ns 2.27ms 2.17us
+RWTicketSpinLock Write 81781 231ns 71ns 31.4us 351ns
+RWTicketSpinLock Read 734518 238ns 78ns 73.6us 379ns
+pthread_rwlock_t Write 83363 7.12us 99ns 785us 28.1us
+pthread_rwlock_t Read 754978 2.18us 101ns 1.02ms 14.3us
+
+---------- 50 threads ------------
+RWSpinLock Write 131142 1.37us 82ns 7.53ms 68.2us
+RWSpinLock Read 1181240 262ns 78ns 6.62ms 12.7us
+RWTicketSpinLock Write 83045 397ns 73ns 7.01ms 31.5us
+RWTicketSpinLock Read 744133 386ns 78ns 11ms 31.4us
+pthread_rwlock_t Write 80849 112us 103ns 4.52ms 263us
+pthread_rwlock_t Read 728698 24us 101ns 7.28ms 194us
+
+*/
+
+#include <folly/Portability.h>
+#include <folly/portability/Asm.h>
+
+#if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8))
+#define RW_SPINLOCK_USE_X86_INTRINSIC_
+#include <x86intrin.h>
+#elif defined(_MSC_VER) && FOLLY_X64
+#define RW_SPINLOCK_USE_X86_INTRINSIC_
+#elif FOLLY_AARCH64
+#define RW_SPINLOCK_USE_X86_INTRINSIC_
+#else
+#undef RW_SPINLOCK_USE_X86_INTRINSIC_
+#endif
+
+// iOS doesn't define _mm_cvtsi64_si128 and friends
+#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE && FOLLY_X64
+#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
+#else
+#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <string>
+#include <thread>
+
+#include <glog/logging.h>
+
+#include <folly/Likely.h>
+
+namespace folly {
+
+/*
+ * A simple, small (4-bytes), but unfair rwlock. Use it when you want
+ * a nice writer and don't expect a lot of write/read contention, or
+ * when you need small rwlocks since you are creating a large number
+ * of them.
+ *
+ * Note that the unfairness here is extreme: if the lock is
+ * continually accessed for read, writers will never get a chance. If
+ * the lock can be that highly contended this class is probably not an
+ * ideal choice anyway.
+ *
+ * It currently implements most of the Lockable, SharedLockable and
+ * UpgradeLockable concepts except the TimedLockable related locking/unlocking
+ * interfaces.
+ */
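+/*
+ * A minimal usage sketch (hypothetical calling code; the RAII Holder
+ * classes are defined as nested classes below):
+ *
+ *   folly::RWSpinLock lock;
+ *
+ *   {
+ *     folly::RWSpinLock::ReadHolder guard(&lock);   // shared access
+ *     // ... read the protected state ...
+ *   }
+ *
+ *   {
+ *     folly::RWSpinLock::WriteHolder guard(&lock);  // exclusive access
+ *     // ... mutate the protected state ...
+ *   }
+ */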
+class RWSpinLock {
+ enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
+ public:
+ constexpr RWSpinLock() : bits_(0) {}
+
+ RWSpinLock(RWSpinLock const&) = delete;
+ RWSpinLock& operator=(RWSpinLock const&) = delete;
+
+ // Lockable Concept
+ void lock() {
+ uint_fast32_t count = 0;
+ while (!LIKELY(try_lock())) {
+ if (++count > 1000) {
+ std::this_thread::yield();
+ }
+ }
+ }
+
+ // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
+ void unlock() {
+ static_assert(READER > WRITER + UPGRADED, "wrong bits!");
+ bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
+ }
+
+ // SharedLockable Concept
+ void lock_shared() {
+ uint_fast32_t count = 0;
+ while (!LIKELY(try_lock_shared())) {
+ if (++count > 1000) {
+ std::this_thread::yield();
+ }
+ }
+ }
+
+ void unlock_shared() {
+ bits_.fetch_add(-READER, std::memory_order_release);
+ }
+
+ // Downgrade the lock from writer status to reader status.
+ void unlock_and_lock_shared() {
+ bits_.fetch_add(READER, std::memory_order_acquire);
+ unlock();
+ }
+
+ // UpgradeLockable Concept
+ void lock_upgrade() {
+ uint_fast32_t count = 0;
+ while (!try_lock_upgrade()) {
+ if (++count > 1000) {
+ std::this_thread::yield();
+ }
+ }
+ }
+
+ void unlock_upgrade() {
+ bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
+ }
+
+ // unlock upgrade and try to acquire write lock
+ void unlock_upgrade_and_lock() {
+    uint_fast32_t count = 0;
+ while (!try_unlock_upgrade_and_lock()) {
+ if (++count > 1000) {
+ std::this_thread::yield();
+ }
+ }
+ }
+
+ // unlock upgrade and read lock atomically
+ void unlock_upgrade_and_lock_shared() {
+ bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
+ }
+
+ // write unlock and upgrade lock atomically
+ void unlock_and_lock_upgrade() {
+    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
+    // at the same time that other threads are trying to do try_lock_upgrade().
+ bits_.fetch_or(UPGRADED, std::memory_order_acquire);
+ bits_.fetch_add(-WRITER, std::memory_order_release);
+ }
+
+
+ // Attempt to acquire writer permission. Return false if we didn't get it.
+ bool try_lock() {
+ int32_t expect = 0;
+ return bits_.compare_exchange_strong(expect, WRITER,
+ std::memory_order_acq_rel);
+ }
+
+ // Try to get reader permission on the lock. This can fail if we
+ // find out someone is a writer or upgrader.
+ // Setting the UPGRADED bit would allow a writer-to-be to indicate
+ // its intention to write and block any new readers while waiting
+ // for existing readers to finish and release their read locks. This
+ // helps avoid starving writers (promoted from upgraders).
+ bool try_lock_shared() {
+ // fetch_add is considerably (100%) faster than compare_exchange,
+ // so here we are optimizing for the common (lock success) case.
+ int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
+ if (UNLIKELY(value & (WRITER|UPGRADED))) {
+ bits_.fetch_add(-READER, std::memory_order_release);
+ return false;
+ }
+ return true;
+ }
+
+ // try to unlock upgrade and write lock atomically
+ bool try_unlock_upgrade_and_lock() {
+ int32_t expect = UPGRADED;
+ return bits_.compare_exchange_strong(expect, WRITER,
+ std::memory_order_acq_rel);
+ }
+
+ // try to acquire an upgradable lock.
+ bool try_lock_upgrade() {
+ int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
+
+    // Note: on failure, we cannot flip the UPGRADED bit back,
+    // as in this case there is either another upgrade lock or a write lock.
+    // If it's a write lock, the bit will be cleared when that lock is
+    // released via unlock().
+ return ((value & (UPGRADED | WRITER)) == 0);
+ }
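+
+  // A minimal sketch of the upgrade protocol (hypothetical calling code;
+  // the UpgradedHolder/WriteHolder classes below wrap the same steps):
+  //
+  //   lock.lock_upgrade();            // blocks new readers; existing
+  //                                   // readers keep running
+  //   // ... decide whether a write is actually needed ...
+  //   lock.unlock_upgrade_and_lock(); // wait out readers, become writer
+  //   // ... write ...
+  //   lock.unlock();                  // clears WRITER and UPGRADED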
+
+ // mainly for debugging purposes.
+ int32_t bits() const { return bits_.load(std::memory_order_acquire); }
+
+ class ReadHolder;
+ class UpgradedHolder;
+ class WriteHolder;
+
+ class ReadHolder {
+ public:
+ explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
+ if (lock_) {
+ lock_->lock_shared();
+ }
+ }
+
+ explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
+ lock_->lock_shared();
+ }
+
+ ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
+ other.lock_ = nullptr;
+ }
+
+ // down-grade
+ explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
+ upgraded.lock_ = nullptr;
+ if (lock_) {
+ lock_->unlock_upgrade_and_lock_shared();
+ }
+ }
+
+ explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
+ writer.lock_ = nullptr;
+ if (lock_) {
+ lock_->unlock_and_lock_shared();
+ }
+ }
+
+ ReadHolder& operator=(ReadHolder&& other) {
+ using std::swap;
+ swap(lock_, other.lock_);
+ return *this;
+ }
+
+ ReadHolder(const ReadHolder& other) = delete;
+ ReadHolder& operator=(const ReadHolder& other) = delete;
+
+ ~ReadHolder() {
+ if (lock_) {
+ lock_->unlock_shared();
+ }
+ }
+
+ void reset(RWSpinLock* lock = nullptr) {
+ if (lock == lock_) {
+ return;
+ }
+ if (lock_) {
+ lock_->unlock_shared();
+ }
+ lock_ = lock;
+ if (lock_) {
+ lock_->lock_shared();
+ }
+ }
+
+ void swap(ReadHolder* other) {
+ std::swap(lock_, other->lock_);
+ }
+
+ private:
+ friend class UpgradedHolder;
+ friend class WriteHolder;
+ RWSpinLock* lock_;
+ };
+
+ class UpgradedHolder {
+ public:
+ explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) {
+ if (lock_) {
+ lock_->lock_upgrade();
+ }
+ }
+
+ explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
+ lock_->lock_upgrade();
+ }
+
+ explicit UpgradedHolder(WriteHolder&& writer) {
+ lock_ = writer.lock_;
+ writer.lock_ = nullptr;
+ if (lock_) {
+ lock_->unlock_and_lock_upgrade();
+ }
+ }
+
+ UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
+ other.lock_ = nullptr;
+ }
+
+ UpgradedHolder& operator =(UpgradedHolder&& other) {
+ using std::swap;
+ swap(lock_, other.lock_);
+ return *this;
+ }
+
+ UpgradedHolder(const UpgradedHolder& other) = delete;
+ UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
+
+ ~UpgradedHolder() {
+ if (lock_) {
+ lock_->unlock_upgrade();
+ }
+ }
+
+ void reset(RWSpinLock* lock = nullptr) {
+ if (lock == lock_) {
+ return;
+ }
+ if (lock_) {
+ lock_->unlock_upgrade();
+ }
+ lock_ = lock;
+ if (lock_) {
+ lock_->lock_upgrade();
+ }
+ }
+
+ void swap(UpgradedHolder* other) {
+ using std::swap;
+ swap(lock_, other->lock_);
+ }
+
+ private:
+ friend class WriteHolder;
+ friend class ReadHolder;
+ RWSpinLock* lock_;
+ };
+
+ class WriteHolder {
+ public:
+ explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
+ if (lock_) {
+ lock_->lock();
+ }
+ }
+
+ explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
+ lock_->lock();
+ }
+
+ // promoted from an upgrade lock holder
+ explicit WriteHolder(UpgradedHolder&& upgraded) {
+ lock_ = upgraded.lock_;
+ upgraded.lock_ = nullptr;
+ if (lock_) {
+ lock_->unlock_upgrade_and_lock();
+ }
+ }
+
+ WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
+ other.lock_ = nullptr;
+ }
+
+ WriteHolder& operator =(WriteHolder&& other) {
+ using std::swap;
+ swap(lock_, other.lock_);
+ return *this;
+ }
+
+ WriteHolder(const WriteHolder& other) = delete;
+ WriteHolder& operator =(const WriteHolder& other) = delete;
+
+ ~WriteHolder() {
+ if (lock_) {
+ lock_->unlock();
+ }
+ }
+
+ void reset(RWSpinLock* lock = nullptr) {
+ if (lock == lock_) {
+ return;
+ }
+ if (lock_) {
+ lock_->unlock();
+ }
+ lock_ = lock;
+ if (lock_) {
+ lock_->lock();
+ }
+ }
+
+ void swap(WriteHolder* other) {
+ using std::swap;
+ swap(lock_, other->lock_);
+ }
+
+ private:
+ friend class ReadHolder;
+ friend class UpgradedHolder;
+ RWSpinLock* lock_;
+ };
+
+ private:
+ std::atomic<int32_t> bits_;
+};
+
+
+#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
+// A more balanced Read-Write spin lock implemented using GCC intrinsics.
+
+namespace detail {
+template <size_t kBitWidth> struct RWTicketIntTrait {
+ static_assert(kBitWidth == 32 || kBitWidth == 64,
+ "bit width has to be either 32 or 64 ");
+};
+
+template <>
+struct RWTicketIntTrait<64> {
+ typedef uint64_t FullInt;
+ typedef uint32_t HalfInt;
+ typedef uint16_t QuarterInt;
+
+#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
+ static __m128i make128(const uint16_t v[4]) {
+ return _mm_set_epi16(0, 0, 0, 0,
+ short(v[3]), short(v[2]), short(v[1]), short(v[0]));
+ }
+ static inline __m128i fromInteger(uint64_t from) {
+ return _mm_cvtsi64_si128(int64_t(from));
+ }
+ static inline uint64_t toInteger(__m128i in) {
+ return uint64_t(_mm_cvtsi128_si64(in));
+ }
+ static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
+ return toInteger(_mm_add_epi16(in, kDelta));
+ }
+#endif
+};
+
+template <>
+struct RWTicketIntTrait<32> {
+ typedef uint32_t FullInt;
+ typedef uint16_t HalfInt;
+ typedef uint8_t QuarterInt;
+
+#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
+ static __m128i make128(const uint8_t v[4]) {
+ return _mm_set_epi8(
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ char(v[3]), char(v[2]), char(v[1]), char(v[0]));
+ }
+ static inline __m128i fromInteger(uint32_t from) {
+ return _mm_cvtsi32_si128(int32_t(from));
+ }
+ static inline uint32_t toInteger(__m128i in) {
+ return uint32_t(_mm_cvtsi128_si32(in));
+ }
+ static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
+ return toInteger(_mm_add_epi8(in, kDelta));
+ }
+#endif
+};
+} // namespace detail
+
+template <size_t kBitWidth, bool kFavorWriter = false>
+class RWTicketSpinLockT {
+ typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
+ typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
+ typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
+ typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
+ QuarterInt;
+
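+  /*
+   * An explanatory note on the ticket layout (three QuarterInt counters
+   * packed into one word; the top quarter is unused padding):
+   *
+   *   users - the next ticket number to issue; every acquisition (read
+   *           or write) takes a ticket by incrementing it
+   *   read  - the count of tickets admitted to read; a new reader can
+   *           only enter when read == users, i.e. no writer is waiting
+   *   write - the count of tickets whose holders have released; a writer
+   *           holding ticket t proceeds once write == t, i.e. all earlier
+   *           ticket holders have finished
+   */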
+ union RWTicket {
+ constexpr RWTicket() : whole(0) {}
+ FullInt whole;
+ HalfInt readWrite;
+ __extension__ struct {
+ QuarterInt write;
+ QuarterInt read;
+ QuarterInt users;
+ };
+ } ticket;
+
+ private: // Some x64-specific utilities for atomic access to ticket.
+ template <class T> static T load_acquire(T* addr) {
+    T t = *addr; // on x86, aligned loads already have acquire semantics
+    asm_volatile_memory(); // compiler barrier: keeps later ops after the load
+ return t;
+ }
+
+ template <class T>
+ static void store_release(T* addr, T v) {
+    asm_volatile_memory(); // compiler barrier: keeps earlier ops before the store
+    *addr = v; // on x86, aligned stores already have release semantics
+ }
+
+ public:
+
+ constexpr RWTicketSpinLockT() {}
+
+ RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
+ RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;
+
+ void lock() {
+ if (kFavorWriter) {
+ writeLockAggressive();
+ } else {
+ writeLockNice();
+ }
+ }
+
+ /*
+ * Both try_lock and try_lock_shared diverge in our implementation from the
+ * lock algorithm described in the link above.
+ *
+ * In the read case, it is undesirable that the readers could wait
+ * for another reader (before increasing ticket.read in the other
+ * implementation). Our approach gives up on
+   * first-come-first-served, but our benchmarks showed improved
+ * performance for both readers and writers under heavily contended
+ * cases, particularly when the number of threads exceeds the number
+ * of logical CPUs.
+ *
+ * We have writeLockAggressive() using the original implementation
+ * for a writer, which gives some advantage to the writer over the
+ * readers---for that path it is guaranteed that the writer will
+ * acquire the lock after all the existing readers exit.
+ */
+ bool try_lock() {
+ RWTicket t;
+ FullInt old = t.whole = load_acquire(&ticket.whole);
+ if (t.users != t.write) {
+ return false;
+ }
+ ++t.users;
+ return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
+ }
+
+ /*
+   * Call this if you want to prioritize the writer to avoid starvation.
+   * Unlike writeLockNice, it acquires the write lock as soon as the
+   * existing readers (those arriving before the writer) finish their
+   * turns.
+ */
+ void writeLockAggressive() {
+    // std::this_thread::yield() is needed here to avoid a pathology if the
+    // number of threads attempting concurrent writes is >= the number of
+    // real cores allocated to this process. This is less likely than the
+    // corresponding situation in lock_shared(), but we still want to
+    // avoid it.
+ uint_fast32_t count = 0;
+ QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
+ while (val != load_acquire(&ticket.write)) {
+ asm_volatile_pause();
+ if (UNLIKELY(++count > 1000)) {
+ std::this_thread::yield();
+ }
+ }
+ }
+
+ // Call this when the writer should be nicer to the readers.
+ void writeLockNice() {
+ // Here it doesn't cpu-relax the writer.
+ //
+ // This is because usually we have many more readers than the
+ // writers, so the writer has less chance to get the lock when
+ // there are a lot of competing readers. The aggressive spinning
+ // can help to avoid starving writers.
+ //
+ // We don't worry about std::this_thread::yield() here because the caller
+ // has already explicitly abandoned fairness.
+ while (!try_lock()) {}
+ }
+
+ // Atomically unlock the write-lock from writer and acquire the read-lock.
+ void unlock_and_lock_shared() {
+    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
+    (void)val; // silence unused-variable warnings
+ }
+
+ // Release writer permission on the lock.
+ void unlock() {
+ RWTicket t;
+ t.whole = load_acquire(&ticket.whole);
+
+#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
+ FullInt old = t.whole;
+ // SSE2 can reduce the lock and unlock overhead by 10%
+ static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 }; // write/read/user
+ static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
+ __m128i m = IntTraitType::fromInteger(old);
+ t.whole = IntTraitType::addParallel(m, kDelta);
+#else
+ ++t.read;
+ ++t.write;
+#endif
+ store_release(&ticket.readWrite, t.readWrite);
+ }
+
+ void lock_shared() {
+ // std::this_thread::yield() is important here because we can't grab the
+ // shared lock if there is a pending writeLockAggressive, so we
+ // need to let threads that already have a shared lock complete
+ uint_fast32_t count = 0;
+ while (!LIKELY(try_lock_shared())) {
+ asm_volatile_pause();
+ if (UNLIKELY((++count & 1023) == 0)) {
+ std::this_thread::yield();
+ }
+ }
+ }
+
+ bool try_lock_shared() {
+ RWTicket t, old;
+ old.whole = t.whole = load_acquire(&ticket.whole);
+ old.users = old.read;
+#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
+ // SSE2 may reduce the total lock and unlock overhead by 10%
+ static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 }; // write/read/user
+ static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
+ __m128i m = IntTraitType::fromInteger(old.whole);
+ t.whole = IntTraitType::addParallel(m, kDelta);
+#else
+ ++t.read;
+ ++t.users;
+#endif
+ return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
+ }
+
+ void unlock_shared() {
+ __sync_fetch_and_add(&ticket.write, 1);
+ }
+
+ class WriteHolder;
+
+ typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
+ class ReadHolder {
+ public:
+ ReadHolder(ReadHolder const&) = delete;
+ ReadHolder& operator=(ReadHolder const&) = delete;
+
+ explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
+ if (lock_) {
+ lock_->lock_shared();
+ }
+ }
+
+ explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
+ if (lock_) {
+ lock_->lock_shared();
+ }
+ }
+
+ // atomically unlock the write-lock from writer and acquire the read-lock
+ explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
+ std::swap(this->lock_, writer->lock_);
+ if (lock_) {
+ lock_->unlock_and_lock_shared();
+ }
+ }
+
+ ~ReadHolder() {
+ if (lock_) {
+ lock_->unlock_shared();
+ }
+ }
+
+    void reset(RWSpinLock *lock = nullptr) {
+      if (lock == lock_) {
+        return;
+      }
+      if (lock_) {
+        lock_->unlock_shared();
+      }
+      lock_ = lock;
+      if (lock_) {
+        lock_->lock_shared();
+      }
+    }
+
+ void swap(ReadHolder *other) {
+ std::swap(this->lock_, other->lock_);
+ }
+
+ private:
+ RWSpinLock *lock_;
+ };
+
+ class WriteHolder {
+ public:
+ WriteHolder(WriteHolder const&) = delete;
+ WriteHolder& operator=(WriteHolder const&) = delete;
+
+ explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
+ if (lock_) {
+ lock_->lock();
+ }
+ }
+ explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
+ if (lock_) {
+ lock_->lock();
+ }
+ }
+
+ ~WriteHolder() {
+ if (lock_) {
+ lock_->unlock();
+ }
+ }
+
+ void reset(RWSpinLock *lock = nullptr) {
+ if (lock == lock_) {
+ return;
+ }
+ if (lock_) {
+ lock_->unlock();
+ }
+ lock_ = lock;
+ if (lock_) {
+ lock_->lock();
+ }
+ }
+
+ void swap(WriteHolder *other) {
+ std::swap(this->lock_, other->lock_);
+ }
+
+ private:
+ friend class ReadHolder;
+ RWSpinLock *lock_;
+ };
+};
+
+typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
+typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
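+
+// The aliases above use the default kFavorWriter = false. A strict
+// writer-priority variant can be instantiated directly (a sketch; note
+// the reentrancy caveat in the header comment at the top of this file):
+//
+//   typedef folly::RWTicketSpinLockT<32, true> WriterPriorityLock;
+//
+//   WriterPriorityLock lock;
+//   lock.lock();   // uses writeLockAggressive()
+//   lock.unlock();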
+
+#endif // RW_SPINLOCK_USE_X86_INTRINSIC_
+
+} // namespace folly
+
+#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
+#undef RW_SPINLOCK_USE_X86_INTRINSIC_
+#endif