/*
 * Copyright 2014 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Two Read-Write spin lock implementations.
 *
 * Ref: http://locklessinc.com/articles/locks
 *
 * Both locks here are faster than pthread_rwlock and have very low
 * overhead (usually 20-30ns). They don't use any system mutexes and
 * are very compact (4/8 bytes), so are suitable for per-instance
 * based locking, particularly when contention is not expected.
 *
 * In most cases, RWSpinLock is a reasonable choice. It has minimal
 * overhead, and comparable contention performance when the number of
 * competing threads is less than or equal to the number of logical
 * CPUs. Even as the number of threads gets larger, RWSpinLock can
 * still be very competitive in READ, although it is slower on WRITE,
 * and also inherently unfair to writers.
 *
 * RWTicketSpinLock shows more balanced READ/WRITE performance. If
 * your application really needs a lot more threads, and a
 * higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 * Caveats:
 *
 *   RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *   based systems.
 *
 *   RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *   readers and writers.
 *
 *   RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */
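/*
 * Minimal usage sketch (hypothetical caller code; the holder classes are
 * defined inside RWSpinLock below):
 *
 *   folly::RWSpinLock lock;
 *   int sharedValue = 0;  // guarded by `lock`
 *
 *   void reader() {
 *     folly::RWSpinLock::ReadHolder guard(&lock);  // lock_shared()
 *     int v = sharedValue;                         // concurrent reads are OK
 *   }                                              // unlock_shared()
 *
 *   void writer() {
 *     folly::RWSpinLock::WriteHolder guard(&lock); // lock()
 *     ++sharedValue;                               // exclusive access
 *   }                                              // unlock()
 */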
#ifndef FOLLY_RWSPINLOCK_H_
#define FOLLY_RWSPINLOCK_H_
/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                     Iters  Total t   t/iter  iter/sec
-------------------------------------------------------------------------------
*      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
+30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
+85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
+96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
+85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
+96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
+ 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
+ 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M

------------------------------------------------------------------------------
2. Contention Benchmark: 90% read, 10% write
Benchmark                  hits      average   min     max      sigma
------------------------------------------------------------------------------
---------- 8 threads ------------
RWSpinLock       Write     142666    220ns     78ns    40.8us   269ns
RWSpinLock       Read      1282297   222ns     80ns    37.7us   248ns
RWTicketSpinLock Write     85692     209ns     71ns    17.9us   252ns
RWTicketSpinLock Read      769571    215ns     78ns    33.4us   251ns
pthread_rwlock_t Write     84248     2.48us    99ns    269us    8.19us
pthread_rwlock_t Read      761646    933ns     101ns   374us    3.25us

---------- 16 threads ------------
RWSpinLock       Write     124236    237ns     78ns    261us    801ns
RWSpinLock       Read      1115807   236ns     78ns    2.27ms   2.17us
RWTicketSpinLock Write     81781     231ns     71ns    31.4us   351ns
RWTicketSpinLock Read      734518    238ns     78ns    73.6us   379ns
pthread_rwlock_t Write     83363     7.12us    99ns    785us    28.1us
pthread_rwlock_t Read      754978    2.18us    101ns   1.02ms   14.3us

---------- 50 threads ------------
RWSpinLock       Write     131142    1.37us    82ns    7.53ms   68.2us
RWSpinLock       Read      1181240   262ns     78ns    6.62ms   12.7us
RWTicketSpinLock Write     83045     397ns     73ns    7.01ms   31.5us
RWTicketSpinLock Read      744133    386ns     78ns    11ms     31.4us
pthread_rwlock_t Write     80849     112us     103ns   4.52ms   263us
pthread_rwlock_t Read      728698    24us      101ns   7.28ms   194us
*/
#include <folly/Portability.h>

#if defined(__GNUC__) && \
  (defined(__i386) || FOLLY_X64 || \
   defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

// iOS doesn't define _mm_cvtsi64_si128 and friends
#if defined(__SSE2__) && !TARGET_OS_IPHONE
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#else
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#endif
#include <algorithm>
#include <atomic>
#include <boost/noncopyable.hpp>

#include <sched.h>
#include <glog/logging.h>

#include <folly/Likely.h>

namespace folly {
/*
 * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance.  If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 * interfaces.
 */
class RWSpinLock : boost::noncopyable {
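  // Reading aid (inferred from the constants below): bit 0 is the WRITER
  // flag, bit 1 the UPGRADED flag, and bits 2..31 hold the reader count,
  // so each reader adds READER (4) to bits_.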
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

 public:
  RWSpinLock() : bits_(0) {}

  // Lockable Concept
  void lock() {
    int count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) sched_yield();
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }
  // SharedLockable Concept
  void lock_shared() {
    int count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) sched_yield();
    }
  }

  void unlock_shared() {
    bits_.fetch_add(-READER, std::memory_order_release);
  }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }
  // UpgradeLockable Concept
  void lock_upgrade() {
    int count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) sched_yield();
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    int64_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) sched_yield();
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }
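  // A sketch of the intended upgrade workflow (hypothetical caller code):
  //
  //   RWSpinLock lock;
  //   lock.lock_upgrade();             // coexists with readers, blocks new ones
  //   /* inspect shared state, decide whether to modify it */
  //   lock.unlock_upgrade_and_lock();  // wait for existing readers to drain
  //   /* modify shared state */
  //   lock.unlock();                   // clears both WRITER and UPGRADED bits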
  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
    // at the same time when other threads are trying to do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }
  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(expect, WRITER,
      std::memory_order_acq_rel);
  }
  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER|UPGRADED))) {
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }
  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(expect, WRITER,
        std::memory_order_acq_rel);
  }
  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's
    // done with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }
  class ReadHolder;
  class UpgradedHolder;
  class WriteHolder;

  class ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }
    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }
    ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // downgrade from an upgrade lock
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock_shared();
    }

    // downgrade from a write lock
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_shared();
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() { if (lock_) lock_->unlock_shared(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };
  class UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_upgrade();
    }
    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    // downgrade from a write lock
    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_upgrade();
    }

    UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }
    UpgradedHolder& operator =(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator =(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_upgrade();
      lock_ = lock;
      if (lock_) lock_->lock_upgrade();
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
  class WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock();
    }
    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock();
    }

    WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }
    WriteHolder& operator =(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator =(const WriteHolder& other) = delete;

    ~WriteHolder() { if (lock_) lock_->unlock(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };
  // Synchronized<> adaptors
  friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
  friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
  friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
  friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }

 private:
  std::atomic<int32_t> bits_;
};
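// The friend adaptors above let RWSpinLock be used as the mutex type of
// folly::Synchronized. A hypothetical sketch (requires folly/Synchronized.h):
//
//   folly::Synchronized<std::map<int, int>, folly::RWSpinLock> data;
//   data->emplace(1, 2);  // write-locks around the map operation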
#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented using GCC intrinsics.

namespace detail {
template <size_t kBitWidth> struct RWTicketIntTrait {
  static_assert(kBitWidth == 32 || kBitWidth == 64,
      "bit width has to be either 32 or 64");
};
template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(from);
  }
  static inline uint64_t toInteger(__m128i in) {
    return _mm_cvtsi128_si64(in);
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};
template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint8_t v[4]) {
    return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, v[3], v[2], v[1], v[0]);
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(from);
  }
  static inline uint32_t toInteger(__m128i in) {
    return _mm_cvtsi128_si32(in);
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
}  // namespace detail
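// Reading aid: make128() spreads the four ticket quarters into separate SSE
// lanes so that addParallel() can bump two ticket fields (e.g. write and
// read) with a single vector add, without a carry from one field spilling
// into the next.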
template<size_t kBitWidth, bool kFavorWriter=false>
class RWTicketSpinLockT : boost::noncopyable {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
    QuarterInt;

  // One FullInt viewed as three QuarterInt ticket fields: write, read, users
  // (see the protocol sketch after try_lock() below).
  union RWTicket {
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;
 private: // Some x64-specific utilities for atomic access to ticket.
  // These are compiler barriers only; on x86's strong memory model, plain
  // aligned loads and stores already have acquire/release semantics.
  template<class T> static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm volatile("" : : : "memory");
    return t;
  }

  template<class T>
  static void store_release(T* addr, T v) {
    asm volatile("" : : : "memory");
    *addr = v; // release barrier
  }
 public:
  RWTicketSpinLockT() {
    store_release(&ticket.whole, FullInt(0));
  }

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }
  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation).  Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of logical CPU cores.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit.
   */
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) return false;
    ++t.users;
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }
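  // How the ticket fields move (a sketch of the protocol; all fields
  // start at 0):
  //   write lock:   ticket = users++; spin until write == ticket.
  //   write unlock: ++read, ++write       (hand off to the next ticket).
  //   read lock:    succeeds only when read == users; then ++read, ++users.
  //   read unlock:  ++write.
  // E.g. a writer then a reader: (write,read,users) goes (0,0,0) ->
  // (0,0,1) write lock -> (1,1,1) unlock -> (1,2,2) reader in ->
  // (2,2,2) reader out.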
  /*
   * Call this if you want to prioritize the writer to avoid starvation.
   * Unlike writeLockNice, this immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * reads.
   */
  void writeLockAggressive() {
    // sched_yield() is needed here to avoid a pathology if the number
    // of threads attempting concurrent writes is >= the number of real
    // cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it.
    int count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm volatile("pause");
      if (UNLIKELY(++count > 1000)) sched_yield();
    }
  }
  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here it doesn't cpu-relax the writer.
    //
    // This is because usually we have many more readers than the
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers.  The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about sched_yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {}
  }
  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
    (void)val; // only the atomic increment matters here
  }
  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);
    FullInt old = t.whole;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };  // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }
  void lock_shared() {
    // sched_yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we
    // need to let threads that already have a shared lock complete.
    int count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm volatile("pause");
      if (UNLIKELY((++count & 1023) == 0)) sched_yield();
    }
  }
  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    old.users = old.read;
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };  // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }
  void unlock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
    (void)val; // only the atomic increment matters here
  }
 public:
  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

  class WriteHolder;

  class ReadHolder : boost::noncopyable {
   public:
    explicit ReadHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }

    explicit ReadHolder(RWSpinLock &lock) : lock_(&lock) {
      if (lock_) lock_->lock_shared();
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) {
        lock_->unlock_and_lock_shared();
      }
    }

    ~ReadHolder() {
      if (lock_) lock_->unlock_shared();
    }

    void reset(RWSpinLock *lock = nullptr) {
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder *other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    RWSpinLock *lock_;
  };
  class WriteHolder : boost::noncopyable {
   public:
    explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock();
    }
    explicit WriteHolder(RWSpinLock &lock) : lock_(&lock) {
      if (lock_) lock_->lock();
    }

    ~WriteHolder() {
      if (lock_) lock_->unlock();
    }

    void reset(RWSpinLock *lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder *other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    RWSpinLock *lock_;
  };
  // Synchronized<> adaptors.
  friend void acquireRead(RWTicketSpinLockT& mutex) {
    mutex.lock_shared();
  }
  friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
    mutex.lock();
  }
  // Ticket locks have no timed locking; this overload simply blocks,
  // ignores the timeout, and reports success.
  friend bool acquireReadWrite(RWTicketSpinLockT& mutex,
                               unsigned int milliseconds) {
    mutex.lock();
    return true;
  }
  friend void releaseRead(RWTicketSpinLockT& mutex) {
    mutex.unlock_shared();
  }
  friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
    mutex.unlock();
  }
};
typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
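/*
 * Minimal usage sketch (hypothetical caller code). The second template
 * parameter selects the writer-favoring variant:
 *
 *   folly::RWTicketSpinLock64 lock;            // lock() -> writeLockNice()
 *   folly::RWTicketSpinLockT<64, true> wLock;  // lock() -> writeLockAggressive()
 *
 *   {
 *     folly::RWTicketSpinLock64::ReadHolder guard(&lock);
 *     // ... shared reads ...
 *   }
 *   {
 *     folly::RWTicketSpinLock64::WriteHolder guard(&lock);
 *     // ... exclusive writes ...
 *   }
 */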
#endif  // RW_SPINLOCK_USE_X86_INTRINSIC_

}  // namespace folly

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

#endif  // FOLLY_RWSPINLOCK_H_