/*
 * Copyright 2011-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * N.B. You most likely do _not_ want to use RWSpinLock or any other
 * kind of spinlock. Use SharedMutex instead.
 *
 * In short, spinlocks in preemptive multi-tasking operating systems
 * have serious problems and fast mutexes like SharedMutex are almost
 * certainly the better choice, because letting the OS scheduler put a
 * thread to sleep is better for system responsiveness and throughput
 * than wasting a timeslice repeatedly querying a lock held by a
 * thread that's blocked, and you can't prevent userspace
 * threads from being descheduled while they hold the lock.
 *
 * Spinlocks in an operating system kernel make much more sense than
 * they do in userspace.
 *
 * -------------------------------------------------------------------
 *
 * Two Read-Write spin lock implementations.
 *
 * Ref: http://locklessinc.com/articles/locks
 *
 * Both locks here are faster than pthread_rwlock and have very low
 * overhead (usually 20-30ns). They don't use any system mutexes and
 * are very compact (4/8 bytes), so are suitable for per-instance
 * based locking, particularly when contention is not expected.
 *
 * For a spinlock, RWSpinLock is a reasonable choice. (See the note
 * above for why a spinlock is frequently a bad idea in general.)
 * RWSpinLock has minimal overhead, and comparable contention
 * performance when the number of competing threads is less than or
 * equal to the number of logical CPUs. Even as the number of
 * threads gets larger, RWSpinLock can still be very competitive in
 * READ, although it is slower on WRITE, and also inherently unfair
 * to writers.
 *
 * RWTicketSpinLock shows more balanced READ/WRITE performance. If
 * your application really needs a lot more threads, and a
 * higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 * Caveats:
 *
 *   RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *   based systems.
 *
 *   RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
 *   writer priority) is NOT reentrant, even for lock_shared().
 *
 *   The lock will not grant any new shared (read) accesses while a thread
 *   attempting to acquire the lock in write mode is blocked. (That is,
 *   if the lock is held in shared mode by N threads, and a thread attempts
 *   to acquire it in write mode, no one else can acquire it in shared mode
 *   until these N threads release the lock and then the blocked thread
 *   acquires and releases the exclusive lock.) This also applies to
 *   attempts to reacquire the lock in shared mode by threads that already
 *   hold it in shared mode, making the lock non-reentrant.
 *
 *   RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */

#pragma once
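
/*
 * Usage sketch (editor's illustration, not part of the original header):
 * the RAII holder types declared inside RWSpinLock below are the intended
 * way to use the lock. The map and member names here are hypothetical.
 *
 *   folly::RWSpinLock lock_;
 *   std::map<int, int> data_;
 *
 *   int read(int k) {
 *     folly::RWSpinLock::ReadHolder guard(&lock_); // shared access
 *     auto it = data_.find(k);
 *     return it == data_.end() ? -1 : it->second;
 *   }
 *
 *   void write(int k, int v) {
 *     folly::RWSpinLock::WriteHolder guard(&lock_); // exclusive access
 *     data_[k] = v;
 *   }
 */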

/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
========================================================================
------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                    Iters   Total t    t/iter iter/sec
-------------------------------------------------------------------------------
*      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
+30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
+85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
+96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
+85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
+96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
+ 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
+ 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M

------------------------------------------------------------------------------
2. Contention Benchmark      90% read  10% write
Benchmark                    hits       average    min       max        sigma
------------------------------------------------------------------------------
---------- 8  threads ------------
RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us

---------- 16 threads ------------
RWSpinLock       Write       124236     237ns      78ns      261us      801ns
RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us

---------- 50 threads ------------
RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
RWTicketSpinLock Read        744133     386ns      78ns      11ms       31.4us
pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
*/
#include <folly/Portability.h>
#include <folly/portability/Asm.h>

#if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#elif defined(_MSC_VER) && defined(FOLLY_X64)
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#elif FOLLY_AARCH64
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

// iOS doesn't define _mm_cvtsi64_si128 and friends
#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE && FOLLY_X64
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#else
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#endif

#include <atomic>
#include <thread>
#include <utility>

#include <folly/Likely.h>

namespace folly {

/*
 * A simple, small (4-bytes), but unfair rwlock. Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance. If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 * interfaces.
 */
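
/*
 * Sketch of the upgrade protocol (editor's illustration; mutationNeeded()
 * and mutate() are hypothetical): an upgrade lock coexists with existing
 * readers but blocks new ones, and can then be promoted atomically.
 *
 *   folly::RWSpinLock lock;
 *
 *   lock.lock_upgrade();               // shared with current readers
 *   if (mutationNeeded()) {
 *     lock.unlock_upgrade_and_lock();  // promote once readers drain
 *     mutate();                        // exclusive section
 *     lock.unlock();                   // clears WRITER and UPGRADED bits
 *   } else {
 *     lock.unlock_upgrade();
 *   }
 */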
class RWSpinLock {
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
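
  // Editor's note on the layout: bit 0 is the writer bit, bit 1 the upgrade
  // bit, and bits 2..31 count readers (each reader adds READER == 4). For
  // example, bits() == 13 decodes as three readers plus a held write lock --
  // a transient state that try_lock_shared() backs out of.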

 public:
  constexpr RWSpinLock() : bits_(0) {}

  RWSpinLock(RWSpinLock const&) = delete;
  RWSpinLock& operator=(RWSpinLock const&) = delete;

  // Lockable Concept
  void lock() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }

  // SharedLockable Concept
  void lock_shared() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_shared() {
    bits_.fetch_add(-READER, std::memory_order_release);
  }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }

  // UpgradeLockable Concept
  void lock_upgrade() {
    uint_fast32_t count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    uint_fast32_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }

  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
    // at the same time when other threads are trying to do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }

  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER | UPGRADED))) {
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }

  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's
    // done with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }

  class ReadHolder;
  class UpgradedHolder;
  class WriteHolder;

  class ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }

    ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // down-grade from an upgrade lock holder
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock_shared();
    }

    // down-grade from a write lock holder
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_shared();
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() {
      if (lock_) lock_->unlock_shared();
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };

  class UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) lock_->lock_upgrade();
    }

    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    // down-grade from a write lock holder
    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_upgrade();
    }

    UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    UpgradedHolder& operator=(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator=(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() {
      if (lock_) lock_->unlock_upgrade();
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_upgrade();
      lock_ = lock;
      if (lock_) lock_->lock_upgrade();
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };

  class WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) lock_->lock();
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock();
    }

    WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator=(const WriteHolder& other) = delete;

    ~WriteHolder() {
      if (lock_) lock_->unlock();
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };

 private:
  std::atomic<int32_t> bits_;
};
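
/*
 * The holder move constructors above chain the atomic transitions; a sketch
 * of the intended flow (editor's illustration, assuming some RWSpinLock
 * member `lock_`):
 *
 *   RWSpinLock::UpgradedHolder u(&lock_);    // shared with readers
 *   RWSpinLock::WriteHolder w(std::move(u)); // unlock_upgrade_and_lock()
 *   // ... exclusive writes ...
 *   RWSpinLock::ReadHolder r(std::move(w));  // unlock_and_lock_shared()
 */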

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented based on GCC intrinsics.

namespace detail {
template <size_t kBitWidth>
struct RWTicketIntTrait {
  static_assert(
      kBitWidth == 32 || kBitWidth == 64,
      "bit width has to be either 32 or 64");
};

template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(
        0, 0, 0, 0, short(v[3]), short(v[2]), short(v[1]), short(v[0]));
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(int64_t(from));
  }
  static inline uint64_t toInteger(__m128i in) {
    return uint64_t(_mm_cvtsi128_si64(in));
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};
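
// Editor's note: the SSE helpers above (and their 32-bit counterparts below)
// let unlock() bump two adjacent QuarterInt counters in one lane-wise vector
// add, mirroring the scalar "++t.read; ++t.write;" fallback without a
// cross-counter carry.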

template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint8_t v[4]) {
    return _mm_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        char(v[3]), char(v[2]), char(v[1]), char(v[0]));
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(int32_t(from));
  }
  static inline uint32_t toInteger(__m128i in) {
    return uint32_t(_mm_cvtsi128_si32(in));
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
} // namespace detail

template <size_t kBitWidth, bool kFavorWriter = false>
class RWTicketSpinLockT {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt QuarterInt;

  union RWTicket {
    constexpr RWTicket() : whole(0) {}
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;

 private: // Some x64-specific utilities for atomic access to ticket.
  template <class T>
  static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm_volatile_memory();
    return t;
  }

  template <class T>
  static void store_release(T* addr, T v) {
    asm_volatile_memory();
    *addr = v; // release barrier
  }
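
  // Editor's note: asm_volatile_memory() is only a compiler barrier; the
  // acquire/release semantics claimed above additionally rely on hardware
  // that strongly orders plain loads and stores, as x86's TSO model does,
  // which is why this class is gated on RW_SPINLOCK_USE_X86_INTRINSIC_.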

 public:
  constexpr RWTicketSpinLockT() {}

  RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
  RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }

  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation). Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of logical CPUs.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit.
   */
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) {
      return false;
    }
    t.users++;
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }
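
  // Worked example (editor's note): when the lock is idle,
  // write == read == users. try_lock() above grabs ticket `users` and
  // succeeds only if `write` has already caught up to that ticket;
  // unlock() advances both `write` and `read`, admitting the next ticket
  // holder. try_lock_shared() below advances `read` and `users` together,
  // so readers overlap with one another but enter only while no writer
  // holds an unserved ticket (users == read).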

  /*
   * Call this if you want to prioritize writer to avoid starvation.
   * Unlike writeLockNice, immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * reads.
   */
  void writeLockAggressive() {
    // std::this_thread::yield() is needed here to avoid a pathology if the
    // number of threads attempting concurrent writes is >= the number of
    // real cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it.
    uint_fast32_t count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm_volatile_pause();
      if (UNLIKELY(++count > 1000)) {
        std::this_thread::yield();
      }
    }
  }

  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here it doesn't cpu-relax the writer.
    //
    // This is because usually we have many more readers than the
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers. The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about std::this_thread::yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {}
  }

  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
    (void)val;
  }

  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    FullInt old = t.whole;
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {1, 1, 0, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }

  void lock_shared() {
    // std::this_thread::yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we need to
    // let threads that already have a shared lock complete first.
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm_volatile_pause();
      if (UNLIKELY((++count & 1023) == 0)) {
        std::this_thread::yield();
      }
    }
  }

  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    old.users = old.read;
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {0, 1, 1, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }

  void unlock_shared() {
    __sync_fetch_and_add(&ticket.write, 1);
  }

  class WriteHolder;

  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

  class ReadHolder {
   public:
    ReadHolder(ReadHolder const&) = delete;
    ReadHolder& operator=(ReadHolder const&) = delete;

    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder* writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) lock_->unlock_and_lock_shared();
    }

    ~ReadHolder() {
      if (lock_) lock_->unlock_shared();
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    RWSpinLock* lock_;
  };

  class WriteHolder {
   public:
    WriteHolder(WriteHolder const&) = delete;
    WriteHolder& operator=(WriteHolder const&) = delete;

    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) lock_->lock();
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    ~WriteHolder() {
      if (lock_) lock_->unlock();
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
};

typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
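
/*
 * Usage sketch (editor's illustration; the aliases are defined just above):
 *
 *   folly::RWTicketSpinLock32 lock;
 *
 *   lock.lock_shared();   // reader side
 *   // ... read ...
 *   lock.unlock_shared();
 *
 *   lock.lock();          // writer side: writeLockNice() by default,
 *   // ... write ...      // writeLockAggressive() if kFavorWriter
 *   lock.unlock();
 */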

#endif // RW_SPINLOCK_USE_X86_INTRINSIC_

} // namespace folly

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif