/*
 * Copyright 2017 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * N.B. You most likely do _not_ want to use RWSpinLock or any other
 * kind of spinlock. Use SharedMutex instead.
 *
 * In short, spinlocks in preemptive multi-tasking operating systems
 * have serious problems and fast mutexes like SharedMutex are almost
 * certainly the better choice, because letting the OS scheduler put a
 * thread to sleep is better for system responsiveness and throughput
 * than wasting a timeslice repeatedly querying a lock held by a
 * thread that's blocked, and you can't prevent userspace programs
 * from being preempted.
 *
 * Spinlocks in an operating system kernel make much more sense than
 * they do in userspace.
 *
 * -------------------------------------------------------------------
 *
 * Two Read-Write spin lock implementations.
 *
 *  Ref: http://locklessinc.com/articles/locks
 *
 *  Both locks here are faster than pthread_rwlock and have very low
 *  overhead (usually 20-30ns). They don't use any system mutexes and
 *  are very compact (4/8 bytes), so are suitable for per-instance
 *  based locking, particularly when contention is not expected.
 *
 *  For a spinlock, RWSpinLock is a reasonable choice. (See the note
 *  above for why a spin lock is frequently a bad idea generally.)
 *  RWSpinLock has minimal overhead, and comparable contention
 *  performance when the number of competing threads is less than or
 *  equal to the number of logical CPUs. Even as the number of
 *  threads gets larger, RWSpinLock can still be very competitive in
 *  READ, although it is slower on WRITE, and also inherently unfair
 *  to writers.
 *
 *  RWTicketSpinLock shows more balanced READ/WRITE performance. If
 *  your application really needs a lot more threads, and a
 *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 *  Caveats:
 *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *    based systems.
 *
 *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *    readers and writers.
 *
 *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *    readers and writers.
 *
 *    RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
 *    writer priority) is NOT reentrant, even for lock_shared().
 *
 *    The lock will not grant any new shared (read) accesses while a thread
 *    attempting to acquire the lock in write mode is blocked. (That is,
 *    if the lock is held in shared mode by N threads, and a thread attempts
 *    to acquire it in write mode, no one else can acquire it in shared mode
 *    until these N threads release the lock and then the blocked thread
 *    acquires and releases the exclusive lock.) This also applies to
 *    attempts to reacquire the lock in shared mode by threads that already
 *    hold it in shared mode, making the lock non-reentrant.
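 *
 *    For example, with kFavorWriter = true the following interleaving
 *    deadlocks (a sketch only; the thread labels are illustrative):
 *
 *      folly::RWTicketSpinLockT<32, true> lock;
 *      // Thread A:
 *      lock.lock_shared();  // A now holds a read lock
 *      // Thread B:
 *      lock.lock();         // B waits for A, and blocks all new readers
 *      // Thread A:
 *      lock.lock_shared();  // deadlock: A waits on B, B waits on A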
 *
 *    RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */
#pragma once

/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                    Iters   Total t   t/iter  iter/sec
-------------------------------------------------------------------------------
*      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
+30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
+85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
+96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
+85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
+96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
+ 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
+ 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M

------------------------------------------------------------------------------
2. Contention Benchmark (90% read, 10% write)
Benchmark                hits     average  min    max     sigma
------------------------------------------------------------------------------
---------- 8 threads ------------
RWSpinLock       Write   142666   220ns    78ns   40.8us  269ns
RWSpinLock       Read    1282297  222ns    80ns   37.7us  248ns
RWTicketSpinLock Write   85692    209ns    71ns   17.9us  252ns
RWTicketSpinLock Read    769571   215ns    78ns   33.4us  251ns
pthread_rwlock_t Write   84248    2.48us   99ns   269us   8.19us
pthread_rwlock_t Read    761646   933ns    101ns  374us   3.25us

---------- 16 threads ------------
RWSpinLock       Write   124236   237ns    78ns   261us   801ns
RWSpinLock       Read    1115807  236ns    78ns   2.27ms  2.17us
RWTicketSpinLock Write   81781    231ns    71ns   31.4us  351ns
RWTicketSpinLock Read    734518   238ns    78ns   73.6us  379ns
pthread_rwlock_t Write   83363    7.12us   99ns   785us   28.1us
pthread_rwlock_t Read    754978   2.18us   101ns  1.02ms  14.3us

---------- 50 threads ------------
RWSpinLock       Write   131142   1.37us   82ns   7.53ms  68.2us
RWSpinLock       Read    1181240  262ns    78ns   6.62ms  12.7us
RWTicketSpinLock Write   83045    397ns    73ns   7.01ms  31.5us
RWTicketSpinLock Read    744133   386ns    78ns   11ms    31.4us
pthread_rwlock_t Write   80849    112us    103ns  4.52ms  263us
pthread_rwlock_t Read    728698   24us     101ns  7.28ms  194us
*/
#include <folly/Portability.h>
#include <folly/portability/Asm.h>

#if defined(__GNUC__) && (defined(__i386) || FOLLY_X64 || defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#elif defined(_MSC_VER) && defined(FOLLY_X64)
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

// iOS doesn't define _mm_cvtsi64_si128 and friends
#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE && FOLLY_X64
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#else
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#endif

#include <algorithm>
#include <atomic>
#include <thread>

#include <glog/logging.h>

#include <folly/Likely.h>

namespace folly {
/*
 * A simple, small (4 bytes), but unfair rwlock. Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance. If
 * the lock can be that highly contended, this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 * interfaces.
 */
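// Example usage (a sketch only; `gLock`, `gData`, and the functions are
// hypothetical names, not part of this header):
//
//   folly::RWSpinLock gLock;
//   int gData = 0;
//
//   int readData() {
//     gLock.lock_shared();  // many readers may hold this at once
//     int v = gData;
//     gLock.unlock_shared();
//     return v;
//   }
//
//   void writeData(int v) {
//     gLock.lock();         // exclusive; spins until readers drain
//     gData = v;
//     gLock.unlock();
//   }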
class RWSpinLock {
  // Bit layout: bit 0 is WRITER, bit 1 is UPGRADED, and the reader count
  // lives in the remaining upper 30 bits (each reader adds READER == 4).
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

 public:
  constexpr RWSpinLock() : bits_(0) {}

  RWSpinLock(RWSpinLock const&) = delete;
  RWSpinLock& operator=(RWSpinLock const&) = delete;

  // Lockable Concept
  void lock() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }

  // SharedLockable Concept
  void lock_shared() {
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_shared() {
    bits_.fetch_add(-READER, std::memory_order_release);
  }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }

  // UpgradeLockable Concept
  void lock_upgrade() {
    uint_fast32_t count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    uint_fast32_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }

  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
    // at the same time when other threads are trying to do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }
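
  // Example of the upgrade protocol (a sketch; `maybeRebuild` and
  // `needsRebuild` are hypothetical names): hold the upgrade lock to block
  // new readers, then promote to a full writer only if a write is needed.
  //
  //   void maybeRebuild(folly::RWSpinLock& l) {
  //     l.lock_upgrade();               // existing readers keep running
  //     if (needsRebuild()) {
  //       l.unlock_upgrade_and_lock();  // promote once readers drain
  //       // ... mutate shared state exclusively ...
  //       l.unlock();
  //     } else {
  //       l.unlock_upgrade();
  //     }
  //   }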

  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER | UPGRADED))) {
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }

  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's done
    // with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }

  class ReadHolder;
  class UpgradedHolder;
  class WriteHolder;

  class ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }

    ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // down-grade from an upgrade lock
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock_shared();
    }

    // down-grade from a write lock
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_shared();
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() {
      if (lock_) lock_->unlock_shared();
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };

  class UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) lock_->lock_upgrade();
    }

    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    // down-grade from a write lock
    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_upgrade();
    }

    UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    UpgradedHolder& operator=(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator=(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() {
      if (lock_) lock_->unlock_upgrade();
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_upgrade();
      lock_ = lock;
      if (lock_) lock_->lock_upgrade();
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };

  class WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) lock_->lock();
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock();
    }

    WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator=(const WriteHolder& other) = delete;

    ~WriteHolder() {
      if (lock_) lock_->unlock();
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };

 private:
  std::atomic<int32_t> bits_;
};
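
// Example RAII holder usage (a sketch; `lock_`, `map_`, and the functions
// are hypothetical names):
//
//   folly::RWSpinLock lock_;
//   std::map<int, int> map_;
//
//   int find(int key) {
//     folly::RWSpinLock::ReadHolder guard(&lock_);  // released on scope exit
//     auto it = map_.find(key);
//     return it == map_.end() ? 0 : it->second;
//   }
//
//   void update(int key, int value) {
//     folly::RWSpinLock::UpgradedHolder ug(&lock_);      // blocks new readers
//     folly::RWSpinLock::WriteHolder wg(std::move(ug));  // promote to writer
//     map_[key] = value;
//   }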

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented based on GCC intrinsics.

namespace detail {
template <size_t kBitWidth>
struct RWTicketIntTrait {
  static_assert(kBitWidth == 32 || kBitWidth == 64,
                "bit width has to be either 32 or 64");
};

template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(
        0, 0, 0, 0, short(v[3]), short(v[2]), short(v[1]), short(v[0]));
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(int64_t(from));
  }
  static inline uint64_t toInteger(__m128i in) {
    return uint64_t(_mm_cvtsi128_si64(in));
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};

template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint8_t v[4]) {
    return _mm_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        char(v[3]), char(v[2]), char(v[1]), char(v[0]));
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(int32_t(from));
  }
  static inline uint32_t toInteger(__m128i in) {
    return uint32_t(_mm_cvtsi128_si32(in));
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
} // namespace detail

template <size_t kBitWidth, bool kFavorWriter = false>
class RWTicketSpinLockT {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
      QuarterInt;

  union RWTicket {
    constexpr RWTicket() : whole(0) {}
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;

 private: // Some x64-specific utilities for atomic access to ticket.
  template <class T>
  static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm_volatile_memory();
    return t;
  }

  template <class T>
  static void store_release(T* addr, T v) {
    asm_volatile_memory();
    *addr = v; // release barrier
  }

 public:
  constexpr RWTicketSpinLockT() {}

  RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
  RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }

  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation). Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of logical CPUs.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit.
   */
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) {
      return false;
    }
    ++t.users;
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }

  /*
   * Call this if you want to prioritize writer to avoid starvation.
   * Unlike writeLockNice, immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * reads.
   */
  void writeLockAggressive() {
    // std::this_thread::yield() is needed here to avoid a pathology if the
    // number of threads attempting concurrent writes is >= the number of real
    // cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it.
    uint_fast32_t count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm_volatile_pause();
      if (UNLIKELY(++count > 1000)) {
        std::this_thread::yield();
      }
    }
  }

  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here it doesn't cpu-relax the writer.
    //
    // This is because usually we have many more readers than the
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers. The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about std::this_thread::yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {}
  }

  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
    (void)val; // only the side effect on ticket.read matters
  }

  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);
    FullInt old = t.whole;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {1, 1, 0, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }

  void lock_shared() {
    // std::this_thread::yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we
    // need to let threads that already have a shared lock complete.
    uint_fast32_t count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm_volatile_pause();
      if (UNLIKELY((++count & 1023) == 0)) {
        std::this_thread::yield();
      }
    }
  }

  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    old.users = old.read;
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = {0, 1, 1, 0}; // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }

  void unlock_shared() {
    // A reader releases its ticket by advancing the write counter, which is
    // what a waiting writer spins on in writeLockAggressive().
    __sync_fetch_and_add(&ticket.write, 1);
  }

  class WriteHolder;

  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

  class ReadHolder {
   public:
    ReadHolder(ReadHolder const&) = delete;
    ReadHolder& operator=(ReadHolder const&) = delete;

    explicit ReadHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) lock_->lock_shared();
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder* writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) lock_->unlock_and_lock_shared();
    }

    ~ReadHolder() {
      if (lock_) lock_->unlock_shared();
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    RWSpinLock* lock_;
  };

  class WriteHolder {
   public:
    WriteHolder(WriteHolder const&) = delete;
    WriteHolder& operator=(WriteHolder const&) = delete;

    explicit WriteHolder(RWSpinLock* lock) : lock_(lock) {
      if (lock_) lock_->lock();
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      if (lock_) lock_->lock();
    }

    ~WriteHolder() {
      if (lock_) lock_->unlock();
    }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder* other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
}; // RWTicketSpinLockT

typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
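
// Example usage (a sketch; assumes an x86/x86-64 GCC build so the typedefs
// above exist; `statsLock`, `stats`, and the functions are hypothetical):
//
//   folly::RWTicketSpinLock32 statsLock; // up to 2^8 - 1 concurrent
//                                        // readers and writers
//   long stats = 0;
//
//   void bump() {
//     folly::RWTicketSpinLock32::WriteHolder guard(&statsLock);
//     ++stats;
//   }
//
//   long read() {
//     folly::RWTicketSpinLock32::ReadHolder guard(&statsLock);
//     return stats;
//   }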

#endif // RW_SPINLOCK_USE_X86_INTRINSIC_

} // namespace folly

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif