/*
 * Copyright 2014 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Two Read-Write spin lock implementations.
 *
 * Ref: http://locklessinc.com/articles/locks
 *
 * Both locks here are faster than pthread_rwlock and have very low
 * overhead (usually 20-30ns). They don't use any system mutexes and
 * are very compact (4/8 bytes), so are suitable for per-instance
 * based locking, particularly when contention is not expected.
 *
 * In most cases, RWSpinLock is a reasonable choice. It has minimal
 * overhead, and comparable contention performance when the number of
 * competing threads is less than or equal to the number of logical
 * CPUs. Even as the number of threads gets larger, RWSpinLock can
 * still be very competitive in READ, although it is slower on WRITE,
 * and also inherently unfair to writers.
 *
 * RWTicketSpinLock shows more balanced READ/WRITE performance. If
 * your application really needs a lot more threads, and a
 * higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 * Caveats:
 *
 *   RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *   based systems.
 *
 *   RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *   readers and writers.
 *
 *   RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */
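/*
 * Minimal usage sketch (hypothetical caller code; the holder classes are
 * defined inside RWSpinLock below):
 *
 *   folly::RWSpinLock lock;
 *   int sharedValue = 0;  // guarded by `lock`
 *
 *   void reader() {
 *     folly::RWSpinLock::ReadHolder guard(&lock);  // lock_shared()
 *     int v = sharedValue;                         // concurrent reads are OK
 *   }                                              // unlock_shared()
 *
 *   void writer() {
 *     folly::RWSpinLock::WriteHolder guard(&lock); // lock()
 *     ++sharedValue;                               // exclusive access
 *   }                                              // unlock()
 */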
#ifndef FOLLY_RWSPINLOCK_H_
#define FOLLY_RWSPINLOCK_H_
/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                     Iters  Total t   t/iter  iter/sec
-------------------------------------------------------------------------------
*      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
+30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
+85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
+96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
+85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
+96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
+ 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
+ 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M

------------------------------------------------------------------------------
2. Contention Benchmark: 90% read, 10% write
Benchmark                  hits      average   min     max      sigma
------------------------------------------------------------------------------
---------- 8 threads ------------
RWSpinLock       Write     142666    220ns     78ns    40.8us   269ns
RWSpinLock       Read      1282297   222ns     80ns    37.7us   248ns
RWTicketSpinLock Write     85692     209ns     71ns    17.9us   252ns
RWTicketSpinLock Read      769571    215ns     78ns    33.4us   251ns
pthread_rwlock_t Write     84248     2.48us    99ns    269us    8.19us
pthread_rwlock_t Read      761646    933ns     101ns   374us    3.25us

---------- 16 threads ------------
RWSpinLock       Write     124236    237ns     78ns    261us    801ns
RWSpinLock       Read      1115807   236ns     78ns    2.27ms   2.17us
RWTicketSpinLock Write     81781     231ns     71ns    31.4us   351ns
RWTicketSpinLock Read      734518    238ns     78ns    73.6us   379ns
pthread_rwlock_t Write     83363     7.12us    99ns    785us    28.1us
pthread_rwlock_t Read      754978    2.18us    101ns   1.02ms   14.3us

---------- 50 threads ------------
RWSpinLock       Write     131142    1.37us    82ns    7.53ms   68.2us
RWSpinLock       Read      1181240   262ns     78ns    6.62ms   12.7us
RWTicketSpinLock Write     83045     397ns     73ns    7.01ms   31.5us
RWTicketSpinLock Read      744133    386ns     78ns    11ms     31.4us
pthread_rwlock_t Write     80849     112us     103ns   4.52ms   263us
pthread_rwlock_t Read      728698    24us      101ns   7.28ms   194us
*/
#include <folly/Portability.h>

#if defined(__GNUC__) && \
  (defined(__i386) || FOLLY_X64 || \
   defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

// iOS doesn't define _mm_cvtsi64_si128 and friends
#if defined(__SSE2__) && !TARGET_OS_IPHONE
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#else
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
#endif
#include <algorithm>
#include <atomic>
#include <boost/noncopyable.hpp>

#include <sched.h>
#include <glog/logging.h>

#include <folly/Likely.h>

namespace folly {
/*
 * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance.  If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 * interfaces.
 */
class RWSpinLock : boost::noncopyable {
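  // Reading aid (inferred from the constants below): bit 0 is the WRITER
  // flag, bit 1 the UPGRADED flag, and bits 2..31 hold the reader count,
  // so each reader adds READER (4) to bits_.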
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

 public:
  RWSpinLock() : bits_(0) {}

  // Lockable Concept
  void lock() {
    int count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) sched_yield();
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }
  // SharedLockable Concept
  void lock_shared() {
    int count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) sched_yield();
    }
  }

  void unlock_shared() {
    bits_.fetch_add(-READER, std::memory_order_release);
  }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }
  // UpgradeLockable Concept
  void lock_upgrade() {
    int count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) sched_yield();
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    int64_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) sched_yield();
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }
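  // A sketch of the intended upgrade workflow (hypothetical caller code):
  //
  //   RWSpinLock lock;
  //   lock.lock_upgrade();             // coexists with readers, blocks new ones
  //   /* inspect shared state, decide whether to modify it */
  //   lock.unlock_upgrade_and_lock();  // wait for existing readers to drain
  //   /* modify shared state */
  //   lock.unlock();                   // clears both WRITER and UPGRADED bits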
  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
    // at the same time when other threads are trying to do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }
  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(expect, WRITER,
      std::memory_order_acq_rel);
  }
  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER|UPGRADED))) {
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }
  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(expect, WRITER,
        std::memory_order_acq_rel);
  }
  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's
    // done with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }
  class ReadHolder;
  class UpgradedHolder;
  class WriteHolder;

  class ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }
    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }
    ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // downgrade from an upgrade lock
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock_shared();
    }

    // downgrade from a write lock
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_shared();
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() { if (lock_) lock_->unlock_shared(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };
  class UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_upgrade();
    }
    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    // downgrade from a write lock
    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_upgrade();
    }

    UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }
    UpgradedHolder& operator =(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator =(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_upgrade();
      lock_ = lock;
      if (lock_) lock_->lock_upgrade();
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
  class WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock();
    }
    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock();
    }

    WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
      other.lock_ = nullptr;
    }
    WriteHolder& operator =(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator =(const WriteHolder& other) = delete;

    ~WriteHolder() { if (lock_) lock_->unlock(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };
  // Synchronized<> adaptors
  friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
  friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
  friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
  friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }

 private:
  std::atomic<int32_t> bits_;
};
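// The friend adaptors above let RWSpinLock be used as the mutex type of
// folly::Synchronized. A hypothetical sketch (requires folly/Synchronized.h):
//
//   folly::Synchronized<std::map<int, int>, folly::RWSpinLock> data;
//   data->emplace(1, 2);  // write-locks around the map operation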
#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented using GCC intrinsics.

namespace detail {
template <size_t kBitWidth> struct RWTicketIntTrait {
  static_assert(kBitWidth == 32 || kBitWidth == 64,
      "bit width has to be either 32 or 64");
};
template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(from);
  }
  static inline uint64_t toInteger(__m128i in) {
    return _mm_cvtsi128_si64(in);
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};
template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  static __m128i make128(const uint8_t v[4]) {
    return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, v[3], v[2], v[1], v[0]);
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(from);
  }
  static inline uint32_t toInteger(__m128i in) {
    return _mm_cvtsi128_si32(in);
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
}  // namespace detail
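// Reading aid: make128() spreads the four ticket quarters into separate SSE
// lanes so that addParallel() can bump two ticket fields (e.g. write and
// read) with a single vector add, without a carry from one field spilling
// into the next.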
template<size_t kBitWidth, bool kFavorWriter=false>
class RWTicketSpinLockT : boost::noncopyable {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
    QuarterInt;

  // One FullInt viewed as three QuarterInt ticket fields: write, read, users
  // (see the protocol sketch after try_lock() below).
  union RWTicket {
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;
 private: // Some x64-specific utilities for atomic access to ticket.
  // These are compiler barriers only; on x86's strong memory model, plain
  // aligned loads and stores already have acquire/release semantics.
  template<class T> static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm volatile("" : : : "memory");
    return t;
  }

  template<class T>
  static void store_release(T* addr, T v) {
    asm volatile("" : : : "memory");
    *addr = v; // release barrier
  }
 public:
  RWTicketSpinLockT() {
    store_release(&ticket.whole, FullInt(0));
  }

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }
  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation).  Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of logical CPU cores.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit.
   */
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) return false;
    ++t.users;
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }
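  // How the ticket fields move (a sketch of the protocol; all fields
  // start at 0):
  //   write lock:   ticket = users++; spin until write == ticket.
  //   write unlock: ++read, ++write       (hand off to the next ticket).
  //   read lock:    succeeds only when read == users; then ++read, ++users.
  //   read unlock:  ++write.
  // E.g. a writer then a reader: (write,read,users) goes (0,0,0) ->
  // (0,0,1) write lock -> (1,1,1) unlock -> (1,2,2) reader in ->
  // (2,2,2) reader out.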
  /*
   * Call this if you want to prioritize the writer to avoid starvation.
   * Unlike writeLockNice, this immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * reads.
   */
  void writeLockAggressive() {
    // sched_yield() is needed here to avoid a pathology if the number
    // of threads attempting concurrent writes is >= the number of real
    // cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it.
    int count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm volatile("pause");
      if (UNLIKELY(++count > 1000)) sched_yield();
    }
  }
  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here it doesn't cpu-relax the writer.
    //
    // This is because usually we have many more readers than the
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers.  The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about sched_yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {}
  }
  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
    (void)val; // only the atomic increment matters here
  }
  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);
    FullInt old = t.whole;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };  // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }
  void lock_shared() {
    // sched_yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we
    // need to let threads that already have a shared lock complete.
    int count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm volatile("pause");
      if (UNLIKELY((++count & 1023) == 0)) sched_yield();
    }
  }
  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    old.users = old.read;
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };  // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }
  void unlock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
    (void)val; // only the atomic increment matters here
  }
 public:
  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

  class WriteHolder;

  class ReadHolder : boost::noncopyable {
   public:
    explicit ReadHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }

    explicit ReadHolder(RWSpinLock &lock) : lock_(&lock) {
      if (lock_) lock_->lock_shared();
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) {
        lock_->unlock_and_lock_shared();
      }
    }

    ~ReadHolder() {
      if (lock_) lock_->unlock_shared();
    }

    void reset(RWSpinLock *lock = nullptr) {
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder *other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    RWSpinLock *lock_;
  };
  class WriteHolder : boost::noncopyable {
   public:
    explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock();
    }
    explicit WriteHolder(RWSpinLock &lock) : lock_(&lock) {
      if (lock_) lock_->lock();
    }

    ~WriteHolder() {
      if (lock_) lock_->unlock();
    }

    void reset(RWSpinLock *lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder *other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    RWSpinLock *lock_;
  };
  // Synchronized<> adaptors.
  friend void acquireRead(RWTicketSpinLockT& mutex) {
    mutex.lock_shared();
  }
  friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
    mutex.lock();
  }
  // Ticket locks have no timed locking; this overload simply blocks,
  // ignores the timeout, and reports success.
  friend bool acquireReadWrite(RWTicketSpinLockT& mutex,
                               unsigned int milliseconds) {
    mutex.lock();
    return true;
  }
  friend void releaseRead(RWTicketSpinLockT& mutex) {
    mutex.unlock_shared();
  }
  friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
    mutex.unlock();
  }
};
typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
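/*
 * Minimal usage sketch (hypothetical caller code). The second template
 * parameter selects the writer-favoring variant:
 *
 *   folly::RWTicketSpinLock64 lock;            // lock() -> writeLockNice()
 *   folly::RWTicketSpinLockT<64, true> wLock;  // lock() -> writeLockAggressive()
 *
 *   {
 *     folly::RWTicketSpinLock64::ReadHolder guard(&lock);
 *     // ... shared reads ...
 *   }
 *   {
 *     folly::RWTicketSpinLock64::WriteHolder guard(&lock);
 *     // ... exclusive writes ...
 *   }
 */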
#endif  // RW_SPINLOCK_USE_X86_INTRINSIC_

}  // namespace folly

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

#endif  // FOLLY_RWSPINLOCK_H_