/*
 * Copyright 2014 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Two Read-Write spin lock implementations.
 *
 *  Ref: http://locklessinc.com/articles/locks
 *
 *  Both locks here are faster than pthread_rwlock and have very low
 *  overhead (usually 20-30ns).  They don't use any system mutexes and
 *  are very compact (4/8 bytes), so are suitable for per-instance
 *  based locking, particularly when contention is not expected.
 *
 *  In most cases, RWSpinLock is a reasonable choice.  It has minimal
 *  overhead, and comparable contention performance when the number of
 *  competing threads is less than or equal to the number of logical
 *  CPUs.  Even as the number of threads gets larger, RWSpinLock can
 *  still be very competitive in READ, although it is slower on WRITE,
 *  and also inherently unfair to writers.
 *
 *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
 *  your application really needs a lot more threads, and a
 *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 *  Caveats:
 *
 *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *    based systems.
 *
 *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *    readers and writers.
 *
 *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *    readers and writers.
 *
 *    RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */
#ifndef FOLLY_RWSPINLOCK_H_
#define FOLLY_RWSPINLOCK_H_
/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                    Iters   Total t    t/iter iter/sec
-------------------------------------------------------------------------------
*      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
+30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
+85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
+96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
+85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
+96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
+ 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
+ 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M

------------------------------------------------------------------------------
2. Contention Benchmark (90% read, 10% write)
Benchmark                 hits      average   min     max      sigma
------------------------------------------------------------------------------
---------- 8 threads ------------
RWSpinLock       Write    142666    220ns     78ns    40.8us   269ns
RWSpinLock       Read     1282297   222ns     80ns    37.7us   248ns
RWTicketSpinLock Write    85692     209ns     71ns    17.9us   252ns
RWTicketSpinLock Read     769571    215ns     78ns    33.4us   251ns
pthread_rwlock_t Write    84248     2.48us    99ns    269us    8.19us
pthread_rwlock_t Read     761646    933ns     101ns   374us    3.25us

---------- 16 threads ------------
RWSpinLock       Write    124236    237ns     78ns    261us    801ns
RWSpinLock       Read     1115807   236ns     78ns    2.27ms   2.17us
RWTicketSpinLock Write    81781     231ns     71ns    31.4us   351ns
RWTicketSpinLock Read     734518    238ns     78ns    73.6us   379ns
pthread_rwlock_t Write    83363     7.12us    99ns    785us    28.1us
pthread_rwlock_t Read     754978    2.18us    101ns   1.02ms   14.3us

---------- 50 threads ------------
RWSpinLock       Write    131142    1.37us    82ns    7.53ms   68.2us
RWSpinLock       Read     1181240   262ns     78ns    6.62ms   12.7us
RWTicketSpinLock Write    83045     397ns     73ns    7.01ms   31.5us
RWTicketSpinLock Read     744133    386ns     78ns    11ms     31.4us
pthread_rwlock_t Write    80849     112us     103ns   4.52ms   263us
pthread_rwlock_t Read     728698    24us      101ns   7.28ms   194us
*/
#include "folly/Portability.h"

#if defined(__GNUC__) && !defined(__clang__) && \
    (defined(__i386) || FOLLY_X64 || \
     defined(ARCH_K8))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

#include <atomic>
#include <algorithm>

#include <sched.h>
#include <boost/noncopyable.hpp>
#include <glog/logging.h>

#include "folly/Likely.h"

namespace folly {
/*
 * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance.  If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 * interfaces.
 */
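
// A minimal usage sketch (illustrative only, not part of this header's API):
// because RWSpinLock models the Lockable and SharedLockable concepts, it
// works with the standard guard types (assuming <mutex> is included by the
// caller) as well as the holder classes defined below.
//
//   folly::RWSpinLock lock;
//
//   {
//     std::lock_guard<folly::RWSpinLock> guard(lock);  // exclusive access
//     // ... mutate shared state ...
//   }
//
//   {
//     folly::RWSpinLock::ReadHolder guard(&lock);      // shared access
//     // ... read shared state ...
//   }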
class RWSpinLock : boost::noncopyable {
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

 public:
  RWSpinLock() : bits_(0) {}

  // Lockable Concept
  void lock() {
    int count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) sched_yield();
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }

  // SharedLockable Concept
  void lock_shared() {
    int count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) sched_yield();
    }
  }

  void unlock_shared() {
    bits_.fetch_add(-READER, std::memory_order_release);
  }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }

  // UpgradeLockable Concept
  void lock_upgrade() {
    int count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) sched_yield();
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    int64_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) sched_yield();
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }
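
  // A hedged usage sketch for the upgrade path (illustrative only): take the
  // upgrade lock while deciding whether a write is needed, then atomically
  // trade it for the write lock.  UPGRADED blocks new readers, so the check
  // itself should be short.
  //
  //   folly::RWSpinLock lock;
  //   lock.lock_upgrade();
  //   if (/* decided we must write */ true) {
  //     lock.unlock_upgrade_and_lock();  // waits for readers to drain
  //     // ... write ...
  //     lock.unlock();
  //   } else {
  //     lock.unlock_upgrade();
  //   }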

  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
    // at the same time when other threads are trying to do
    // try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }

  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(expect, WRITER,
      std::memory_order_acq_rel);
  }

  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER|UPGRADED))) {
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }
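
  // Note on the optimistic path above: a failed try_lock_shared() briefly
  // over-counts readers between the add and the rollback, so other threads
  // can observe an extra READER increment.  That is harmless here; the only
  // effect is that a competing try_lock(), which requires the value to be
  // exactly 0, may fail spuriously and retry.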

  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(expect, WRITER,
      std::memory_order_acq_rel);
  }

  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's
    // done with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }
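
  // For illustration, with the encoding above (READER = 4, UPGRADED = 2,
  // WRITER = 1): bits() == 0 means unlocked, 1 a writer, 2 an upgrader, and
  // 4 * n means n concurrent readers.  Transient mixed values (e.g. 5, a
  // WRITER plus a READER increment that is about to be rolled back) can be
  // observed while other threads are inside try_lock_shared().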

  class ReadHolder;
  class UpgradedHolder;
  class WriteHolder;

  class ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }

    ReadHolder(ReadHolder&& other) : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // down-grade from an upgrade lock holder
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock_shared();
    }

    // down-grade from a write lock holder
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_shared();
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() { if (lock_) lock_->unlock_shared(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };

  class UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_upgrade();
    }

    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_upgrade();
    }

    UpgradedHolder(UpgradedHolder&& other) : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    UpgradedHolder& operator =(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator =(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_upgrade();
      lock_ = lock;
      if (lock_) lock_->lock_upgrade();
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };

  class WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock();
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock();
    }

    WriteHolder(WriteHolder&& other) : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    WriteHolder& operator =(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator =(const WriteHolder& other) = delete;

    ~WriteHolder() { if (lock_) lock_->unlock(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };

  // Synchronized<> adaptors
  friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
  friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
  friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
  friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }

 private:
  std::atomic<int32_t> bits_;
};
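
// How the adaptors above are consumed (a hedged sketch; the calls resolve
// via argument-dependent lookup, and folly::Synchronized uses these four
// entry points when instantiated as Synchronized<T, RWSpinLock>):
//
//   folly::RWSpinLock l;
//
//   acquireRead(l);          // forwards to l.lock_shared()
//   // ... read shared state ...
//   releaseRead(l);
//
//   acquireReadWrite(l);     // forwards to l.lock()
//   // ... write shared state ...
//   releaseReadWrite(l);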

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented based on GCC intrinsics.

namespace detail {
template <size_t kBitWidth> struct RWTicketIntTrait {
  static_assert(kBitWidth == 32 || kBitWidth == 64,
      "bit width has to be either 32 or 64");
};

template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef __SSE2__
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(from);
  }
  static inline uint64_t toInteger(__m128i in) {
    return _mm_cvtsi128_si64(in);
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};

template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef __SSE2__
  static __m128i make128(const uint8_t v[4]) {
    return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, v[3], v[2], v[1], v[0]);
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(from);
  }
  static inline uint32_t toInteger(__m128i in) {
    return _mm_cvtsi128_si32(in);
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
}  // namespace detail
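
// How the SSE2 helpers above are used below (a worked example, not extra
// API): the lock state packs up to four QuarterInt counters into one
// FullInt, and addParallel() bumps several counters in a single step.  For
// the 32-bit lock, fromInteger() moves the packed word into an XMM register,
// _mm_add_epi8 adds a per-byte delta such as {0, 1, 1, 0} (incrementing the
// read and users bytes together, with no carry between lanes), and
// toInteger() moves the result back out.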

template<size_t kBitWidth, bool kFavorWriter=false>
class RWTicketSpinLockT : boost::noncopyable {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
    QuarterInt;

  union RWTicket {
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;

 private: // Some x64-specific utilities for atomic access to ticket.
  template<class T> static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm volatile("" : : : "memory");
    return t;
  }

  template<class T>
  static void store_release(T* addr, T v) {
    asm volatile("" : : : "memory");
    *addr = v; // release barrier
  }
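
  // Note on the barriers above: on x86's TSO memory model an aligned load
  // already has acquire semantics and an aligned store already has release
  // semantics at the hardware level, so the empty asm with a "memory"
  // clobber is only there to stop the compiler from reordering around the
  // access.  No fence instruction is emitted.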

 public:
  RWTicketSpinLockT() {
    store_release(&ticket.whole, FullInt(0));
  }

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }

  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation).  Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of physical cores.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit.
   */
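
  // Ticket bookkeeping in the functions below (a summary of the code, using
  // the RWTicket fields declared above): `users` is the next ticket to hand
  // out, `write` is the ticket currently granted exclusive access, and
  // `read` is the ticket currently granted shared access.  The lock is free
  // when users == write == read.  A writer takes ticket `users` and spins
  // until `write` catches up; a reader in this implementation only jumps in
  // when `read` has caught up with `users` (no queued waiters).  Releasing a
  // write advances both `write` and `read`; releasing a read advances only
  // `write`.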
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) return false;
    ++t.users;
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }

  /*
   * Call this if you want to prioritize writer to avoid starvation.
   * Unlike writeLockNice, immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * reads.
   */
  void writeLockAggressive() {
    // sched_yield() is needed here to avoid a pathology if the number
    // of threads attempting concurrent writes is >= the number of real
    // cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it.
    int count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm volatile("pause");
      if (UNLIKELY(++count > 1000)) sched_yield();
    }
  }

  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here it doesn't cpu-relax the writer.
    //
    // This is because usually we have many more readers than the
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers.  The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about sched_yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {}
  }

  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    __sync_fetch_and_add(&ticket.read, 1);
  }

  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);
    FullInt old = t.whole;

#ifdef __SSE2__
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }
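
  // Why storing only readWrite works here (a note on the code above): while
  // the writer holds the lock no reader holds it, and the acquire paths only
  // succeed from states where the lock is grantable, so nothing else
  // advances write or read concurrently.  Bumping both by one admits the
  // next queued writer and any waiting readers; users is untouched, so a
  // HalfInt-wide store_release of the packed write/read pair is sufficient.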

  void lock_shared() {
    // sched_yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we
    // need to let threads that already have a shared lock complete.
    int count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm volatile("pause");
      if (UNLIKELY((++count & 1023) == 0)) sched_yield();
    }
  }

  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    old.users = old.read;
#ifdef __SSE2__
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }

  void unlock_shared() {
    __sync_fetch_and_add(&ticket.write, 1);
  }

  class WriteHolder;

  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

  class ReadHolder : boost::noncopyable {
   public:
    explicit ReadHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }

    explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
      if (lock_) lock_->lock_shared();
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) {
        lock_->unlock_and_lock_shared();
      }
    }

    ~ReadHolder() {
      if (lock_) lock_->unlock_shared();
    }

    void reset(RWSpinLock *lock = nullptr) {
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder *other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    RWSpinLock *lock_;
  };

  class WriteHolder : boost::noncopyable {
   public:
    explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock();
    }

    explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
      if (lock_) lock_->lock();
    }

    ~WriteHolder() {
      if (lock_) lock_->unlock();
    }

    void reset(RWSpinLock *lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder *other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    RWSpinLock *lock_;
  };

  // Synchronized<> adaptors.
  friend void acquireRead(RWTicketSpinLockT& mutex) {
    mutex.lock_shared();
  }

  friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
    mutex.lock();
  }

  // Ignores the timeout: these locks cannot time out, so this simply
  // acquires the write lock and reports success.
  friend bool acquireReadWrite(RWTicketSpinLockT& mutex,
                               unsigned int milliseconds) {
    mutex.lock();
    return true;
  }

  friend void releaseRead(RWTicketSpinLockT& mutex) {
    mutex.unlock_shared();
  }

  friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
    mutex.unlock();
  }
};

typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
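
// A minimal usage sketch (illustrative only; RWTicketSpinLock64 is the
// typedef declared above):
//
//   folly::RWTicketSpinLock64 lock;
//
//   lock.lock_shared();          // concurrent readers
//   // ... read ...
//   lock.unlock_shared();
//
//   lock.writeLockAggressive();  // writer that must not starve
//   // ... write ...
//   lock.unlock();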

#endif  // RW_SPINLOCK_USE_X86_INTRINSIC_

}  // namespace folly

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

#endif  // FOLLY_RWSPINLOCK_H_