/*
 * Copyright 2014 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Two Read-Write spin lock implementations.
 *
 * Ref: http://locklessinc.com/articles/locks
 *
 * Both locks here are faster than pthread_rwlock and have very low
 * overhead (usually 20-30ns). They don't use any system mutexes and
 * are very compact (4/8 bytes), so are suitable for per-instance
 * based locking, particularly when contention is not expected.
 *
 * In most cases, RWSpinLock is a reasonable choice. It has minimal
 * overhead, and comparable contention performance when the number of
 * competing threads is less than or equal to the number of logical
 * CPUs. Even as the number of threads gets larger, RWSpinLock can
 * still be very competitive in READ, although it is slower on WRITE,
 * and also inherently unfair to writers.
 *
 * RWTicketSpinLock shows more balanced READ/WRITE performance. If
 * your application really needs a lot more threads, and a
 * higher-priority writer, prefer one of the RWTicketSpinLock locks.
 *
 * Caveats:
 *
 *   RWTicketSpinLock locks can only be used with GCC on x86/x86-64
 *   based systems.
 *
 *   RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
 *   readers and writers.
 *
 *   RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
 *   readers and writers.
 *
 *   RWSpinLock handles 2^30 - 1 concurrent readers.
 *
 * @author Xin Liu <xliux@fb.com>
 */
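/*
 * Illustrative usage sketch (not part of this header's API): guarding a
 * hypothetical counter with RWSpinLock via the RAII holders defined below.
 * The Counter type is made up for illustration only.
 *
 *   struct Counter {
 *     mutable folly::RWSpinLock lock_;
 *     int64_t value_ = 0;
 *
 *     int64_t get() const {
 *       folly::RWSpinLock::ReadHolder guard(&lock_);   // shared lock
 *       return value_;
 *     }
 *     void bump() {
 *       folly::RWSpinLock::WriteHolder guard(&lock_);  // exclusive lock
 *       ++value_;
 *     }
 *   };
 */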
#ifndef FOLLY_RWSPINLOCK_H_
#define FOLLY_RWSPINLOCK_H_
/*
========================================================================
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz)  8 cores(16 HTs)
========================================================================

------------------------------------------------------------------------------
1. Single thread benchmark (read/write lock + unlock overhead)
Benchmark                                    Iters   Total t    t/iter iter/sec
-------------------------------------------------------------------------------
*      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
+30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
+85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
+96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
+85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
+96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
+85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
+29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
+85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
+30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
+ 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
+ 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M

------------------------------------------------------------------------------
2. Contention Benchmark      90% read  10% write
Benchmark                    hits       average    min       max        sigma
------------------------------------------------------------------------------
---------- 8  threads ------------
RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us

---------- 16 threads ------------
RWSpinLock       Write       124236     237ns      78ns      261us      801ns
RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us

---------- 50 threads ------------
RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
RWTicketSpinLock Read        744133     386ns      78ns      11ms       31.4us
pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
*/
#if defined(__GNUC__) && !defined(__clang__) && \
    (defined(__i386) || defined(__x86_64__) || \
     defined(ARCH_K8) || defined(ARCH_PIII))
#define RW_SPINLOCK_USE_X86_INTRINSIC_
#include <x86intrin.h>
#else
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

#include <atomic>
#include <algorithm>
#include <cstdint>
#include <utility>

#include <sched.h>
#include <boost/noncopyable.hpp>

#include <glog/logging.h>

#include "folly/Likely.h"

namespace folly {
/*
 * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance.  If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 * interfaces.
 */
class RWSpinLock : boost::noncopyable {
  // bits_ layout: bit 0 is the writer bit, bit 1 the upgrade bit, and the
  // remaining bits hold the reader count (each reader adds READER == 4,
  // allowing up to 2^30 - 1 concurrent readers).
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

 public:
  RWSpinLock() : bits_(0) {}
  // Lockable Concept
  void lock() {
    int count = 0;
    while (!LIKELY(try_lock())) {
      if (++count > 1000) sched_yield();
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }
  // SharedLockable Concept
  void lock_shared() {
    int count = 0;
    while (!LIKELY(try_lock_shared())) {
      if (++count > 1000) sched_yield();
    }
  }

  void unlock_shared() {
    bits_.fetch_add(-READER, std::memory_order_release);
  }
  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }
  // UpgradeLockable Concept
  void lock_upgrade() {
    int count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) sched_yield();
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    int count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) sched_yield();
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }
  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed
    // at the same time when other threads are trying to do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }
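  // Worked example of the two-step downgrade above (illustrative values):
  // starting from bits_ == WRITER (0b001), fetch_or(UPGRADED) yields 0b011,
  // then fetch_add(-WRITER) leaves 0b010 -- the upgrade lock alone. Neither
  // step disturbs reader counts that may transiently occupy the higher bits.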
  // Attempt to acquire writer permission. Return false if we didn't get it.
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(expect, WRITER,
                                         std::memory_order_acq_rel);
  }
  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (UNLIKELY(value & (WRITER|UPGRADED))) {
      // Back out the speculative read acquisition.
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }
  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(expect, WRITER,
                                         std::memory_order_acq_rel);
  }
  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's done
    // with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }
  class ReadHolder;
  class UpgradedHolder;
  class WriteHolder;
  class ReadHolder {
   public:
    explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }

    explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_shared();
    }

    ReadHolder(ReadHolder&& other) : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    // downgrade from an upgrade lock
    explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock_shared();
    }

    // downgrade from a write lock
    explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_shared();
    }

    ReadHolder& operator=(ReadHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    ReadHolder(const ReadHolder& other) = delete;
    ReadHolder& operator=(const ReadHolder& other) = delete;

    ~ReadHolder() { if (lock_) lock_->unlock_shared(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder* other) {
      std::swap(lock_, other->lock_);
    }

   private:
    friend class UpgradedHolder;
    friend class WriteHolder;
    RWSpinLock* lock_;
  };
  class UpgradedHolder {
   public:
    explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_upgrade();
    }

    explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock_upgrade();
    }

    // downgrade from a write lock
    explicit UpgradedHolder(WriteHolder&& writer) {
      lock_ = writer.lock_;
      writer.lock_ = nullptr;
      if (lock_) lock_->unlock_and_lock_upgrade();
    }

    UpgradedHolder(UpgradedHolder&& other) : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    UpgradedHolder& operator=(UpgradedHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    UpgradedHolder(const UpgradedHolder& other) = delete;
    UpgradedHolder& operator=(const UpgradedHolder& other) = delete;

    ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_upgrade();
      lock_ = lock;
      if (lock_) lock_->lock_upgrade();
    }

    void swap(UpgradedHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class WriteHolder;
    friend class ReadHolder;
    RWSpinLock* lock_;
  };
  class WriteHolder {
   public:
    explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock();
    }

    explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
      lock_->lock();
    }

    // promoted from an upgrade lock holder
    explicit WriteHolder(UpgradedHolder&& upgraded) {
      lock_ = upgraded.lock_;
      upgraded.lock_ = nullptr;
      if (lock_) lock_->unlock_upgrade_and_lock();
    }

    WriteHolder(WriteHolder&& other) : lock_(other.lock_) {
      other.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& other) {
      using std::swap;
      swap(lock_, other.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& other) = delete;
    WriteHolder& operator=(const WriteHolder& other) = delete;

    ~WriteHolder() { if (lock_) lock_->unlock(); }

    void reset(RWSpinLock* lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder* other) {
      using std::swap;
      swap(lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    friend class UpgradedHolder;
    RWSpinLock* lock_;
  };
  // Synchronized<> adaptors
  friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
  friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
  friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
  friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }
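  // Illustrative note: the adaptors above are what lets folly::Synchronized
  // drive this lock as its mutex type, e.g. (Widget is hypothetical):
  //
  //   folly::Synchronized<std::map<int64_t, Widget>, folly::RWSpinLock> map_;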
 private:
  std::atomic<int32_t> bits_;
};
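// A minimal usage sketch (illustrative only, not part of folly): read the
// current value under an upgrade lock, and promote to a write lock only when
// a change is actually needed. exampleSetIfGreater and its parameters are
// hypothetical names.
inline void exampleSetIfGreater(RWSpinLock& lock, int32_t& value,
                                int32_t newValue) {
  RWSpinLock::UpgradedHolder upgraded(&lock);  // blocks new readers
  if (newValue > value) {
    // Promote; this waits for the existing readers to drain, then writes.
    RWSpinLock::WriteHolder writer(std::move(upgraded));
    value = newValue;
  }
}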
#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
// A more balanced Read-Write spin lock implemented based on GCC intrinsics.

namespace detail {
template <size_t kBitWidth> struct RWTicketIntTrait {
  static_assert(kBitWidth == 32 || kBitWidth == 64,
      "bit width has to be either 32 or 64");
};
template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef __SSE2__
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(from);
  }
  static inline uint64_t toInteger(__m128i in) {
    return _mm_cvtsi128_si64(in);
  }
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};
template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef __SSE2__
  static __m128i make128(const uint8_t v[4]) {
    return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, v[3], v[2], v[1], v[0]);
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(from);
  }
  static inline uint32_t toInteger(__m128i in) {
    return _mm_cvtsi128_si32(in);
  }
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};
}  // namespace detail
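// Note on the traits above: a ticket word packs three QuarterInt fields
// (write, read, users), and addParallel() bumps several fields in one SSE2
// add instead of two separate read-modify-write steps. For example
// (illustrative, using the 32-bit layout from the lock below), a delta
// buffer of { 0, 1, 1, 0 } increments the read and users bytes at once
// while leaving the write byte untouched:
//
//   __m128i m = detail::RWTicketIntTrait<32>::fromInteger(ticketWord);
//   uint32_t bumped = detail::RWTicketIntTrait<32>::addParallel(m, kDelta);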
template<size_t kBitWidth, bool kFavorWriter=false>
class RWTicketSpinLockT : boost::noncopyable {
  typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
  typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
    QuarterInt;

  union RWTicket {
    FullInt whole;
    HalfInt readWrite;
    __extension__ struct {
      QuarterInt write;
      QuarterInt read;
      QuarterInt users;
    };
  } ticket;
 private:
  // Some x64-specific utilities for atomic access to ticket. These rely on
  // x86's strong memory model: plain aligned loads and stores already have
  // acquire/release semantics, so only a compiler barrier is needed.
  template<class T> static T load_acquire(T* addr) {
    T t = *addr; // acquire barrier
    asm volatile("" : : : "memory");
    return t;
  }

  template<class T>
  static void store_release(T* addr, T v) {
    asm volatile("" : : : "memory");
    *addr = v; // release barrier
  }

 public:
  RWTicketSpinLockT() {
    store_release(&ticket.whole, FullInt(0));
  }

  void lock() {
    if (kFavorWriter) {
      writeLockAggressive();
    } else {
      writeLockNice();
    }
  }
  /*
   * Both try_lock and try_lock_shared diverge in our implementation from the
   * lock algorithm described in the link above.
   *
   * In the read case, it is undesirable that the readers could wait
   * for another reader (before increasing ticket.read in the other
   * implementation). Our approach gives up on
   * first-come-first-serve, but our benchmarks showed improved
   * performance for both readers and writers under heavily contended
   * cases, particularly when the number of threads exceeds the number
   * of physical cores.
   *
   * We have writeLockAggressive() using the original implementation
   * for a writer, which gives some advantage to the writer over the
   * readers---for that path it is guaranteed that the writer will
   * acquire the lock after all the existing readers exit.
   */
  bool try_lock() {
    RWTicket t;
    FullInt old = t.whole = load_acquire(&ticket.whole);
    if (t.users != t.write) return false;  // someone holds the lock
    ++t.users;  // take the next ticket and the lock in a single CAS
    return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
  }
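  // Illustrative walk-through of the ticket scheme (made-up values): the
  // counters start at {write=0, read=0, users=0}. A writer takes ticket 0
  // (users -> 1) and owns the lock while write == 0; its unlock() bumps both
  // write and read to 1, serving ticket 1. Readers bump read and users
  // together, so any number of readers can occupy the served window at once.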
  /*
   * Call this if you want to prioritize the writer to avoid starvation.
   * Unlike writeLockNice, immediately acquires the write lock when
   * the existing readers (arriving before the writer) finish their
   * reads.
   */
  void writeLockAggressive() {
    // sched_yield() is needed here to avoid a pathology if the number
    // of threads attempting concurrent writes is >= the number of real
    // cores allocated to this process. This is less likely than the
    // corresponding situation in lock_shared(), but we still want to
    // avoid it.
    int count = 0;
    QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
    while (val != load_acquire(&ticket.write)) {
      asm volatile("pause");
      if (UNLIKELY(++count > 1000)) sched_yield();
    }
  }
  // Call this when the writer should be nicer to the readers.
  void writeLockNice() {
    // Here we don't cpu-relax the writer.
    //
    // This is because usually we have many more readers than the
    // writers, so the writer has less chance to get the lock when
    // there are a lot of competing readers. The aggressive spinning
    // can help to avoid starving writers.
    //
    // We don't worry about sched_yield() here because the caller
    // has already explicitly abandoned fairness.
    while (!try_lock()) {}
  }
  // Atomically unlock the write-lock from writer and acquire the read-lock.
  void unlock_and_lock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
  }
  // Release writer permission on the lock.
  void unlock() {
    RWTicket t;
    t.whole = load_acquire(&ticket.whole);
    FullInt old = t.whole;

#ifdef __SSE2__
    // SSE2 can reduce the lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.write;
#endif
    store_release(&ticket.readWrite, t.readWrite);
  }
  void lock_shared() {
    // sched_yield() is important here because we can't grab the
    // shared lock if there is a pending writeLockAggressive, so we
    // need to let threads that already have a shared lock complete.
    int count = 0;
    while (!LIKELY(try_lock_shared())) {
      asm volatile("pause");
      if (UNLIKELY((++count & 1023) == 0)) sched_yield();
    }
  }
  bool try_lock_shared() {
    RWTicket t, old;
    old.whole = t.whole = load_acquire(&ticket.whole);
    // The CAS below succeeds only if users == read, i.e. no writer currently
    // holds the lock or is waiting for it.
    old.users = old.read;
#ifdef __SSE2__
    // SSE2 may reduce the total lock and unlock overhead by 10%
    static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
    static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
    __m128i m = IntTraitType::fromInteger(old.whole);
    t.whole = IntTraitType::addParallel(m, kDelta);
#else
    ++t.read;
    ++t.users;
#endif
    return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
  }
  void unlock_shared() {
    QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
  }
  class WriteHolder;

  typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;

  class ReadHolder : boost::noncopyable {
   public:
    explicit ReadHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock_shared();
    }

    explicit ReadHolder(RWSpinLock &lock) : lock_(&lock) {
      if (lock_) lock_->lock_shared();
    }

    // atomically unlock the write-lock from writer and acquire the read-lock
    explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
      std::swap(this->lock_, writer->lock_);
      if (lock_) {
        lock_->unlock_and_lock_shared();
      }
    }

    ~ReadHolder() {
      if (lock_) lock_->unlock_shared();
    }

    void reset(RWSpinLock *lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock_shared();
      lock_ = lock;
      if (lock_) lock_->lock_shared();
    }

    void swap(ReadHolder *other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    RWSpinLock *lock_;
  };
  class WriteHolder : boost::noncopyable {
   public:
    explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
      if (lock_) lock_->lock();
    }

    explicit WriteHolder(RWSpinLock &lock) : lock_(&lock) {
      if (lock_) lock_->lock();
    }

    ~WriteHolder() {
      if (lock_) lock_->unlock();
    }

    void reset(RWSpinLock *lock = nullptr) {
      if (lock == lock_) return;
      if (lock_) lock_->unlock();
      lock_ = lock;
      if (lock_) lock_->lock();
    }

    void swap(WriteHolder *other) {
      std::swap(this->lock_, other->lock_);
    }

   private:
    friend class ReadHolder;
    RWSpinLock *lock_;
  };
  // Synchronized<> adaptors.
  friend void acquireRead(RWTicketSpinLockT& mutex) {
    mutex.lock_shared();
  }
  friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
    mutex.lock();
  }
  // Spin locks are not timed, so the timed adaptor simply blocks.
  friend bool acquireReadWrite(RWTicketSpinLockT& mutex,
                               unsigned int milliseconds) {
    mutex.lock();
    return true;
  }
  friend void releaseRead(RWTicketSpinLockT& mutex) {
    mutex.unlock_shared();
  }
  friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
    mutex.unlock();
  }
};

typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
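// Illustrative sketch (not part of folly): choosing the writer-favoring
// variant for a write-heavy path. ExampleConfig and applyUpdate are
// hypothetical names.
//
//   typedef folly::RWTicketSpinLockT<32, true> WriterFavoringLock;
//
//   WriterFavoringLock configLock;
//
//   void applyUpdate(ExampleConfig& config) {
//     // lock() dispatches to writeLockAggressive() when kFavorWriter is true
//     WriterFavoringLock::WriteHolder guard(&configLock);
//     config.bump();
//   }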
#endif  // RW_SPINLOCK_USE_X86_INTRINSIC_

}  // namespace folly

#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
#endif

#endif  // FOLLY_RWSPINLOCK_H_