folly/RWSpinLock.h

   1 /*
   2  * Copyright 2012 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Two Read-Write spin lock implementations.
  19  *
  20  *  Ref: http://locklessinc.com/articles/locks
  21  *
  22  *  Both locks here are faster than pthread_rwlock and have very low
  23  *  overhead (usually 20-30ns).  They don't use any system mutexes and
  24  *  are very compact (4/8 bytes), so are suitable for per-instance
  25  *  based locking, particularly when contention is not expected.
  26  *
  27  *  In most cases, RWSpinLock is a reasonable choice.  It has minimal
  28  *  overhead, and comparable contention performance when the number of
  29  *  competing threads is less than or equal to the number of logical
  30  *  CPUs.  Even as the number of threads gets larger, RWSpinLock can
  31  *  still be very competitive in READ, although it is slower on WRITE,
  32  *  and also inherently unfair to writers.
  33  *
  34  *  RWTicketSpinLock shows more balanced READ/WRITE performance.  If
  35  *  your application really needs a lot more threads, and a
  36  *  higher-priority writer, prefer one of the RWTicketSpinLock locks.
  37  *
  38  *  Caveats:
  39  *
  40  *    RWTicketSpinLock locks can only be used with GCC on x86/x86-64
  41  *    based systems.
  42  *
  43  *    RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
  44  *    readers and writers.
  45  *
  46  *    RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
  47  *    readers and writers.
  48  *
  49  *    RWSpinLock handles 2^30 - 1 concurrent readers.
  50  *
  51  * @author Xin Liu <xliux@fb.com>
  52  */
  53
  54 #ifndef FOLLY_RWSPINLOCK_H_
  55 #define FOLLY_RWSPINLOCK_H_
  56
  57 /*
  58 ========================================================================
  59 Benchmark on (Intel(R) Xeon(R) CPU  L5630  @ 2.13GHz)  8 cores(16 HTs)
  60 ========================================================================
  61
  62 ------------------------------------------------------------------------------
  63 1. Single thread benchmark (read/write lock + unlock overhead)
  64 Benchmark                                    Iters   Total t    t/iter iter/sec
  65 -------------------------------------------------------------------------------
  66 *      BM_RWSpinLockRead                     100000  1.786 ms  17.86 ns   53.4M
  67 +30.5% BM_RWSpinLockWrite                    100000  2.331 ms  23.31 ns  40.91M
  68 +85.7% BM_RWTicketSpinLock32Read             100000  3.317 ms  33.17 ns  28.75M
  69 +96.0% BM_RWTicketSpinLock32Write            100000    3.5 ms     35 ns  27.25M
  70 +85.6% BM_RWTicketSpinLock64Read             100000  3.315 ms  33.15 ns  28.77M
  71 +96.0% BM_RWTicketSpinLock64Write            100000    3.5 ms     35 ns  27.25M
  72 +85.7% BM_RWTicketSpinLock32FavorWriterRead  100000  3.317 ms  33.17 ns  28.75M
  73 +29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000  2.316 ms  23.16 ns  41.18M
  74 +85.3% BM_RWTicketSpinLock64FavorWriterRead  100000  3.309 ms  33.09 ns  28.82M
  75 +30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000  2.325 ms  23.25 ns  41.02M
  76 + 175% BM_PThreadRWMutexRead                 100000  4.917 ms  49.17 ns   19.4M
  77 + 166% BM_PThreadRWMutexWrite                100000  4.757 ms  47.57 ns  20.05M
  78
  79 ------------------------------------------------------------------------------
  80 2. Contention Benchmark      90% read  10% write
  81 Benchmark                    hits       average    min       max        sigma
  82 ------------------------------------------------------------------------------
  83 ---------- 8  threads ------------
  84 RWSpinLock       Write       142666     220ns      78ns      40.8us     269ns
  85 RWSpinLock       Read        1282297    222ns      80ns      37.7us     248ns
  86 RWTicketSpinLock Write       85692      209ns      71ns      17.9us     252ns
  87 RWTicketSpinLock Read        769571     215ns      78ns      33.4us     251ns
  88 pthread_rwlock_t Write       84248      2.48us     99ns      269us      8.19us
  89 pthread_rwlock_t Read        761646     933ns      101ns     374us      3.25us
  90
  91 ---------- 16 threads ------------
  92 RWSpinLock       Write       124236     237ns      78ns      261us      801ns
  93 RWSpinLock       Read        1115807    236ns      78ns      2.27ms     2.17us
  94 RWTicketSpinLock Write       81781      231ns      71ns      31.4us     351ns
  95 RWTicketSpinLock Read        734518     238ns      78ns      73.6us     379ns
  96 pthread_rwlock_t Write       83363      7.12us     99ns      785us      28.1us
  97 pthread_rwlock_t Read        754978     2.18us     101ns     1.02ms     14.3us
  98
  99 ---------- 50 threads ------------
 100 RWSpinLock       Write       131142     1.37us     82ns      7.53ms     68.2us
 101 RWSpinLock       Read        1181240    262ns      78ns      6.62ms     12.7us
 102 RWTicketSpinLock Write       83045      397ns      73ns      7.01ms     31.5us
 103 RWTicketSpinLock Read        744133     386ns      78ns        11ms     31.4us
 104 pthread_rwlock_t Write       80849      112us      103ns     4.52ms     263us
 105 pthread_rwlock_t Read        728698     24us       101ns     7.28ms     194us
 106
 107 */
 108
 109 #if defined(__GNUC__) && (defined(__i386) || defined(__x86_64__) || \
 110     defined(ARCH_K8))
 111 #define RW_SPINLOCK_USE_X86_INTRINSIC_
 112 #include <x86intrin.h>
 113 #else
 114 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 115 #endif
 116
 117 #include <atomic>
 118 #include <string>
 119 #include <algorithm>
 120 #include <boost/noncopyable.hpp>
 121
 122 #include <sched.h>
 123 #include <glog/logging.h>
 124
 125 #include "folly/Likely.h"
 126
 127 namespace folly {
 128
 129 /*
 130  * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 131  * a nice writer and don't expect a lot of write/read contention, or
 132  * when you need small rwlocks since you are creating a large number
 133  * of them.
 134  *
 135  * Note that the unfairness here is extreme: if the lock is
 136  * continually accessed for read, writers will never get a chance.  If
 137  * the lock can be that highly contended this class is probably not an
 138  * ideal choice anyway.
 139  *
 140  * It currently implements most of the Lockable, SharedLockable and
 141  * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 142  * interfaces.
 143  */
 144 class RWSpinLock : boost::noncopyable {
 145   enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
 146  public:
 147   RWSpinLock() : bits_(0) {}
 148
 149   // Lockable Concept
 150   void lock() {
 151     int count = 0;
 152     while (!LIKELY(try_lock())) {
 153       if (++count > 1000) sched_yield();
 154     }
 155   }
 156
 157   // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
 158   void unlock() {
 159     static_assert(READER > WRITER + UPGRADED, "wrong bits!");
 160     bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
 161   }
 162
 163   // SharedLockable Concept
 164   void lock_shared() {
 165     int count = 0;
 166     while (!LIKELY(try_lock_shared())) {
 167       if (++count > 1000) sched_yield();
 168     }
 169   }
 170
 171   void unlock_shared() {
 172     bits_.fetch_add(-READER, std::memory_order_release);
 173   }
 174
 175   // Downgrade the lock from writer status to reader status.
 176   void unlock_and_lock_shared() {
 177     bits_.fetch_add(READER, std::memory_order_acquire);
 178     unlock();
 179   }
 180
 181   // UpgradeLockable Concept
 182   void lock_upgrade() {
 183     int count = 0;
 184     while (!try_lock_upgrade()) {
 185       if (++count > 1000) sched_yield();
 186     }
 187   }
 188
 189   void unlock_upgrade() {
 190     bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
 191   }
 192
 193   // unlock upgrade and try to acquire write lock
 194   void unlock_upgrade_and_lock() {
 195     int64_t count = 0;
 196     while (!try_unlock_upgrade_and_lock()) {
 197       if (++count > 1000) sched_yield();
 198     }
 199   }
 200
 201   // unlock upgrade and read lock atomically
 202   void unlock_upgrade_and_lock_shared() {
 203     bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
 204   }
 205
 206   void unlock_shared_and_lock_upgrade() {
 207     lock_upgrade();
 208     unlock_shared();
 209   }
 210
 211   // write unlock and upgrade lock atomically
 212   void unlock_and_lock_upgrade() {
 213     // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
 214     // the same time when other threads are trying do try_lock_upgrade().
 215     bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 216     bits_.fetch_add(-WRITER, std::memory_order_release);
 217   }
 218
 219
 220   // Attempt to acquire writer permission. Return false if we didn't get it.
 221   bool try_lock() {
 222     int32_t expect = 0;
 223     return bits_.compare_exchange_strong(expect, WRITER,
 224       std::memory_order_acq_rel);
 225   }
 226
 227   // Try to get reader permission on the lock. This can fail if we
 228   // find out someone is a writer.
 229   bool try_lock_shared() {
 230     // fetch_add is considerably (100%) faster than compare_exchange,
 231     // so here we are optimizing for the common (lock success) case.
 232     int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
 233     if (UNLIKELY(value & WRITER)) {
 234       bits_.fetch_add(-READER, std::memory_order_release);
 235       return false;
 236     }
 237     return true;
 238   }
 239
 240   // try to unlock upgrade and write lock atomically
 241   bool try_unlock_upgrade_and_lock() {
 242     int32_t expect = UPGRADED;
 243     return bits_.compare_exchange_strong(expect, WRITER,
 244         std::memory_order_acq_rel);
 245   }
 246
 247   // try to acquire an upgradable lock.
 248   bool try_lock_upgrade() {
 249     int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
 250
 251     // Note: when failed, we cannot flip the UPGRADED bit back,
 252     // as in this case there is either another upgrade lock or a write lock.
 253     // If it's a write lock, the bit will get cleared up when that lock's done
 254     // with unlock().
 255     return ((value & (UPGRADED | WRITER)) == 0);
 256   }
 257
 258   // mainly for debugging purposes.
 259   int32_t bits() const { return bits_.load(std::memory_order_acquire); }
 260
 261   class ReadHolder;
 262   class UpgradedHolder;
 263   class WriteHolder;
 264
 265   class ReadHolder {
 266    public:
 267     explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 268       if (lock_) lock_->lock_shared();
 269     }
 270
 271     explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
 272       lock_->lock_shared();
 273     }
 274
 275     ReadHolder(ReadHolder&& other) : lock_(other.lock_) {
 276       other.lock_ = nullptr;
 277     }
 278
 279     // down-grade
 280     explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
 281       upgraded.lock_ = nullptr;
 282       if (lock_) lock_->unlock_upgrade_and_lock_shared();
 283     }
 284
 285     explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
 286       writer.lock_ = nullptr;
 287       if (lock_) lock_->unlock_and_lock_shared();
 288     }
 289
 290     ReadHolder& operator=(ReadHolder&& other) {
 291       using std::swap;
 292       swap(lock_, other.lock_);
 293       return *this;
 294     }
 295
 296     ReadHolder(const ReadHolder& other) = delete;
 297     ReadHolder& operator=(const ReadHolder& other) = delete;
 298
 299     ~ReadHolder() { if (lock_) lock_->unlock_shared(); }
 300
 301     void reset(RWSpinLock* lock = nullptr) {
 302       if (lock == lock_) return;
 303       if (lock_) lock_->unlock_shared();
 304       lock_ = lock;
 305       if (lock_) lock_->lock_shared();
 306     }
 307
 308     void swap(ReadHolder* other) {
 309       std::swap(lock_, other->lock_);
 310     }
 311
 312    private:
 313     friend class UpgradedHolder;
 314     friend class WriteHolder;
 315     RWSpinLock* lock_;
 316   };
 317
 318   class UpgradedHolder {
 319    public:
 320     explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 321       if (lock_) lock_->lock_upgrade();
 322     }
 323
 324     explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
 325       lock_->lock_upgrade();
 326     }
 327
 328     explicit UpgradedHolder(ReadHolder&& reader) {
 329       lock_ = reader.lock_;
 330       reader.lock_ = nullptr;
 331       if (lock_) lock_->unlock_shared_and_lock_upgrade();
 332     }
 333
 334     explicit UpgradedHolder(WriteHolder&& writer) {
 335       lock_ = writer.lock_;
 336       writer.lock_ = nullptr;
 337       if (lock_) lock_->unlock_and_lock_upgrade();
 338     }
 339
 340     UpgradedHolder(UpgradedHolder&& other) : lock_(other.lock_) {
 341       other.lock_ = nullptr;
 342     }
 343
 344     UpgradedHolder& operator =(UpgradedHolder&& other) {
 345       using std::swap;
 346       swap(lock_, other.lock_);
 347       return *this;
 348     }
 349
 350     UpgradedHolder(const UpgradedHolder& other) = delete;
 351     UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
 352
 353     ~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }
 354
 355     void reset(RWSpinLock* lock = nullptr) {
 356       if (lock == lock_) return;
 357       if (lock_) lock_->unlock_upgrade();
 358       lock_ = lock;
 359       if (lock_) lock_->lock_upgrade();
 360     }
 361
 362     void swap(UpgradedHolder* other) {
 363       using std::swap;
 364       swap(lock_, other->lock_);
 365     }
 366
 367    private:
 368     friend class WriteHolder;
 369     friend class ReadHolder;
 370     RWSpinLock* lock_;
 371   };
 372
 373   class WriteHolder {
 374    public:
 375     explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
 376       if (lock_) lock_->lock();
 377     }
 378
 379     explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
 380       lock_->lock();
 381     }
 382
 383     // promoted from an upgrade lock holder
 384     explicit WriteHolder(UpgradedHolder&& upgraded) {
 385       lock_ = upgraded.lock_;
 386       upgraded.lock_ = nullptr;
 387       if (lock_) lock_->unlock_upgrade_and_lock();
 388     }
 389
 390     WriteHolder(WriteHolder&& other) : lock_(other.lock_) {
 391       other.lock_ = nullptr;
 392     }
 393
 394     WriteHolder& operator =(WriteHolder&& other) {
 395       using std::swap;
 396       swap(lock_, other.lock_);
 397       return *this;
 398     }
 399
 400     WriteHolder(const WriteHolder& other) = delete;
 401     WriteHolder& operator =(const WriteHolder& other) = delete;
 402
 403     ~WriteHolder () { if (lock_) lock_->unlock(); }
 404
 405     void reset(RWSpinLock* lock = nullptr) {
 406       if (lock == lock_) return;
 407       if (lock_) lock_->unlock();
 408       lock_ = lock;
 409       if (lock_) lock_->lock();
 410     }
 411
 412     void swap(WriteHolder* other) {
 413       using std::swap;
 414       swap(lock_, other->lock_);
 415     }
 416
 417    private:
 418     friend class ReadHolder;
 419     friend class UpgradedHolder;
 420     RWSpinLock* lock_;
 421   };
 422
 423   // Synchronized<> adaptors
 424   friend void acquireRead(RWSpinLock& l) { return l.lock_shared(); }
 425   friend void acquireReadWrite(RWSpinLock& l) { return l.lock(); }
 426   friend void releaseRead(RWSpinLock& l) { return l.unlock_shared(); }
 427   friend void releaseReadWrite(RWSpinLock& l) { return l.unlock(); }
 428
 429  private:
 430   std::atomic<int32_t> bits_;
 431 };
 432
 433
 434 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 435 // A more balanced Read-Write spin lock implemented based on GCC intrinsics.
 436
 437 namespace detail {
 438 template <size_t kBitWidth> struct RWTicketIntTrait {
 439   static_assert(kBitWidth == 32 || kBitWidth == 64,
 440       "bit width has to be either 32 or 64 ");
 441 };
 442
 443 template <>
 444 struct RWTicketIntTrait<64> {
 445   typedef uint64_t FullInt;
 446   typedef uint32_t HalfInt;
 447   typedef uint16_t QuarterInt;
 448
 449 #ifdef __SSE2__
 450   static __m128i make128(const uint16_t v[4]) {
 451     return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 452   }
 453   static inline __m128i fromInteger(uint64_t from) {
 454     return _mm_cvtsi64_si128(from);
 455   }
 456   static inline uint64_t toInteger(__m128i in) {
 457     return _mm_cvtsi128_si64(in);
 458   }
 459   static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
 460     return toInteger(_mm_add_epi16(in, kDelta));
 461   }
 462 #endif
 463 };
 464
 465 template <>
 466 struct RWTicketIntTrait<32> {
 467   typedef uint32_t FullInt;
 468   typedef uint16_t HalfInt;
 469   typedef uint8_t QuarterInt;
 470
 471 #ifdef __SSE2__
 472   static __m128i make128(const uint8_t v[4]) {
 473     return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 474         0, 0, 0, 0, v[3], v[2], v[1], v[0]);
 475   }
 476   static inline __m128i fromInteger(uint32_t from) {
 477     return _mm_cvtsi32_si128(from);
 478   }
 479   static inline uint32_t toInteger(__m128i in) {
 480     return _mm_cvtsi128_si32(in);
 481   }
 482   static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
 483     return toInteger(_mm_add_epi8(in, kDelta));
 484   }
 485 #endif
 486 };
 487 }  // detail
 488
 489
 490 template<size_t kBitWidth, bool kFavorWriter=false>
 491 class RWTicketSpinLockT : boost::noncopyable {
 492   typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
 493   typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
 494   typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
 495   typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
 496     QuarterInt;
 497
 498   union RWTicket {
 499     FullInt whole;
 500     HalfInt readWrite;
 501     __extension__ struct {
 502       QuarterInt write;
 503       QuarterInt read;
 504       QuarterInt users;
 505     };
 506   } ticket;
 507
 508  private: // Some x64-specific utilities for atomic access to ticket.
 509   template<class T> static T load_acquire(T* addr) {
 510     T t = *addr; // acquire barrier
 511     asm volatile("" : : : "memory");
 512     return t;
 513   }
 514
 515   template<class T>
 516   static void store_release(T* addr, T v) {
 517     asm volatile("" : : : "memory");
 518     *addr = v; // release barrier
 519   }
 520
 521  public:
 522
 523   RWTicketSpinLockT() {
 524     store_release(&ticket.whole, FullInt(0));
 525   }
 526
 527   void lock() {
 528     if (kFavorWriter) {
 529       writeLockAggressive();
 530     } else {
 531       writeLockNice();
 532     }
 533   }
 534
 535   /*
 536    * Both try_lock and try_lock_shared diverge in our implementation from the
 537    * lock algorithm described in the link above.
 538    *
 539    * In the read case, it is undesirable that the readers could wait
 540    * for another reader (before increasing ticket.read in the other
 541    * implementation).  Our approach gives up on
 542    * first-come-first-serve, but our benchmarks showed improve
 543    * performance for both readers and writers under heavily contended
 544    * cases, particularly when the number of threads exceeds the number
 545    * of logical CPUs.
 546    *
 547    * We have writeLockAggressive() using the original implementation
 548    * for a writer, which gives some advantage to the writer over the
 549    * readers---for that path it is guaranteed that the writer will
 550    * acquire the lock after all the existing readers exit.
 551    */
 552   bool try_lock() {
 553     RWTicket t;
 554     FullInt old = t.whole = load_acquire(&ticket.whole);
 555     if (t.users != t.write) return false;
 556     ++t.users;
 557     return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
 558   }
 559
 560   /*
 561    * Call this if you want to prioritize writer to avoid starvation.
 562    * Unlike writeLockNice, immediately acquires the write lock when
 563    * the existing readers (arriving before the writer) finish their
 564    * turns.
 565    */
 566   void writeLockAggressive() {
 567     // sched_yield() is needed here to avoid a pathology if the number
 568     // of threads attempting concurrent writes is >= the number of real
 569     // cores allocated to this process. This is less likely than the
 570     // corresponding situation in lock_shared(), but we still want to
 571     // avoid it
 572     int count = 0;
 573     QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
 574     while (val != load_acquire(&ticket.write)) {
 575       asm volatile("pause");
 576       if (UNLIKELY(++count > 1000)) sched_yield();
 577     }
 578   }
 579
 580   // Call this when the writer should be nicer to the readers.
 581   void writeLockNice() {
 582     // Here it doesn't cpu-relax the writer.
 583     //
 584     // This is because usually we have many more readers than the
 585     // writers, so the writer has less chance to get the lock when
 586     // there are a lot of competing readers.  The aggressive spinning
 587     // can help to avoid starving writers.
 588     //
 589     // We don't worry about sched_yield() here because the caller
 590     // has already explicitly abandoned fairness.
 591     while (!try_lock()) {}
 592   }
 593
 594   // Atomically unlock the write-lock from writer and acquire the read-lock.
 595   void unlock_and_lock_shared() {
 596     QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
 597   }
 598
 599   // Release writer permission on the lock.
 600   void unlock() {
 601     RWTicket t;
 602     t.whole = load_acquire(&ticket.whole);
 603     FullInt old = t.whole;
 604
 605 #ifdef __SSE2__
 606     // SSE2 can reduce the lock and unlock overhead by 10%
 607     static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 };   // write/read/user
 608     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 609     __m128i m = IntTraitType::fromInteger(old);
 610     t.whole = IntTraitType::addParallel(m, kDelta);
 611 #else
 612     ++t.read;
 613     ++t.write;
 614 #endif
 615     store_release(&ticket.readWrite, t.readWrite);
 616   }
 617
 618   void lock_shared() {
 619     // sched_yield() is important here because we can't grab the
 620     // shared lock if there is a pending writeLockAggressive, so we
 621     // need to let threads that already have a shared lock complete
 622     int count = 0;
 623     while (!LIKELY(try_lock_shared())) {
 624       asm volatile("pause");
 625       if (UNLIKELY((++count & 1023) == 0)) sched_yield();
 626     }
 627   }
 628
 629   bool try_lock_shared() {
 630     RWTicket t, old;
 631     old.whole = t.whole = load_acquire(&ticket.whole);
 632     old.users = old.read;
 633 #ifdef  __SSE2__
 634     // SSE2 may reduce the total lock and unlock overhead by 10%
 635     static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 };   // write/read/user
 636     static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
 637     __m128i m = IntTraitType::fromInteger(old.whole);
 638     t.whole = IntTraitType::addParallel(m, kDelta);
 639 #else
 640     ++t.read;
 641     ++t.users;
 642 #endif
 643     return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
 644   }
 645
 646   void unlock_shared() {
 647     QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
 648   }
 649
 650   class WriteHolder;
 651
 652   typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
 653   class ReadHolder : boost::noncopyable {
 654    public:
 655     explicit ReadHolder(RWSpinLock *lock = nullptr) :
 656       lock_(lock) {
 657       if (lock_) lock_->lock_shared();
 658     }
 659
 660     explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
 661       if (lock_) lock_->lock_shared();
 662     }
 663
 664     // atomically unlock the write-lock from writer and acquire the read-lock
 665     explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
 666       std::swap(this->lock_, writer->lock_);
 667       if (lock_) {
 668         lock_->unlock_and_lock_shared();
 669       }
 670     }
 671
 672     ~ReadHolder() {
 673       if (lock_) lock_->unlock_shared();
 674     }
 675
 676     void reset(RWSpinLock *lock = nullptr) {
 677       if (lock_) lock_->unlock_shared();
 678       lock_ = lock;
 679       if (lock_) lock_->lock_shared();
 680     }
 681
 682     void swap(ReadHolder *other) {
 683       std::swap(this->lock_, other->lock_);
 684     }
 685
 686    private:
 687     RWSpinLock *lock_;
 688   };
 689
 690   class WriteHolder : boost::noncopyable {
 691    public:
 692     explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
 693       if (lock_) lock_->lock();
 694     }
 695     explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
 696       if (lock_) lock_->lock();
 697     }
 698
 699     ~WriteHolder() {
 700       if (lock_) lock_->unlock();
 701     }
 702
 703     void reset(RWSpinLock *lock = nullptr) {
 704       if (lock == lock_) return;
 705       if (lock_) lock_->unlock();
 706       lock_ = lock;
 707       if (lock_) lock_->lock();
 708     }
 709
 710     void swap(WriteHolder *other) {
 711       std::swap(this->lock_, other->lock_);
 712     }
 713
 714    private:
 715     friend class ReadHolder;
 716     RWSpinLock *lock_;
 717   };
 718
 719   // Synchronized<> adaptors.
 720   friend void acquireRead(RWTicketSpinLockT& mutex) {
 721     mutex.lock_shared();
 722   }
 723   friend void acquireReadWrite(RWTicketSpinLockT& mutex) {
 724     mutex.lock();
 725   }
 726   friend bool acquireReadWrite(RWTicketSpinLockT& mutex,
 727                                unsigned int milliseconds) {
 728     mutex.lock();
 729     return true;
 730   }
 731   friend void releaseRead(RWTicketSpinLockT& mutex) {
 732     mutex.unlock_shared();
 733   }
 734   friend void releaseReadWrite(RWTicketSpinLockT& mutex) {
 735     mutex.unlock();
 736   }
 737 };
 738
 739 typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
 740 typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
 741
 742 #endif  // RW_SPINLOCK_USE_X86_INTRINSIC_
 743
 744 }  // namespace folly
 745
 746 #ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
 747 #undef RW_SPINLOCK_USE_X86_INTRINSIC_
 748 #endif
 749
 750 #endif  // FOLLY_RWSPINLOCK_H_