2 * Copyright 2016 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
22 #include <folly/detail/Futex.h>
23 #include <folly/Portability.h>
25 #if defined(__clang__)
26 #define NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
28 #define NO_SANITIZE_ADDRESS
34 * Tiny exclusive lock that packs four lock slots into a single
35 * byte. Each slot is an independent real, sleeping lock. The default
36 * lock and unlock functions operate on slot zero, which modifies only
37 * the low two bits of the host byte.
39 * You should zero-initialize the bits of a MicroLock that you intend
42 * If you're not space-constrained, prefer std::mutex, which will
43 * likely be faster, since it has more than two bits of information to
46 * You are free to put a MicroLock in a union with some other object.
47 * If, for example, you want to use the bottom two bits of a pointer
48 * as a lock, you can put a MicroLock in a union with the pointer and
49 * limit yourself to MicroLock slot zero, which will use the two
50 * least-significant bits in the bottom byte.
52 * (Note that such a union is safe only because MicroLock is based on
53 * a character type, and even under a strict interpretation of C++'s
54 * aliasing rules, character types may alias anything.)
56 * MicroLock uses a dirty trick: it actually operates on the full
57 * word-size, word-aligned bit of memory into which it is embedded.
58 * It never modifies bits outside the ones it's defined to modify, but
59 * it _accesses_ all the bits in the word for purposes of
62 * The MaxSpins template parameter controls the number of times we
63 * spin trying to acquire the lock. MaxYields controls the number of
64 * times we call sched_yield; once we've tried to acquire the lock
65 * MaxSpins + MaxYields times, we sleep on the lock futex.
66 * By adjusting these parameters, you can make MicroLock behave as
67 * much or as little like a conventional spinlock as you'd like.
72 * With the default template options, the timings for uncontended
73 * acquire-then-release come out as follows on Intel(R) Xeon(R) CPU
74 * E5-2660 0 @ 2.20GHz, in @mode/opt, as of the master tree at Tue, 01
77 * ========================================================================
78 * folly/test/SmallLocksBenchmark.cpp relative time/iter iters/s
79 * ========================================================================
80 * MicroSpinLockUncontendedBenchmark 13.46ns 74.28M
81 * PicoSpinLockUncontendedBenchmark 14.99ns 66.71M
82 * MicroLockUncontendedBenchmark 27.06ns 36.96M
83 * StdMutexUncontendedBenchmark 25.18ns 39.72M
84 * VirtualFunctionCall 1.72ns 579.78M
85 * ========================================================================
87 * (The virtual dispatch benchmark is provided for scale.)
89 * The contended case for MicroLock is likely to be worse compared to
90 * std::mutex than the contended case is. Make sure to benchmark your
91 * particular workload.
97 #if defined(__SANITIZE_ADDRESS__) && !defined(__clang__) && \
98 (defined(__GNUC__) || defined(__GNUG__))
103 inline detail::Futex<>* word() const;
104 inline uint32_t baseShift(unsigned slot) const;
105 inline uint32_t heldBit(unsigned slot) const;
106 inline uint32_t waitBit(unsigned slot) const;
107 static void lockSlowPath(uint32_t oldWord,
108 detail::Futex<>* wordPtr,
109 uint32_t slotHeldBit,
114 inline void unlock(unsigned slot) NO_SANITIZE_ADDRESS;
115 inline void unlock() { unlock(0); }
116 // Initializes all the slots.
117 inline void init() { lock_ = 0; }
120 inline detail::Futex<>* MicroLockCore::word() const {
121 uintptr_t lockptr = (uintptr_t)&lock_;
122 lockptr &= ~(sizeof(uint32_t) - 1);
123 return (detail::Futex<>*)lockptr;
126 inline unsigned MicroLockCore::baseShift(unsigned slot) const {
127 assert(slot < CHAR_BIT / 2);
129 unsigned offset_bytes = (unsigned)((uintptr_t)&lock_ - (uintptr_t)word());
131 return kIsLittleEndian
132 ? offset_bytes * CHAR_BIT + slot * 2
133 : CHAR_BIT * (sizeof(uint32_t) - offset_bytes - 1) + slot * 2;
136 inline uint32_t MicroLockCore::heldBit(unsigned slot) const {
137 return 1U << (baseShift(slot) + 0);
140 inline uint32_t MicroLockCore::waitBit(unsigned slot) const {
141 return 1U << (baseShift(slot) + 1);
144 void MicroLockCore::unlock(unsigned slot) {
145 detail::Futex<>* wordPtr = word();
149 oldWord = wordPtr->load(std::memory_order_relaxed);
151 assert(oldWord & heldBit(slot));
152 newWord = oldWord & ~(heldBit(slot) | waitBit(slot));
153 } while (!wordPtr->compare_exchange_weak(
154 oldWord, newWord, std::memory_order_release, std::memory_order_relaxed));
156 if (oldWord & waitBit(slot)) {
157 // We don't track the number of waiters, so wake everyone
158 (void)wordPtr->futexWake(std::numeric_limits<int>::max(), heldBit(slot));
162 template <unsigned MaxSpins = 1000, unsigned MaxYields = 0>
163 class MicroLockBase : public MicroLockCore {
165 inline void lock(unsigned slot) NO_SANITIZE_ADDRESS;
166 inline void lock() { lock(0); }
167 inline bool try_lock(unsigned slot) NO_SANITIZE_ADDRESS;
168 inline bool try_lock() { return try_lock(0); }
171 template <unsigned MaxSpins, unsigned MaxYields>
172 bool MicroLockBase<MaxSpins, MaxYields>::try_lock(unsigned slot) {
174 // N.B. You might think that try_lock is just the fast path of lock,
175 // but you'd be wrong. Keep in mind that other parts of our host
176 // word might be changing while we take the lock! We're not allowed
177 // to fail spuriously if the lock is in fact not held, even if other
178 // people are concurrently modifying other parts of the word.
180 // We need to loop until we either see firm evidence that somebody
181 // else has the lock (by looking at heldBit) or see our CAS succeed.
182 // A failed CAS by itself does not indicate lock-acquire failure.
184 detail::Futex<>* wordPtr = word();
185 uint32_t oldWord = wordPtr->load(std::memory_order_relaxed);
187 if (oldWord & heldBit(slot)) {
190 } while (!wordPtr->compare_exchange_weak(oldWord,
191 oldWord | heldBit(slot),
192 std::memory_order_acquire,
193 std::memory_order_relaxed));
198 template <unsigned MaxSpins, unsigned MaxYields>
199 void MicroLockBase<MaxSpins, MaxYields>::lock(unsigned slot) {
201 static_assert(MaxSpins + MaxYields < (unsigned)-1, "overflow");
203 detail::Futex<>* wordPtr = word();
205 oldWord = wordPtr->load(std::memory_order_relaxed);
206 if ((oldWord & heldBit(slot)) == 0 &&
207 wordPtr->compare_exchange_weak(oldWord,
208 oldWord | heldBit(slot),
209 std::memory_order_acquire,
210 std::memory_order_relaxed)) {
211 // Fast uncontended case: memory_order_acquire above is our barrier
213 // lockSlowPath doesn't have any slot-dependent computation; it
214 // just shifts the input bit. Make sure its shifting produces the
215 // same result a call to waitBit for our slot would.
216 assert(heldBit(slot) << 1 == waitBit(slot));
217 // lockSlowPath emits its own memory barrier
218 lockSlowPath(oldWord, wordPtr, heldBit(slot), MaxSpins, MaxYields);
222 typedef MicroLockBase<> MicroLock;