2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
25 #include <folly/Likely.h>
26 #include <folly/detail/Futex.h>
27 #include <folly/detail/MemoryIdler.h>
28 #include <folly/portability/Asm.h>
32 /// A Baton allows a thread to block once and be awoken. The single
33 /// poster version (with SinglePoster == true) captures a single
34 /// handoff, and during its lifecycle (from construction/reset to
35 /// destruction/reset) a baton must either be post()ed and wait()ed
36 /// exactly once each, or not at all.
38 /// The multi-poster version (SinglePoster == false) allows multiple
39 /// concurrent handoff attempts, the first of which completes the
40 /// handoff and the rest if any are idempotent.
42 /// Baton includes no internal padding, and is only 4 bytes in size.
43 /// Any alignment or padding to avoid false sharing is up to the user.
45 /// This is basically a stripped-down semaphore that supports (only a
46 /// single call to sem_post, when SinglePoster == true) and a single
49 /// The non-blocking version (Blocking == false) provides more speed
50 /// by using only load acquire and store release operations in the
51 /// critical path, at the cost of disallowing blocking and timing out.
53 /// The current posix semaphore sem_t isn't too bad, but this provides
54 * a bit more speed, inlining, smaller size, a guarantee that
55 /// the implementation won't change, and compatibility with
56 /// DeterministicSchedule. By having a much more restrictive
57 /// lifecycle we can also add a bunch of assertions that can help to
58 /// catch race conditions ahead of time.
60 template <typename> class Atom = std::atomic,
61 bool SinglePoster = true, // single vs multiple posters
62 bool Blocking = true> // blocking vs spinning
/// Initializes state_ to INIT.  The fenceless (relaxed) initialization
/// is safe: knowledge of this Baton's existence must itself reach other
/// threads through some synchronization mechanism, which also publishes
/// the initial state (see the discussion in reset()).
64 constexpr Baton() : state_(INIT) {}
// A Baton is identified by the address of its futex word, so copying
// it is meaningless; both copy construction and copy assignment are
// explicitly disallowed.
66 Baton(Baton const&) = delete;
67 Baton& operator=(Baton const&) = delete;
69 /// It is an error to destroy a Baton on which a thread is currently
70 /// wait()ing. In practice this means that the waiter usually takes
71 /// responsibility for destroying the Baton.
73 // The docblock for this function says that it can't be called when
74 // there is a concurrent waiter. We assume a strong version of this
75 // requirement in which the caller must _know_ that this is true, they
76 // are not allowed to be merely lucky. If two threads are involved,
77 // the destroying thread must actually have synchronized with the
78 // waiting thread after wait() returned. To convey causality the
79 // waiting thread must have used release semantics and the destroying
80 // thread must have used acquire semantics for that communication,
81 // so we are guaranteed to see the post-wait() value of state_,
82 // which cannot be WAITING.
84 // Note that since we only care about a single memory location,
85 // the only two plausible memory orders here are relaxed and seq_cst.
86 assert(state_.load(std::memory_order_relaxed) != WAITING);
89 /// Equivalent to destroying the Baton and creating a new one. It is
90 /// a bug to call this while there is a waiting thread, so in practice
91 /// the waiter will be the one that resets the baton.
93 // See ~Baton for a discussion about why relaxed is okay here
94 assert(state_.load(std::memory_order_relaxed) != WAITING);
96 // We use a similar argument to justify the use of a relaxed store
97 // here. Since both wait() and post() are required to be called
98 // only once per lifetime, no thread can actually call those methods
99 // correctly after a reset() unless it synchronizes with the thread
100 // that performed the reset(). If a post() or wait() on another thread
101 // didn't synchronize, then regardless of what operation we performed
102 // here there would be a race on proper use of the Baton's spec
103 // (although not on any particular load and store). Put another way,
104 // we don't need to synchronize here because anybody that might rely
105 // on such synchronization is required by the baton rules to perform
106 // an additional synchronization that has the desired effect anyway.
108 // There is actually a similar argument to be made about the
109 // constructor, in which the fenceless constructor initialization
110 // of state_ is piggybacked on whatever synchronization mechanism
111 // distributes knowledge of the Baton's existence
112 state_.store(INIT, std::memory_order_relaxed);
115 /// Causes wait() to wake up. For each lifetime of a Baton (where a
116 /// lifetime starts at construction or reset() and ends at
117 /// destruction or reset()) there can be at most one call to post(),
118 /// in the single poster version. Any thread may call post().
121 /// Non-blocking version
124 auto state = state_.load(std::memory_order_relaxed);
125 return (state == INIT || state == EARLY_DELIVERY);
127 state_.store(EARLY_DELIVERY, std::memory_order_release);
131 /// Blocking versions
134 /// Single poster version
136 uint32_t before = state_.load(std::memory_order_acquire);
138 assert(before == INIT || before == WAITING || before == TIMED_OUT);
140 if (before == INIT &&
141 state_.compare_exchange_strong(before, EARLY_DELIVERY)) {
145 assert(before == WAITING || before == TIMED_OUT);
147 if (before == TIMED_OUT) {
151 assert(before == WAITING);
152 state_.store(LATE_DELIVERY, std::memory_order_release);
155 /// Multi-poster version
158 uint32_t before = state_.load(std::memory_order_acquire);
160 if (before == INIT &&
161 state_.compare_exchange_strong(before, EARLY_DELIVERY)) {
165 if (before == TIMED_OUT) {
169 if (before == EARLY_DELIVERY || before == LATE_DELIVERY) {
170 // The reason for not simply returning (without the following
171 // atomic operation) is to avoid the following case:
174 // local1.post(); local2.post(); global.wait();
175 // global.post(); global.post(); local1.try_wait() == true;
176 // local2.try_wait() == false;
178 if (state_.fetch_add(0) != before) {
184 assert(before == WAITING);
185 if (!state_.compare_exchange_weak(before, LATE_DELIVERY)) {
194 /// Waits until post() has been called in the current Baton lifetime.
195 /// May be called at most once during a Baton lifetime (construction
196 /// |reset until destruction|reset). If post is called before wait in
197 /// the current lifetime then this method returns immediately.
199 /// The restriction that there can be at most one wait() per lifetime
200 /// could be relaxed somewhat without any perf or size regressions,
201 /// but by making this condition very restrictive we can provide better
202 /// checking in debug builds.
203 FOLLY_ALWAYS_INLINE void wait() {
211 /// Similar to wait, but doesn't block the thread if it hasn't been posted.
213 /// try_wait has the following semantics:
214 /// - It is ok to call try_wait any number times on the same baton until
215 /// try_wait reports that the baton has been posted.
216 /// - It is ok to call timed_wait or wait on the same baton if try_wait
217 /// reports that baton hasn't been posted.
218 /// - If try_wait indicates that the baton has been posted, it is invalid to
219 /// call wait, try_wait or timed_wait on the same baton without resetting
221 /// @return true if baton has been posted, false otherwise
222 FOLLY_ALWAYS_INLINE bool try_wait() const {
223 auto s = state_.load(std::memory_order_acquire);
224 assert(s == INIT || s == EARLY_DELIVERY);
225 return LIKELY(s == EARLY_DELIVERY);
228 /// Similar to wait, but with a timeout. The thread is unblocked if the
230 /// Note: Only a single call to wait/try_wait_for/try_wait_until is allowed
231 /// during a baton's life-cycle (from ctor/reset to dtor/reset). In other
232 /// words, after try_wait_for the caller can't invoke
233 /// wait/try_wait/try_wait_for/try_wait_until
234 /// again on the same baton without resetting it.
236 /// @param timeout Time until which the thread can block
237 /// @return true if the baton was posted to before timeout,
239 template <typename Rep, typename Period>
240 FOLLY_ALWAYS_INLINE bool try_wait_for(
241 const std::chrono::duration<Rep, Period>& timeout) {
243 Blocking, "Non-blocking Baton does not support try_wait_for.");
249 auto deadline = std::chrono::steady_clock::now() + timeout;
250 return tryWaitUntilSlow(deadline);
253 /// Similar to wait, but with a deadline. The thread is unblocked if the
254 /// deadline expires.
255 /// Note: Only a single call to wait/try_wait_for/try_wait_until is allowed
256 /// during a baton's life-cycle (from ctor/reset to dtor/reset). In other
257 /// words, after try_wait_until the caller can't invoke
258 /// wait/try_wait/try_wait_for/try_wait_until
259 /// again on the same baton without resetting it.
261 /// @param deadline Time until which the thread can block
262 /// @return true if the baton was posted to before deadline,
264 template <typename Clock, typename Duration>
265 FOLLY_ALWAYS_INLINE bool try_wait_until(
266 const std::chrono::time_point<Clock, Duration>& deadline) {
268 Blocking, "Non-blocking Baton does not support try_wait_until.");
274 return tryWaitUntilSlow(deadline);
277 /// Alias to try_wait_for. Deprecated.
278 template <typename Rep, typename Period>
279 FOLLY_ALWAYS_INLINE bool timed_wait(
280 const std::chrono::duration<Rep, Period>& timeout) {
281 return try_wait_for(timeout);
284 /// Alias to try_wait_until. Deprecated.
285 template <typename Clock, typename Duration>
286 FOLLY_ALWAYS_INLINE bool timed_wait(
287 const std::chrono::time_point<Clock, Duration>& deadline) {
288 return try_wait_until(deadline);
292 enum State : uint32_t {
301 // Must be positive. If multiple threads are actively using a
302 // higher-level data structure that uses batons internally, it is
303 // likely that the post() and wait() calls happen almost at the same
304 // time. In this state, we lose big 50% of the time if the wait goes
305 // to sleep immediately. On circa-2013 devbox hardware it costs about
306 // 7 usec to FUTEX_WAIT and then be awoken (half the t/iter as the
307 // posix_sem_pingpong test in BatonTests). We can improve our chances
308 // of EARLY_DELIVERY by spinning for a bit, although we have to balance
309 // this against the loss if we end up sleeping any way. Spins on this
310 // hw take about 7 nanos (all but 0.5 nanos is the pause instruction).
311 // We give ourselves 300 spins, which is about 2 usec of waiting. As a
312 // partial consolation, since we are using the pause instruction we
313 // are giving a speed boost to the colocated hyperthread.
314 PreBlockAttempts = 300,
317 // Spin for "some time" (see discussion on PreBlockAttempts) waiting
320 // @return true if we received an early delivery during the wait,
321 // false otherwise. If the function returns true then
322 // state_ is guaranteed to be EARLY_DELIVERY
323 bool spinWaitForEarlyDelivery() {
325 PreBlockAttempts > 0,
326 "isn't this assert clearer than an uninitialized variable warning?");
327 for (int i = 0; i < PreBlockAttempts; ++i) {
332 // The pause instruction is the polite way to spin, but it doesn't
333 // actually affect correctness to omit it if we don't have it.
334 // Pausing donates the full capabilities of the current core to
335 // its other hyperthreads for a dozen cycles or so
336 asm_volatile_pause();
342 FOLLY_NOINLINE void waitSlow() {
343 if (spinWaitForEarlyDelivery()) {
344 assert(state_.load(std::memory_order_acquire) == EARLY_DELIVERY);
349 while (!try_wait()) {
350 std::this_thread::yield();
355 // guess we have to block :(
356 uint32_t expected = INIT;
357 if (!state_.compare_exchange_strong(expected, WAITING)) {
358 // CAS failed, last minute reprieve
359 assert(expected == EARLY_DELIVERY);
364 detail::MemoryIdler::futexWait(state_, WAITING);
366 // state_ is the truth even if FUTEX_WAIT reported a matching
367 // FUTEX_WAKE, since we aren't using type-stable storage and we
368 // don't guarantee reuse. The scenario goes like this: thread
369 // A's last touch of a Baton is a call to wake(), which stores
370 // LATE_DELIVERY and gets an unlucky context switch before delivering
371 // the corresponding futexWake. Thread B sees LATE_DELIVERY
372 // without consuming a futex event, because it calls futexWait
373 // with an expected value of WAITING and hence doesn't go to sleep.
374 // B returns, so the Baton's memory is reused and becomes another
375 // Baton (or a reuse of this one). B calls futexWait on the new
376 // Baton lifetime, then A wakes up and delivers a spurious futexWake
377 // to the same memory location. B's futexWait will then report a
378 // consumed wake event even though state_ is still WAITING.
380 // It would be possible to add an extra state_ dance to communicate
381 // that the futexWake has been sent so that we can be sure to consume
382 // it before returning, but that would be a perf and complexity hit.
383 uint32_t s = state_.load(std::memory_order_acquire);
384 assert(s == WAITING || s == LATE_DELIVERY);
386 if (s == LATE_DELIVERY) {
393 template <typename Clock, typename Duration>
394 FOLLY_NOINLINE bool tryWaitUntilSlow(
395 const std::chrono::time_point<Clock, Duration>& deadline) {
396 if (spinWaitForEarlyDelivery()) {
397 assert(state_.load(std::memory_order_acquire) == EARLY_DELIVERY);
401 // guess we have to block :(
402 uint32_t expected = INIT;
403 if (!state_.compare_exchange_strong(expected, WAITING)) {
404 // CAS failed, last minute reprieve
405 assert(expected == EARLY_DELIVERY);
410 auto rv = state_.futexWaitUntil(WAITING, deadline);
411 if (rv == folly::detail::FutexResult::TIMEDOUT) {
412 state_.store(TIMED_OUT, std::memory_order_release);
416 uint32_t s = state_.load(std::memory_order_acquire);
417 assert(s == WAITING || s == LATE_DELIVERY);
418 if (s == LATE_DELIVERY) {
424 detail::Futex<Atom> state_;