/*
 * Copyright 2017-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <atomic>
#include <chrono>
#include <memory>

#include <glog/logging.h>

#include <folly/concurrency/CacheLocality.h>
#include <folly/experimental/hazptr/hazptr.h>
#include <folly/synchronization/SaturatingSemaphore.h>

namespace folly {
/// UnboundedQueue supports a variety of options for unbounded
/// dynamically expanding and shrinking queues, including variations of:
/// - Single vs. multiple producers
/// - Single vs. multiple consumers
/// - Blocking vs. spin-waiting
/// - Non-waiting, timed, and waiting consumer operations.
/// Producer operations never wait or fail (unless out-of-memory).
///
/// Template parameters:
/// - T: element type
/// - SingleProducer: true if there can be only one producer at a
///   time.
/// - SingleConsumer: true if there can be only one consumer at a
///   time.
/// - MayBlock: true if consumers may block, false if they only
///   spin. A performance tuning parameter.
/// - LgSegmentSize (default 8): Log base 2 of number of elements per
///   segment. A performance tuning parameter. See below.
/// - LgAlign (default 7): Log base 2 of alignment directive; can be
///   used to balance scalability (avoidance of false sharing) with
///   memory efficiency.
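///
/// For example (illustrative instantiations only, using the aliases
/// defined at the end of this file):
///   UMPSCQueue<int, false> a;    // spin-only, 2^8 = 256 entries/segment
///   UMPMCQueue<int, true, 10> b; // may block, 2^10 = 1024 entries/segment
///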
/// When to use UnboundedQueue:
/// - If a small bound may lead to deadlock or performance degradation
///   under bursty patterns.
/// - If there is no risk of the queue growing too much.
///
/// When not to use UnboundedQueue:
/// - If there is a risk of the queue growing too much and a large bound
///   is acceptable, then use DynamicBoundedQueue.
/// - If the queue must not allocate on enqueue or it must have a
///   small bound, then use fixed-size MPMCQueue or (if non-blocking
///   SPSC) ProducerConsumerQueue.
///
/// Template Aliases:
///   USPSCQueue<T, MayBlock, LgSegmentSize, LgAlign>
///   UMPSCQueue<T, MayBlock, LgSegmentSize, LgAlign>
///   USPMCQueue<T, MayBlock, LgSegmentSize, LgAlign>
///   UMPMCQueue<T, MayBlock, LgSegmentSize, LgAlign>
///
/// Functions:
///   Producer operations never wait or fail (unless OOM):
///     void enqueue(const T&);
///     void enqueue(T&&);
///       Adds an element to the end of the queue.
///
///   Consumer operations:
///     void dequeue(T&);
///       Extracts an element from the front of the queue. Waits
///       until an element is available if needed.
///     bool try_dequeue(T&);
///       Tries to extract an element from the front of the queue
///       if available. Returns true if successful, false otherwise.
///     bool try_dequeue_until(T&, time_point& deadline);
///       Tries to extract an element from the front of the queue
///       if available until the specified deadline. Returns true
///       if successful, false otherwise.
///     bool try_dequeue_for(T&, duration&);
///       Tries to extract an element from the front of the queue
///       if one becomes available within the specified duration.
///       Returns true if successful, false otherwise.
///
/// Secondary functions:
///   size_t size();
///     Returns an estimate of the size of the queue.
///   bool empty();
///     Returns true only if the queue was empty during the call.
///   Note: size() and empty() are guaranteed to be accurate only if
///   the queue is not changed concurrently.
///
/// Usage example:
///   /* UMPSC, doesn't block, 1024 int elements per segment */
///   UMPSCQueue<int, false, 10> q;
///   q.enqueue(1);
///   q.enqueue(2);
///   q.enqueue(3);
///   ASSERT_FALSE(q.empty());
///   ASSERT_EQ(q.size(), 3);
///   int v;
///   q.dequeue(v);
///   ASSERT_EQ(v, 1);
///   ASSERT_TRUE(q.try_dequeue(v));
///   ASSERT_EQ(v, 2);
///   ASSERT_TRUE(q.try_dequeue_until(v, steady_clock::now() + seconds(1)));
///   ASSERT_EQ(v, 3);
///   ASSERT_TRUE(q.empty());
///   ASSERT_EQ(q.size(), 0);
///   ASSERT_FALSE(q.try_dequeue(v));
///   ASSERT_FALSE(q.try_dequeue_for(v, microseconds(100)));
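///
/// A minimal multi-threaded sketch (an illustration, not part of the
/// API; it assumes the blocking MPMC variant and omits error handling):
///   UMPMCQueue<int, true> q;
///   std::thread producer([&] {
///     for (int i = 0; i < 1000; ++i) {
///       q.enqueue(i); // never waits for consumers, never fails (unless OOM)
///     }
///   });
///   std::thread consumer([&] {
///     int v;
///     for (int i = 0; i < 1000; ++i) {
///       q.dequeue(v); // waits until an element is available
///     }
///   });
///   producer.join();
///   consumer.join();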
///
/// Design:
/// - The queue is composed of one or more segments. Each segment has
///   a fixed size of 2^LgSegmentSize entries. Each segment is used
///   exactly once.
/// - Each entry is composed of a futex and a single element.
/// - The queue contains two 64-bit ticket variables. The producer
///   ticket counts the number of producer tickets issued so far, and
///   the same for the consumer ticket. Each ticket number corresponds
///   to a specific entry in a specific segment (see the example after
///   this list).
/// - The queue maintains two pointers, head and tail. Head points to
///   the segment that corresponds to the current consumer
///   ticket. Similarly, the tail pointer points to the segment that
///   corresponds to the producer ticket.
/// - Segments are organized as a singly linked list.
/// - The producer with the first ticket in the current producer
///   segment is solely responsible for allocating and linking the
///   next segment.
/// - The producer with the last ticket in the current producer
///   segment is solely responsible for advancing the tail pointer to
///   the next segment.
/// - Similarly, the consumer with the last ticket in the current
///   consumer segment is solely responsible for advancing the head
///   pointer to the next segment. It must ensure that head never
///   overtakes tail.
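///
/// For example (a sketch of the mapping only; the actual in-segment
/// position is additionally permuted by the odd Stride constant, see
/// index() below): with LgSegmentSize = 8, ticket 260 belongs to the
/// segment whose minTicket() is 256 and, before striding, to entry
/// 260 - 256 = 4 within it.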
///
/// Memory Usage:
/// - An empty queue contains one segment. A nonempty queue contains
///   one or two more segments than fit its contents.
/// - Removed segments are not reclaimed until no threads, producers
///   or consumers, hold references to them or their predecessors.
///   That is, a lagging thread may delay the reclamation of a chain
///   of removed segments.
/// - The template parameter LgAlign can be used to reduce memory usage
///   at the cost of increased chance of false sharing.
///
/// Performance considerations:
/// - All operations take constant time, excluding the costs of
///   allocation, reclamation, interference from other threads, and
///   waiting for actions by other threads.
/// - In general, using the single producer and/or single consumer
///   variants yields better performance than the MP and MC
///   alternatives.
/// - SPSC without blocking is the fastest configuration. It doesn't
///   include any read-modify-write atomic operations, full fences, or
///   system calls in the critical path.
/// - MP adds a fetch_add to the critical path of each producer operation.
/// - MC adds a fetch_add or compare_exchange to the critical path of
///   each consumer operation.
/// - The possibility of consumers blocking, even if they never do,
///   adds a compare_exchange to the critical path of each producer
///   operation.
/// - MPMC, SPMC, MPSC require the use of a deferred reclamation
///   mechanism to guarantee that segments removed from the linked
///   list, i.e., unreachable from the head pointer, are reclaimed
///   only after they are no longer needed by any lagging producers or
///   consumers.
/// - The overheads of segment allocation and reclamation are intended
///   to be mostly out of the critical path of the queue's throughput.
/// - If the template parameter LgSegmentSize is changed, it should be
///   set adequately high to keep the amortized cost of allocation and
///   reclamation low.
/// - Another consideration is that the queue is guaranteed to have
///   enough space for a number of consumers equal to 2^LgSegmentSize
///   for local blocking. Excess waiting consumers spin.
/// - It is recommended to measure performance with different variants
///   when applicable, e.g., UMPMC vs UMPSC (see the sketch after this
///   list). Depending on the use case, sometimes the variant with the
///   higher sequential overhead may yield better results due to, for
///   example, more favorable producer-consumer balance or favorable
///   timing for avoiding contention.
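///
/// For example, a measurement harness can keep the variant behind a
/// single alias so that only one line changes between runs (an
/// illustrative sketch; "Task" stands for any element type):
///   using TaskQueue = UMPMCQueue<Task, true>; // vs. UMPSCQueue<Task, true>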

template <
    typename T,
    bool SingleProducer,
    bool SingleConsumer,
    bool MayBlock,
    size_t LgSegmentSize = 8,
    size_t LgAlign = 7,
    template <typename> class Atom = std::atomic>
class UnboundedQueue {
  using Ticket = uint64_t;
  class Entry;
  class Segment;

  static constexpr bool SPSC = SingleProducer && SingleConsumer;
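  // Stride is odd and thus coprime with the power-of-two SegmentSize, so
  // multiplying tickets by it permutes the entry indexes within a segment
  // (see index() below); this spreads concurrently active producers and
  // consumers away from adjacent entries to reduce false sharing.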
  static constexpr size_t Stride = SPSC || (LgSegmentSize <= 1) ? 1 : 27;
  static constexpr size_t SegmentSize = 1u << LgSegmentSize;
  static constexpr size_t Align = 1u << LgAlign;

  static_assert(
      std::is_nothrow_destructible<T>::value,
      "T must be nothrow_destructible");
  static_assert((Stride & 1) == 1, "Stride must be odd");
  static_assert(LgSegmentSize < 32, "LgSegmentSize must be < 32");
  static_assert(LgAlign < 16, "LgAlign must be < 16");

  alignas(Align) Atom<Segment*> head_;
  Atom<Ticket> consumerTicket_;

  alignas(Align) Atom<Segment*> tail_;
  Atom<Ticket> producerTicket_;

 public:
  /** constructor */
  UnboundedQueue() {
    setProducerTicket(0);
    setConsumerTicket(0);
    Segment* s = new Segment(0);
    setTail(s);
    setHead(s);
  }

  /** destructor */
  ~UnboundedQueue() {
    Segment* next;
    for (Segment* s = head(); s; s = next) {
      next = s->nextSegment();
      reclaimSegment(s);
    }
  }

  /** enqueue */
  FOLLY_ALWAYS_INLINE void enqueue(const T& arg) {
    enqueueImpl(arg);
  }

  FOLLY_ALWAYS_INLINE void enqueue(T&& arg) {
    enqueueImpl(std::move(arg));
  }

  /** dequeue */
  FOLLY_ALWAYS_INLINE void dequeue(T& item) noexcept {
    dequeueImpl(item);
  }

  /** try_dequeue */
  FOLLY_ALWAYS_INLINE bool try_dequeue(T& item) noexcept {
    return tryDequeueUntil(item, std::chrono::steady_clock::time_point::min());
  }

  /** try_dequeue_until */
  template <typename Clock, typename Duration>
  FOLLY_ALWAYS_INLINE bool try_dequeue_until(
      T& item,
      const std::chrono::time_point<Clock, Duration>& deadline) noexcept {
    return tryDequeueUntil(item, deadline);
  }

  /** try_dequeue_for */
  template <typename Rep, typename Period>
  FOLLY_ALWAYS_INLINE bool try_dequeue_for(
      T& item,
      const std::chrono::duration<Rep, Period>& duration) noexcept {
    // Fast path: an element is immediately available.
    if (LIKELY(try_dequeue(item))) {
      return true;
    }
    return tryDequeueUntil(item, std::chrono::steady_clock::now() + duration);
  }

  /** size */
  size_t size() const noexcept {
    auto p = producerTicket();
    auto c = consumerTicket();
    // Clamp at zero: the two tickets are read separately and may be
    // observed momentarily out of order under concurrent updates.
    return p > c ? p - c : 0;
  }

  /** empty */
  bool empty() const noexcept {
    auto c = consumerTicket();
    auto p = producerTicket();
    return p <= c;
  }

 private:
  template <typename Arg>
  FOLLY_ALWAYS_INLINE void enqueueImpl(Arg&& arg) {
    if (SPSC) {
      Segment* s = tail();
      enqueueCommon(s, std::forward<Arg>(arg));
    } else {
      // Using hazptr_holder instead of hazptr_local because it is
      // possible that the T ctor happens to use hazard pointers.
      folly::hazptr::hazptr_holder hptr;
      Segment* s = hptr.get_protected(tail_);
      enqueueCommon(s, std::forward<Arg>(arg));
    }
  }

  /** enqueueCommon */
  template <typename Arg>
  FOLLY_ALWAYS_INLINE void enqueueCommon(Segment* s, Arg&& arg) {
    Ticket t = fetchIncrementProducerTicket();
    if (!SingleProducer) {
      s = findSegment(s, t);
    }
    DCHECK_GE(t, s->minTicket());
    DCHECK_LT(t, s->minTicket() + SegmentSize);
    size_t idx = index(t);
    Entry& e = s->entry(idx);
    e.putItem(std::forward<Arg>(arg));
    if (responsibleForAlloc(t)) {
      // The first ticket in the segment allocates and links the next
      // segment (see Design notes above).
      allocNextSegment(s, t + SegmentSize);
    }
    if (responsibleForAdvance(t)) {
      // The last ticket in the segment advances the tail pointer.
      advanceTail(s);
    }
  }

  /** dequeueImpl */
  FOLLY_ALWAYS_INLINE void dequeueImpl(T& item) noexcept {
    if (SPSC) {
      Segment* s = head();
      dequeueCommon(s, item);
    } else {
      // Using hazptr_holder instead of hazptr_local because it is
      // possible to call the T dtor and it may happen to use hazard
      // pointers.
      folly::hazptr::hazptr_holder hptr;
      Segment* s = hptr.get_protected(head_);
      dequeueCommon(s, item);
    }
  }

  /** dequeueCommon */
  FOLLY_ALWAYS_INLINE void dequeueCommon(Segment* s, T& item) noexcept {
    Ticket t = fetchIncrementConsumerTicket();
    if (!SingleConsumer) {
      s = findSegment(s, t);
    }
    size_t idx = index(t);
    Entry& e = s->entry(idx);
    e.takeItem(item); // may block until the element is available
    if (responsibleForAdvance(t)) {
      advanceHead(s);
    }
  }

  /** tryDequeueUntil */
  template <typename Clock, typename Duration>
  FOLLY_ALWAYS_INLINE bool tryDequeueUntil(
      T& item,
      const std::chrono::time_point<Clock, Duration>& deadline) noexcept {
    if (SingleConsumer) {
      Segment* s = head();
      return tryDequeueUntilSC(s, item, deadline);
    } else {
      // Using hazptr_holder instead of hazptr_local because it is
      // possible to call ~T() and it may happen to use hazard pointers.
      folly::hazptr::hazptr_holder hptr;
      Segment* s = hptr.get_protected(head_);
      return tryDequeueUntilMC(s, item, deadline);
    }
  }

  /** tryDequeueUntilSC */
  template <typename Clock, typename Duration>
  FOLLY_ALWAYS_INLINE bool tryDequeueUntilSC(
      Segment* s,
      T& item,
      const std::chrono::time_point<Clock, Duration>& deadline) noexcept {
    Ticket t = consumerTicket();
    DCHECK_GE(t, s->minTicket());
    DCHECK_LT(t, (s->minTicket() + SegmentSize));
    size_t idx = index(t);
    Entry& e = s->entry(idx);
    if (!e.tryWaitUntil(deadline)) {
      return false;
    }
    setConsumerTicket(t + 1);
    e.takeItem(item);
    if (responsibleForAdvance(t)) {
      advanceHead(s);
    }
    return true;
  }

  /** tryDequeueUntilMC */
  template <typename Clock, typename Duration>
  FOLLY_ALWAYS_INLINE bool tryDequeueUntilMC(
      Segment* s,
      T& item,
      const std::chrono::time_point<Clock, Duration>& deadline) noexcept {
    while (true) {
      Ticket t = consumerTicket();
      if (UNLIKELY(t >= (s->minTicket() + SegmentSize))) {
        s = tryGetNextSegmentUntil(s, deadline);
        if (s == nullptr) {
          return false; // timed out
        }
        continue;
      }
      size_t idx = index(t);
      Entry& e = s->entry(idx);
      if (!e.tryWaitUntil(deadline)) {
        return false;
      }
      // Claim ticket t; on CAS failure another consumer took it, so
      // retry with the updated ticket value.
      if (!consumerTicket_.compare_exchange_weak(
              t, t + 1, std::memory_order_acq_rel, std::memory_order_acquire)) {
        continue;
      }
      e.takeItem(item);
      if (responsibleForAdvance(t)) {
        advanceHead(s);
      }
      return true;
    }
  }

  /** findSegment */
  FOLLY_ALWAYS_INLINE
  Segment* findSegment(Segment* s, const Ticket t) const noexcept {
    while (UNLIKELY(t >= (s->minTicket() + SegmentSize))) {
      auto deadline = std::chrono::steady_clock::time_point::max();
      s = tryGetNextSegmentUntil(s, deadline);
      DCHECK(s != nullptr);
    }
    return s;
  }

  /** tryGetNextSegmentUntil */
  template <typename Clock, typename Duration>
  Segment* tryGetNextSegmentUntil(
      Segment* s,
      const std::chrono::time_point<Clock, Duration>& deadline) const noexcept {
    // The following loop will not spin indefinitely (as long as the
    // number of concurrently waiting consumers does not exceed
    // SegmentSize and the OS scheduler does not pause ready threads
    // indefinitely). Under such conditions, the algorithm guarantees
    // that the producer responsible for advancing the tail pointer to
    // the next segment has already acquired its ticket.
    while (tail() == s) {
      if (deadline < Clock::time_point::max() && Clock::now() > deadline) {
        return nullptr; // timed out before the next segment was linked
      }
      asm_volatile_pause();
    }
    Segment* next = s->nextSegment();
    DCHECK(next != nullptr);
    return next;
  }

  /** allocNextSegment */
  void allocNextSegment(Segment* s, const Ticket t) {
    Segment* next = new Segment(t);
    if (!SPSC) {
      next->acquire_ref_safe(); // hazptr
    }
    DCHECK(s->nextSegment() == nullptr);
    s->setNextSegment(next);
  }

  /** advanceTail */
  void advanceTail(Segment* s) noexcept {
    Segment* next = s->nextSegment();
    if (!SingleProducer) {
      // The following loop will not spin indefinitely (as long as the
      // OS scheduler does not pause ready threads indefinitely). The
      // algorithm guarantees that the producer responsible for setting
      // the next pointer has already acquired its ticket.
      while (next == nullptr) {
        asm_volatile_pause();
        next = s->nextSegment();
      }
    }
    DCHECK(next != nullptr);
    setTail(next);
  }

  /** advanceHead */
  void advanceHead(Segment* s) noexcept {
    auto deadline = std::chrono::steady_clock::time_point::max();
    Segment* next = tryGetNextSegmentUntil(s, deadline);
    DCHECK(next != nullptr);
    setHead(next);
    reclaimSegment(s);
  }

  /** reclaimSegment */
  void reclaimSegment(Segment* s) noexcept {
    if (SPSC) {
      delete s;
    } else {
      s->retire(); // hazptr
    }
  }

  FOLLY_ALWAYS_INLINE size_t index(Ticket t) const noexcept {
    // Scatter consecutive tickets within the segment (see Stride above).
    return (t * Stride) & (SegmentSize - 1);
  }

  FOLLY_ALWAYS_INLINE bool responsibleForAlloc(Ticket t) const noexcept {
    return (t & (SegmentSize - 1)) == 0;
  }

  FOLLY_ALWAYS_INLINE bool responsibleForAdvance(Ticket t) const noexcept {
    return (t & (SegmentSize - 1)) == (SegmentSize - 1);
  }

  FOLLY_ALWAYS_INLINE Segment* head() const noexcept {
    return head_.load(std::memory_order_acquire);
  }

  FOLLY_ALWAYS_INLINE Segment* tail() const noexcept {
    return tail_.load(std::memory_order_acquire);
  }

  FOLLY_ALWAYS_INLINE Ticket producerTicket() const noexcept {
    return producerTicket_.load(std::memory_order_acquire);
  }

  FOLLY_ALWAYS_INLINE Ticket consumerTicket() const noexcept {
    return consumerTicket_.load(std::memory_order_acquire);
  }

  void setHead(Segment* s) noexcept {
    head_.store(s, std::memory_order_release);
  }

  void setTail(Segment* s) noexcept {
    tail_.store(s, std::memory_order_release);
  }

  FOLLY_ALWAYS_INLINE void setProducerTicket(Ticket t) noexcept {
    producerTicket_.store(t, std::memory_order_release);
  }

  FOLLY_ALWAYS_INLINE void setConsumerTicket(Ticket t) noexcept {
    consumerTicket_.store(t, std::memory_order_release);
  }

  FOLLY_ALWAYS_INLINE Ticket fetchIncrementConsumerTicket() noexcept {
    if (SingleConsumer) {
      Ticket oldval = consumerTicket();
      setConsumerTicket(oldval + 1);
      return oldval;
    } else { // MC
      return consumerTicket_.fetch_add(1, std::memory_order_acq_rel);
    }
  }

  FOLLY_ALWAYS_INLINE Ticket fetchIncrementProducerTicket() noexcept {
    if (SingleProducer) {
      Ticket oldval = producerTicket();
      setProducerTicket(oldval + 1);
      return oldval;
    } else { // MP
      return producerTicket_.fetch_add(1, std::memory_order_acq_rel);
    }
  }

  /**
   *  Entry
   */
  class alignas(Align) Entry {
    folly::SaturatingSemaphore<MayBlock, Atom> flag_;
    typename std::aligned_storage<sizeof(T), alignof(T)>::type item_;

   public:
    template <typename Arg>
    FOLLY_ALWAYS_INLINE void putItem(Arg&& arg) {
      new (&item_) T(std::forward<Arg>(arg));
      flag_.post(); // make the element visible to consumers
    }

    FOLLY_ALWAYS_INLINE void takeItem(T& item) noexcept {
      flag_.wait();
      getItem(item);
    }

    template <typename Clock, typename Duration>
    FOLLY_ALWAYS_INLINE bool tryWaitUntil(
        const std::chrono::time_point<Clock, Duration>& deadline) noexcept {
      return flag_.try_wait_until(deadline);
    }

   private:
    FOLLY_ALWAYS_INLINE void getItem(T& item) noexcept {
      item = std::move(*(itemPtr()));
      destroyItem();
    }

    FOLLY_ALWAYS_INLINE T* itemPtr() noexcept {
      return static_cast<T*>(static_cast<void*>(&item_));
    }

    FOLLY_ALWAYS_INLINE void destroyItem() noexcept {
      itemPtr()->~T();
    }
  }; // Entry

  /**
   *  Segment
   */
  class Segment : public folly::hazptr::hazptr_obj_base_refcounted<Segment> {
    Atom<Segment*> next_;
    const Ticket min_;
    bool marked_; // used for iterative deletion

    Entry b_[SegmentSize];

   public:
    explicit Segment(const Ticket t)
        : next_(nullptr), min_(t), marked_(false) {}

    ~Segment() {
      if (!SPSC && !marked_) {
        // Delete the remaining chain of removed segments iteratively
        // instead of recursing through ~Segment.
        Segment* next = nextSegment();
        while (next) {
          if (!next->release_ref()) { // hazptr
            return;
          }
          Segment* s = next;
          next = s->nextSegment();
          s->marked_ = true;
          delete s;
        }
      }
    }

    Segment* nextSegment() const noexcept {
      return next_.load(std::memory_order_acquire);
    }

    void setNextSegment(Segment* s) noexcept {
      next_.store(s, std::memory_order_release);
    }

    FOLLY_ALWAYS_INLINE Ticket minTicket() const noexcept {
      DCHECK_EQ((min_ & (SegmentSize - 1)), 0);
      return min_;
    }

    FOLLY_ALWAYS_INLINE Entry& entry(size_t index) noexcept {
      return b_[index];
    }
  }; // Segment
}; // UnboundedQueue

/* Aliases */

template <
    typename T,
    bool MayBlock,
    size_t LgSegmentSize = 8,
    size_t LgAlign = 7,
    template <typename> class Atom = std::atomic>
using USPSCQueue =
    UnboundedQueue<T, true, true, MayBlock, LgSegmentSize, LgAlign, Atom>;

template <
    typename T,
    bool MayBlock,
    size_t LgSegmentSize = 8,
    size_t LgAlign = 7,
    template <typename> class Atom = std::atomic>
using UMPSCQueue =
    UnboundedQueue<T, false, true, MayBlock, LgSegmentSize, LgAlign, Atom>;

template <
    typename T,
    bool MayBlock,
    size_t LgSegmentSize = 8,
    size_t LgAlign = 7,
    template <typename> class Atom = std::atomic>
using USPMCQueue =
    UnboundedQueue<T, true, false, MayBlock, LgSegmentSize, LgAlign, Atom>;

template <
    typename T,
    bool MayBlock,
    size_t LgSegmentSize = 8,
    size_t LgAlign = 7,
    template <typename> class Atom = std::atomic>
using UMPMCQueue =
    UnboundedQueue<T, false, false, MayBlock, LgSegmentSize, LgAlign, Atom>;

} // namespace folly