2 * Copyright 2017-present Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
23 #include <glog/logging.h>
25 #include <folly/concurrency/CacheLocality.h>
26 #include <folly/experimental/hazptr/hazptr.h>
27 #include <folly/synchronization/SaturatingSemaphore.h>
31 /// UnboundedQueue supports a variety of options for unbounded
32 /// dynamically expanding and shrinking queues, including variations of:
33 /// - Single vs. multiple producers
34 /// - Single vs. multiple consumers
35 /// - Blocking vs. spin-waiting
36 /// - Non-waiting, timed, and waiting consumer operations.
37 /// Producer operations never wait or fail (unless out-of-memory).
39 /// Template parameters:
41 /// - SingleProducer: true if there can be only one producer at a
43 /// - SingleConsumer: true if there can be only one consumer at a
45 /// - MayBlock: true if consumers may block, false if they only
46 /// spin. A performance tuning parameter.
47 /// - LgSegmentSize (default 8): Log base 2 of number of elements per
48 /// segment. A performance tuning parameter. See below.
49 /// - LgAlign (default 7): Log base 2 of alignment directive; can be
50 /// used to balance scalability (avoidance of false sharing) with
51 /// memory efficiency.
53 /// When to use UnboundedQueue:
54 /// - If a small bound may lead to deadlock or performance degradation
55 /// under bursty patterns.
56 /// - If there is no risk of the queue growing too much.
58 /// When not to use UnboundedQueue:
59 /// - If there is risk of the queue growing too much and a large bound
60 /// is acceptable, then use DynamicBoundedQueue.
61 /// - If the queue must not allocate on enqueue or it must have a
62 /// small bound, then use fixed-size MPMCQueue or (if non-blocking
63 /// SPSC) ProducerConsumerQueue.
66 /// USPSCQueue<T, MayBlock, LgSegmentSize, LgAlign>
67 /// UMPSCQueue<T, MayBlock, LgSegmentSize, LgAlign>
68 /// USPMCQueue<T, MayBlock, LgSegmentSize, LgAlign>
69 /// UMPMCQueue<T, MayBlock, LgSegmentSize, LgAlign>
72 /// Producer operations never wait or fail (unless OOM)
73 /// void enqueue(const T&);
74 /// void enqueue(T&&);
75 /// Adds an element to the end of the queue.
77 /// Consumer operations:
79 /// Extracts an element from the front of the queue. Waits
80 /// until an element is available if needed.
81 /// bool try_dequeue(T&);
82 /// Tries to extract an element from the front of the queue
83 /// if available. Returns true if successful, false otherwise.
84 /// bool try_dequeue_until(T&, time_point& deadline);
85 /// Tries to extract an element from the front of the queue
86 /// if available until the specified deadline. Returns true
87 /// if successful, false otherwise.
88 /// bool try_dequeue_for(T&, duration&);
89 /// Tries to extract an element from the front of the queue if
90 /// available until the expiration of the specified
91 /// duration. Returns true if successful, false otherwise.
93 /// Secondary functions:
95 /// Returns an estimate of the size of the queue.
97 /// Returns true only if the queue was empty during the call.
98 /// Note: size() and empty() are guaranteed to be accurate only if
99 /// the queue is not changed concurrently.
103 /// /* UMPSC, doesn't block, 1024 int elements per segment */
104 /// UMPSCQueue<int, false, 10> q;
108 /// ASSERT_FALSE(q.empty());
109 /// ASSERT_EQ(q.size(), 3);
113 /// ASSERT_TRUE(q.try_dequeue(v));
115 /// ASSERT_TRUE(q.try_dequeue_until(v, now() + seconds(1)));
117 /// ASSERT_TRUE(q.empty());
118 /// ASSERT_EQ(q.size(), 0);
119 /// ASSERT_FALSE(q.try_dequeue(v));
120 /// ASSERT_FALSE(q.try_dequeue_for(v, microseconds(100)));
124 /// - The queue is composed of one or more segments. Each segment has
125 /// a fixed size of 2^LgSegmentSize entries. Each segment is used
127 /// - Each entry is composed of a futex and a single element.
128 /// - The queue contains two 64-bit ticket variables. The producer
129 /// ticket counts the number of producer tickets issued so far, and
130 /// the same for the consumer ticket. Each ticket number corresponds
131 /// to a specific entry in a specific segment.
132 /// - The queue maintains two pointers, head and tail. Head points to
133 /// the segment that corresponds to the current consumer
134 /// ticket. Similarly, tail pointer points to the segment that
135 /// corresponds to the producer ticket.
136 /// - Segments are organized as a singly linked list.
137 /// - The producer with the first ticket in the current producer
138 /// segment is solely responsible for allocating and linking the
140 /// - The producer with the last ticket in the current producer
141 /// segment is solely responsible for advancing the tail pointer to
142 /// the next segment.
143 /// - Similarly, the consumer with the last ticket in the current
144 /// consumer segment is solely responsible for advancing the head
145 /// pointer to the next segment. It must ensure that head never
149 /// - An empty queue contains one segment. A nonempty queue contains
150 /// one or two more segments than needed to fit its contents.
151 /// - Removed segments are not reclaimed until no threads,
152 /// producers or consumers, have references to them or their
153 /// predecessors. That is, a lagging thread may delay the reclamation
154 /// of a chain of removed segments.
155 /// - The template parameter LgAlign can be used to reduce memory usage
156 /// at the cost of increased chance of false sharing.
158 /// Performance considerations:
159 /// - All operations take constant time, excluding the costs of
160 /// allocation, reclamation, interference from other threads, and
161 /// waiting for actions by other threads.
162 /// - In general, using the single producer and/or single consumer
163 /// variants yields better performance than the MP and MC
165 /// - SPSC without blocking is the fastest configuration. It doesn't
166 /// include any read-modify-write atomic operations, full fences, or
167 /// system calls in the critical path.
168 /// - MP adds a fetch_add to the critical path of each producer operation.
169 /// - MC adds a fetch_add or compare_exchange to the critical path of
170 /// each consumer operation.
171 /// - The possibility of consumers blocking, even if they never do,
172 /// adds a compare_exchange to the critical path of each producer
174 /// - MPMC, SPMC, MPSC require the use of a deferred reclamation
175 /// mechanism to guarantee that segments removed from the linked
176 /// list, i.e., unreachable from the head pointer, are reclaimed
177 /// only after they are no longer needed by any lagging producers or
179 /// - The overheads of segment allocation and reclamation are intended
180 /// to be mostly out of the critical path of the queue's throughput.
181 /// - If the template parameter LgSegmentSize is changed, it should be
182 /// set adequately high to keep the amortized cost of allocation and
184 /// - Another consideration is that the queue is guaranteed to have
185 /// enough space for a number of consumers equal to 2^LgSegmentSize
186 /// for local blocking. Excess waiting consumers spin.
187 /// - It is recommended to measure performance with different variants
188 /// when applicable, e.g., UMPMC vs UMPSC. Depending on the use
189 /// case, sometimes the variant with the higher sequential overhead
190 /// may yield better results due to, for example, more favorable
191 /// producer-consumer balance or favorable timing for avoiding
// Tail of the template parameter list (earlier parameters are outside this
// view): LgSegmentSize defaults to 8 and Atom defaults to std::atomic.
199 size_t LgSegmentSize = 8,
201 template <typename> class Atom = std::atomic>
202 class UnboundedQueue {
// Tickets are unsigned 64-bit counters.
203 using Ticket = uint64_t;
// True only when both the producer side and the consumer side are single.
207 static constexpr bool SPSC = SingleProducer && SingleConsumer;
// Multiplier applied to a ticket in index(): 1 for SPSC or when
// LgSegmentSize <= 1, otherwise 27. It is asserted to be odd below; an odd
// multiplier keeps (t * Stride) & (SegmentSize - 1) one-to-one over the
// tickets of a segment because SegmentSize is a power of two.
208 static constexpr size_t Stride = SPSC || (LgSegmentSize <= 1) ? 1 : 27;
// Number of entries per segment: 2^LgSegmentSize.
209 static constexpr size_t SegmentSize = 1u << LgSegmentSize;
// Value passed to alignas for head_ and tail_: 2^LgAlign.
210 static constexpr size_t Align = 1u << LgAlign;
213 std::is_nothrow_destructible<T>::value,
214 "T must be nothrow_destructible");
215 static_assert((Stride & 1) == 1, "Stride must be odd");
// The shifts above use a 32-bit literal (1u), hence the upper limits here.
216 static_assert(LgSegmentSize < 32, "LgSegmentSize must be < 32");
217 static_assert(LgAlign < 16, "LgAlign must be < 16");
// Consumer-side state: head segment pointer (aligned to Align) followed by
// the consumer ticket.
219 alignas(Align) Atom<Segment*> head_;
220 Atom<Ticket> consumerTicket_;
// Producer-side state: tail segment pointer (aligned to Align, so it starts
// on a different Align boundary than head_) followed by the producer ticket.
221 alignas(Align) Atom<Segment*> tail_;
222 Atom<Ticket> producerTicket_;
// Presumably the constructor body (its header is outside this view): both
// tickets start at 0 and a first segment is created with minimum ticket 0.
227 setProducerTicket(0);
228 setConsumerTicket(0);
229 Segment* s = new Segment(0);
// Presumably the destructor loop (its header is outside this view): visits
// each segment reachable from head(), reading the successor into `next`
// before moving on to it.
237 for (Segment* s = head(); s; s = next) {
238 next = s->nextSegment();
// enqueue (copy): takes the element by const reference. Its body is outside
// this view.
244 FOLLY_ALWAYS_INLINE void enqueue(const T& arg) {
// enqueue (move): hands the element to enqueueImpl as an rvalue.
248 FOLLY_ALWAYS_INLINE void enqueue(T&& arg) {
249 enqueueImpl(std::move(arg));
// dequeue: writes the extracted element into `item`. Its body is outside
// this view.
253 FOLLY_ALWAYS_INLINE void dequeue(T& item) noexcept {
// try_dequeue: delegates to tryDequeueUntil using the earliest representable
// steady_clock time point as the deadline, and returns its result.
258 FOLLY_ALWAYS_INLINE bool try_dequeue(T& item) noexcept {
259 return tryDequeueUntil(item, std::chrono::steady_clock::time_point::min());
262 /** try_dequeue_until: forwards `item` and `deadline` unchanged to
 *  tryDequeueUntil and returns its result. Accepts a time_point of any
 *  Clock/Duration. */
263 template <typename Clock, typename Duration>
264 FOLLY_ALWAYS_INLINE bool try_dequeue_until(
266 const std::chrono::time_point<Clock, Duration>& deadline) noexcept {
267 return tryDequeueUntil(item, deadline);
270 /** try_dequeue_for: relative-timeout variant. Converts `duration` into a
 *  steady_clock deadline and delegates to tryDequeueUntil. */
271 template <typename Rep, typename Period>
272 FOLLY_ALWAYS_INLINE bool try_dequeue_for(
274 const std::chrono::duration<Rep, Period>& duration) noexcept {
// Fast path: a plain try_dequeue is attempted first and hinted as the
// likely outcome. The body of this `if` is outside this view (presumably it
// returns true).
275 if (LIKELY(try_dequeue(item))) {
// Otherwise the deadline is computed from the current steady_clock time.
278 return tryDequeueUntil(item, std::chrono::steady_clock::now() + duration);
// size: difference between the producer and consumer tickets, clamped to 0
// when the producer ticket is not greater. The producer ticket is read
// before the consumer ticket; the two loads are separate.
282 size_t size() const noexcept {
283 auto p = producerTicket();
284 auto c = consumerTicket();
285 return p > c ? p - c : 0;
// empty: reads the consumer ticket first, then the producer ticket (the
// opposite order from size()). The return expression is outside this view.
289 bool empty() const noexcept {
290 auto c = consumerTicket();
291 auto p = producerTicket();
// enqueueImpl: obtains a starting segment and forwards the argument to
// enqueueCommon. Two paths are visible; the condition that selects between
// them, and how `s` is obtained on the first path, are outside this view.
297 template <typename Arg>
298 FOLLY_ALWAYS_INLINE void enqueueImpl(Arg&& arg) {
301 enqueueCommon(s, std::forward<Arg>(arg));
303 // Using hazptr_holder instead of hazptr_local because it is
304 // possible that the T ctor happens to use hazard pointers.
305 folly::hazptr::hazptr_holder hptr;
// Read tail_ through the hazard pointer holder; `s` is the value it returns.
306 Segment* s = hptr.get_protected(tail_);
307 enqueueCommon(s, std::forward<Arg>(arg));
// enqueueCommon: takes a producer ticket, locates the entry for that ticket
// starting from segment `s`, and constructs the item in that entry.
312 template <typename Arg>
313 FOLLY_ALWAYS_INLINE void enqueueCommon(Segment* s, Arg&& arg) {
314 Ticket t = fetchIncrementProducerTicket();
// With multiple producers, `s` may precede the segment that covers ticket
// t, so walk forward to the covering segment.
315 if (!SingleProducer) {
316 s = findSegment(s, t);
// Debug checks: t lies in [minTicket, minTicket + SegmentSize) of `s`.
318 DCHECK_GE(t, s->minTicket());
319 DCHECK_LT(t, s->minTicket() + SegmentSize);
320 size_t idx = index(t);
321 Entry& e = s->entry(idx);
322 e.putItem(std::forward<Arg>(arg));
// The ticket whose low bits are all zero (first ticket of a segment)
// allocates the following segment, whose minimum ticket is t + SegmentSize.
323 if (responsibleForAlloc(t)) {
324 allocNextSegment(s, t + SegmentSize);
// The ticket whose low bits are all one (last ticket of a segment) performs
// the step guarded here; its body is outside this view.
326 if (responsibleForAdvance(t)) {
// dequeueImpl: obtains a starting segment and calls dequeueCommon. Two paths
// are visible; the condition that selects between them, and how `s` is
// obtained on the first path, are outside this view.
332 FOLLY_ALWAYS_INLINE void dequeueImpl(T& item) noexcept {
335 dequeueCommon(s, item);
337 // Using hazptr_holder instead of hazptr_local because it is
338 // possible to call the T dtor and it may happen to use hazard
340 folly::hazptr::hazptr_holder hptr;
// Read head_ through the hazard pointer holder; `s` is the value it returns.
341 Segment* s = hptr.get_protected(head_);
342 dequeueCommon(s, item);
// dequeueCommon: takes a consumer ticket unconditionally and locates the
// entry for that ticket starting from segment `s`. The statement that
// extracts the item from the entry is outside this view.
347 FOLLY_ALWAYS_INLINE void dequeueCommon(Segment* s, T& item) noexcept {
348 Ticket t = fetchIncrementConsumerTicket();
// With multiple consumers, `s` may precede the segment that covers ticket
// t, so walk forward to the covering segment.
349 if (!SingleConsumer) {
350 s = findSegment(s, t);
352 size_t idx = index(t);
353 Entry& e = s->entry(idx);
// The last ticket of a segment performs the step guarded here; its body is
// outside this view.
355 if (responsibleForAdvance(t)) {
360 /** tryDequeueUntil: dispatches a deadline-bounded dequeue to the
 *  single-consumer or the multi-consumer implementation and returns that
 *  implementation's result. */
361 template <typename Clock, typename Duration>
362 FOLLY_ALWAYS_INLINE bool tryDequeueUntil(
364 const std::chrono::time_point<Clock, Duration>& deadline) noexcept {
// Single-consumer path; how `s` is obtained here is outside this view.
365 if (SingleConsumer) {
367 return tryDequeueUntilSC(s, item, deadline);
369 // Using hazptr_holder instead of hazptr_local because it is
370 // possible to call ~T() and it may happen to use hazard pointers.
371 folly::hazptr::hazptr_holder hptr;
372 Segment* s = hptr.get_protected(head_);
// NOTE(review): the multi-consumer helper is spelled `ryDequeueUntilMC`
// (missing leading 't') both here and at its definition, so the call
// resolves; a rename to tryDequeueUntilMC must change both together.
373 return ryDequeueUntilMC(s, item, deadline);
377 /** tryDequeueUntilSC: single-consumer deadline-bounded dequeue. The
 *  consumer ticket is only read up front and is stored back as t + 1 after
 *  the wait on the entry has succeeded. */
378 template <typename Clock, typename Duration>
379 FOLLY_ALWAYS_INLINE bool tryDequeueUntilSC(
382 const std::chrono::time_point<Clock, Duration>& deadline) noexcept {
383 Ticket t = consumerTicket();
// Debug checks: t lies in [minTicket, minTicket + SegmentSize) of `s`.
384 DCHECK_GE(t, s->minTicket());
385 DCHECK_LT(t, (s->minTicket() + SegmentSize));
386 size_t idx = index(t);
387 Entry& e = s->entry(idx);
// Wait on the entry until the deadline. The failure branch body is outside
// this view (presumably returns false).
388 if (!e.tryWaitUntil(deadline)) {
// Publish the advanced ticket with a plain store (no read-modify-write).
391 setConsumerTicket(t + 1);
// The last ticket of a segment performs the step guarded here; its body is
// outside this view.
393 if (responsibleForAdvance(t)) {
399 /** tryDequeueUntilMC: multi-consumer deadline-bounded dequeue.
 *  NOTE(review): the function below is actually named `ryDequeueUntilMC`
 *  (missing leading 't'); its caller uses the same spelling.
 *  The enclosing retry loop and several branch bodies are outside this
 *  view. */
400 template <typename Clock, typename Duration>
401 FOLLY_ALWAYS_INLINE bool ryDequeueUntilMC(
404 const std::chrono::time_point<Clock, Duration>& deadline) noexcept {
406 Ticket t = consumerTicket();
// If the ticket lies beyond segment `s`, try to move to the next segment
// before the deadline; the visible `return false` is the timed-out exit.
407 if (UNLIKELY(t >= (s->minTicket() + SegmentSize))) {
408 s = tryGetNextSegmentUntil(s, deadline);
410 return false; // timed out
414 size_t idx = index(t);
415 Entry& e = s->entry(idx);
// Wait on the entry until the deadline; the failure branch body is outside
// this view.
416 if (!e.tryWaitUntil(deadline)) {
// Claim ticket t by advancing the shared consumer ticket from t to t + 1.
// compare_exchange_weak may fail (including spuriously) and then stores the
// observed value into t; the failure branch body is outside this view.
419 if (!consumerTicket_.compare_exchange_weak(
420 t, t + 1, std::memory_order_acq_rel, std::memory_order_acquire)) {
// The last ticket of a segment performs the step guarded here; its body is
// outside this view.
424 if (responsibleForAdvance(t)) {
// findSegment: starting from `s`, steps to successive segments while ticket
// t is at or beyond the end of the current segment's ticket range. The
// return statement is outside this view.
433 Segment* findSegment(Segment* s, const Ticket t) const noexcept {
434 while (UNLIKELY(t >= (s->minTicket() + SegmentSize))) {
// time_point::max() is passed as the deadline; tryGetNextSegmentUntil skips
// its deadline test for deadlines that are not below Clock's maximum.
435 auto deadline = std::chrono::steady_clock::time_point::max();
436 s = tryGetNextSegmentUntil(s, deadline);
437 DCHECK(s != nullptr);
442 /** tryGetNextSegmentUntil: spins while tail() still equals `s`, then
 *  reads the successor of `s`. One caller treats the result as a possible
 *  timeout indication. The body of the deadline check and the return
 *  statement are outside this view. */
443 template <typename Clock, typename Duration>
444 Segment* tryGetNextSegmentUntil(
446 const std::chrono::time_point<Clock, Duration>& deadline) const noexcept {
447 // The following loop will not spin indefinitely (as long as the
448 // number of concurrently waiting consumers does not exceed
449 // SegmentSize and the OS scheduler does not pause ready threads
450 // indefinitely). Under such conditions, the algorithm guarantees
451 // that the producer responsible for advancing the tail pointer to
452 // the next segment has already acquired its ticket.
453 while (tail() == s) {
// NOTE(review): this test is true while the deadline is still in the future
// (`deadline > Clock::now()`). If the hidden body of this `if` is the
// timed-out exit, the second comparison looks inverted
// (`deadline < Clock::now()` would be expected) - confirm.
454 if (deadline < Clock::time_point::max() && deadline > Clock::now()) {
457 asm_volatile_pause();
// Once tail has moved past `s`, the successor is expected to be linked.
459 Segment* next = s->nextSegment();
460 DCHECK(next != nullptr);
464 /** allocNextSegment: allocates a new segment whose minimum ticket is
 *  `t` and links it as the successor of `s`. Not noexcept: the allocation
 *  may throw. */
465 void allocNextSegment(Segment* s, const Ticket t) {
466 Segment* next = new Segment(t);
// Take a reference on the new segment before it is published. Any condition
// guarding this call is outside this view.
468 next->acquire_ref_safe(); // hazptr
// `s` must not already have a successor; then publish the link.
470 DCHECK(s->nextSegment() == nullptr);
471 s->setNextSegment(next);
// advanceTail: reads the successor of `s`, spinning until it is non-null
// when there may be multiple producers. The statement that consumes `next`
// after the check is outside this view.
475 void advanceTail(Segment* s) noexcept {
476 Segment* next = s->nextSegment();
477 if (!SingleProducer) {
478 // The following loop will not spin indefinitely (as long as the
479 // OS scheduler does not pause ready threads indefinitely). The
480 // algorithm guarantees that the producer responsible for setting
481 // the next pointer has already acquired its ticket.
482 while (next == nullptr) {
483 asm_volatile_pause();
484 next = s->nextSegment();
487 DCHECK(next != nullptr);
// advanceHead: obtains the successor of `s` via tryGetNextSegmentUntil with
// time_point::max() as the deadline, and checks that it is non-null. The
// statements that use `next` afterwards are outside this view.
492 void advanceHead(Segment* s) noexcept {
493 auto deadline = std::chrono::steady_clock::time_point::max();
494 Segment* next = tryGetNextSegmentUntil(s, deadline);
495 DCHECK(next != nullptr);
500 /** reclaimSegment: disposes of a segment. Only the hazard-pointer
 *  retire() call is visible; any alternative path is outside this view. */
501 void reclaimSegment(Segment* s) noexcept {
505 s->retire(); // hazptr
// index: maps ticket t to an entry position within a segment by multiplying
// by Stride and keeping the low LgSegmentSize bits.
509 FOLLY_ALWAYS_INLINE size_t index(Ticket t) const noexcept {
510 return (t * Stride) & (SegmentSize - 1);
// responsibleForAlloc: true when the low LgSegmentSize bits of t are all
// zero, i.e. t is the first ticket of a segment.
513 FOLLY_ALWAYS_INLINE bool responsibleForAlloc(Ticket t) const noexcept {
514 return (t & (SegmentSize - 1)) == 0;
// responsibleForAdvance: true when the low LgSegmentSize bits of t are all
// one, i.e. t is the last ticket of a segment.
517 FOLLY_ALWAYS_INLINE bool responsibleForAdvance(Ticket t) const noexcept {
518 return (t & (SegmentSize - 1)) == (SegmentSize - 1);
// Accessors for the four atomic members. Every load uses
// memory_order_acquire and every store uses memory_order_release.

// head: acquire-load of head_.
521 FOLLY_ALWAYS_INLINE Segment* head() const noexcept {
522 return head_.load(std::memory_order_acquire);
// tail: acquire-load of tail_.
525 FOLLY_ALWAYS_INLINE Segment* tail() const noexcept {
526 return tail_.load(std::memory_order_acquire);
// producerTicket: acquire-load of producerTicket_.
529 FOLLY_ALWAYS_INLINE Ticket producerTicket() const noexcept {
530 return producerTicket_.load(std::memory_order_acquire);
// consumerTicket: acquire-load of consumerTicket_.
533 FOLLY_ALWAYS_INLINE Ticket consumerTicket() const noexcept {
534 return consumerTicket_.load(std::memory_order_acquire);
// setHead: release-store to head_.
537 void setHead(Segment* s) noexcept {
538 head_.store(s, std::memory_order_release);
// setTail: release-store to tail_.
541 void setTail(Segment* s) noexcept {
542 tail_.store(s, std::memory_order_release);
// setProducerTicket: release-store to producerTicket_.
545 FOLLY_ALWAYS_INLINE void setProducerTicket(Ticket t) noexcept {
546 producerTicket_.store(t, std::memory_order_release);
// setConsumerTicket: release-store to consumerTicket_.
549 FOLLY_ALWAYS_INLINE void setConsumerTicket(Ticket t) noexcept {
550 consumerTicket_.store(t, std::memory_order_release);
// fetchIncrementConsumerTicket: advances the consumer ticket by one.
// With a single consumer this is a separate load followed by a store (not an
// atomic read-modify-write); the return on that path is outside this view
// (presumably the old value). Otherwise an acq_rel fetch_add is used, which
// returns the value held before the increment.
553 FOLLY_ALWAYS_INLINE Ticket fetchIncrementConsumerTicket() noexcept {
554 if (SingleConsumer) {
555 Ticket oldval = consumerTicket();
556 setConsumerTicket(oldval + 1);
559 return consumerTicket_.fetch_add(1, std::memory_order_acq_rel);
// fetchIncrementProducerTicket: producer-side counterpart with the same
// structure: load + store with a single producer, acq_rel fetch_add
// otherwise.
563 FOLLY_ALWAYS_INLINE Ticket fetchIncrementProducerTicket() noexcept {
564 if (SingleProducer) {
565 Ticket oldval = producerTicket();
566 setProducerTicket(oldval + 1);
569 return producerTicket_.fetch_add(1, std::memory_order_acq_rel);
// Entry members (the enclosing declaration header is outside this view).
// flag_ is the per-entry semaphore that consumers wait on.
577 folly::SaturatingSemaphore<MayBlock, Atom> flag_;
// Raw storage sized and aligned for one T; the element is constructed into
// it with placement new rather than being default-constructed.
578 typename std::aligned_storage<sizeof(T), alignof(T)>::type item_;
// putItem: constructs a T in item_ from the forwarded argument. Any
// statement that follows the construction is outside this view.
581 template <typename Arg>
582 FOLLY_ALWAYS_INLINE void putItem(Arg&& arg) {
583 new (&item_) T(std::forward<Arg>(arg));
// takeItem: its body is outside this view.
587 FOLLY_ALWAYS_INLINE void takeItem(T& item) noexcept {
// tryWaitUntil: forwards the deadline to flag_.try_wait_until and returns
// its result.
592 template <typename Clock, typename Duration>
593 FOLLY_ALWAYS_INLINE bool tryWaitUntil(
594 const std::chrono::time_point<Clock, Duration>& deadline) noexcept {
595 return flag_.try_wait_until(deadline);
// getItem: move-assigns the stored element into `item`.
599 FOLLY_ALWAYS_INLINE void getItem(T& item) noexcept {
600 item = std::move(*(itemPtr()));
// itemPtr: reinterprets the address of the raw storage as a T*.
604 FOLLY_ALWAYS_INLINE T* itemPtr() noexcept {
605 return static_cast<T*>(static_cast<void*>(&item_));
// destroyItem: its body is outside this view.
608 FOLLY_ALWAYS_INLINE void destroyItem() noexcept {
// Segment: one node of the segment list. Derives from the hazptr
// reference-counted object base and holds SegmentSize entries.
616 class Segment : public folly::hazptr::hazptr_obj_base_refcounted<Segment> {
// Atomic link to the following segment; initialized to nullptr.
617 Atom<Segment*> next_;
619 bool marked_; // used for iterative deletion
// Entry array of the segment.
621 Entry b_[SegmentSize];
// Constructs a segment with no successor, minimum ticket `t`, and
// marked_ == false. (The declaration of min_ is outside this view.)
624 explicit Segment(const Ticket t)
625 : next_(nullptr), min_(t), marked_(false) {}
// Fragment, presumably from the destructor (its header is outside this
// view): for non-SPSC queues and unmarked segments it reads the successor,
// calls release_ref() on it, and steps along the list; the rest of the loop
// is outside this view.
628 if (!SPSC && !marked_) {
629 Segment* next = nextSegment();
631 if (!next->release_ref()) { // hazptr
635 next = s->nextSegment();
// nextSegment: acquire-load of next_.
642 Segment* nextSegment() const noexcept {
643 return next_.load(std::memory_order_acquire);
// setNextSegment: release-store to next_.
646 void setNextSegment(Segment* s) noexcept {
647 next_.store(s, std::memory_order_release);
// minTicket: debug-checks that min_ is a multiple of SegmentSize; the
// return statement is outside this view.
650 FOLLY_ALWAYS_INLINE Ticket minTicket() const noexcept {
651 DCHECK_EQ((min_ & (SegmentSize - 1)), 0);
// entry: returns a reference to an entry for `index`; its body is outside
// this view.
655 FOLLY_ALWAYS_INLINE Entry& entry(size_t index) noexcept {
// Alias templates fixing the second and third template arguments of
// UnboundedQueue. The alias names and the leading template parameters are
// outside this view; the labels below assume those two arguments are
// SingleProducer and SingleConsumer in that order - confirm against the
// class template's parameter list.

// <true, true>: presumably single producer, single consumer.
667 size_t LgSegmentSize = 8,
669 template <typename> class Atom = std::atomic>
671 UnboundedQueue<T, true, true, MayBlock, LgSegmentSize, LgAlign, Atom>;
// <false, true>: presumably multiple producers, single consumer.
676 size_t LgSegmentSize = 8,
678 template <typename> class Atom = std::atomic>
680 UnboundedQueue<T, false, true, MayBlock, LgSegmentSize, LgAlign, Atom>;
// <true, false>: presumably single producer, multiple consumers.
685 size_t LgSegmentSize = 8,
687 template <typename> class Atom = std::atomic>
689 UnboundedQueue<T, true, false, MayBlock, LgSegmentSize, LgAlign, Atom>;
// <false, false>: presumably multiple producers, multiple consumers.
694 size_t LgSegmentSize = 8,
696 template <typename> class Atom = std::atomic>
698 UnboundedQueue<T, false, false, MayBlock, LgSegmentSize, LgAlign, Atom>;