/*
 * Copyright 2017 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <atomic>
#include <limits>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>

#include <boost/noncopyable.hpp>

#include <folly/AtomicStruct.h>
#include <folly/Portability.h>
#include <folly/detail/CacheLocality.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
// Ignore shadowing warnings within this file, so includers can use -Wshadow.
FOLLY_PUSH_WARNING
FOLLY_GCC_DISABLE_WARNING("-Wshadow")

namespace folly {

namespace detail {
template <typename Pool>
struct IndexedMemPoolRecycler;
} // namespace detail
template <
    typename T,
    bool EagerRecycleWhenTrivial = false,
    bool EagerRecycleWhenNotTrivial = true>
struct IndexedMemPoolTraits {
  static constexpr bool eagerRecycle() {
    return std::is_trivial<T>::value ? EagerRecycleWhenTrivial
                                     : EagerRecycleWhenNotTrivial;
  }
  /// Called when the element pointed to by ptr is allocated for the
  /// first time.
  static void initialize(T* ptr) {
    if (!eagerRecycle()) {
      new (ptr) T();
    }
  }

  /// Called when the element pointed to by ptr is freed at the pool
  /// destruction time.
  static void cleanup(T* ptr) {
    if (!eagerRecycle()) {
      ptr->~T();
    }
  }
  /// Called when the element is allocated with the arguments forwarded from
  /// IndexedMemPool::allocElem.
  template <typename... Args>
  static void onAllocate(T* ptr, Args&&... args) {
    static_assert(
        sizeof...(Args) == 0 || eagerRecycle(),
        "emplace-style allocation requires eager recycle, "
        "which is defaulted only for non-trivial types");
    if (eagerRecycle()) {
      new (ptr) T(std::forward<Args>(args)...);
    }
  }

  /// Called when the element is recycled.
  static void onRecycle(T* ptr) {
    if (eagerRecycle()) {
      ptr->~T();
    }
  }
};
/// IndexedMemPool traits that implement the lazy lifecycle strategy. In this
/// strategy elements are default-constructed the first time they are
/// allocated, and destroyed when the pool itself is destroyed.
template <typename T>
using IndexedMemPoolTraitsLazyRecycle = IndexedMemPoolTraits<T, false, false>;
/// IndexedMemPool traits that implement the eager lifecycle strategy. In this
/// strategy elements are constructed when they are allocated from the pool and
/// destroyed when recycled.
template <typename T>
using IndexedMemPoolTraitsEagerRecycle = IndexedMemPoolTraits<T, true, true>;
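
// For illustration only (not part of the original header): a client that
// wants eager construct/destroy semantics for a trivially-copyable type can
// name the traits explicitly. `Node` is a hypothetical element type.
//
//   struct Node { uint32_t next; };
//   folly::IndexedMemPool<
//       Node, 32, 200, std::atomic,
//       folly::IndexedMemPoolTraitsEagerRecycle<Node>>
//       pool(1000);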
/// Instances of IndexedMemPool dynamically allocate and then pool their
/// element type (T), returning 4-byte integer indices that can be passed
/// to the pool's operator[] method to access or obtain pointers to the
/// actual elements. The memory backing items returned from the pool
/// will always be readable, even if items have been returned to the pool.
/// These two features are useful for lock-free algorithms. The indexing
/// behavior makes it easy to build tagged pointer-like-things, since
/// a large number of elements can be managed using fewer bits than a
/// full pointer. The access-after-free behavior makes it safe to read
/// from T-s even after they have been recycled, since it is guaranteed
/// that the memory won't have been returned to the OS and unmapped
/// (the algorithm must still use a mechanism to validate that the read
/// was correct, but it doesn't have to worry about page faults), and if
/// the elements use internal sequence numbers it can be guaranteed that
/// there won't be an ABA match due to the element being overwritten with
/// a different type that has the same bit pattern.
///
/// The object lifecycle strategy is controlled by the Traits parameter.
/// One strategy, implemented by IndexedMemPoolTraitsEagerRecycle, is to
/// construct objects when they are allocated from the pool and destroy
/// them when they are recycled. In this mode allocIndex and allocElem
/// have emplace-like semantics. In another strategy, implemented by
/// IndexedMemPoolTraitsLazyRecycle, objects are default-constructed the
/// first time they are removed from the pool, and deleted when the pool
/// itself is deleted. By default the first mode is used for non-trivial
/// T, and the second is used for trivial T. Clients can customize the
/// object lifecycle by providing their own Traits implementation.
/// See IndexedMemPoolTraits for a Traits example.
///
/// IMPORTANT: Space for extra elements is allocated to account for those
/// that are inaccessible because they are in other local lists, so the
/// actual number of items that can be allocated ranges from capacity to
/// capacity + (NumLocalLists_-1)*LocalListLimit_. This is important if
/// you are trying to maximize the capacity of the pool while constraining
/// the bit size of the resulting pointers, because the pointers will
/// actually range up to the boosted capacity. See maxIndexForCapacity
/// and capacityForMaxIndex.
///
/// To avoid contention, NumLocalLists_ free lists of limited (less than
/// or equal to LocalListLimit_) size are maintained, and each thread
/// retrieves and returns entries from its associated local list. If the
/// local list becomes too large then elements are placed in bulk in a
/// global free list. This allows items to be efficiently recirculated
/// from consumers to producers. AccessSpreader is used to access the
/// local lists, so there is no performance advantage to having more
/// local lists than L1 caches.
///
/// The pool mmap-s the entire necessary address space when the pool is
/// constructed, but delays element construction. This means that only
/// elements that are actually returned to the caller get paged into the
/// process's resident set (RSS).
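///
/// A minimal usage sketch (illustrative, not from the original header;
/// `Widget` stands in for any element type):
///
///   folly::IndexedMemPool<Widget> pool(1000);
///   uint32_t idx = pool.allocIndex();   // 0 means no element was available
///   if (idx != 0) {
///     Widget& w = pool[idx];            // access through the 4-byte index
///     // ... use w ...
///     pool.recycleIndex(idx);           // return the slot to the pool
///   }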
template <
    typename T,
    uint32_t NumLocalLists_ = 32,
    uint32_t LocalListLimit_ = 200,
    template <typename> class Atom = std::atomic,
    typename Traits = IndexedMemPoolTraits<T>>
struct IndexedMemPool : boost::noncopyable {
  typedef T value_type;

  typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>>
      UniquePtr;
  static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");
  enum {
    NumLocalLists = NumLocalLists_,
    LocalListLimit = LocalListLimit_,
  };

  // these are public because clients may need to reason about the number
  // of bits required to hold indices from a pool, given its capacity
  static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) {
    // index of std::numeric_limits<uint32_t>::max() is reserved for
    // isAllocated tracking
    return uint32_t(std::min(
        uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit,
        uint64_t(std::numeric_limits<uint32_t>::max() - 1)));
  }

  static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) {
    return maxIndex - (NumLocalLists - 1) * LocalListLimit;
  }
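
  // Worked example (not from the original header): with the default
  // NumLocalLists = 32 and LocalListLimit = 200, the boost is
  // 31 * 200 = 6200 extra slots, so maxIndexForCapacity(8000) returns
  // 8000 + 6200 = 14200. Indices therefore need 14 bits even though the
  // nominal capacity of 8000 would fit in 13.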

  /// Constructs a pool that can allocate at least _capacity_ elements,
  /// even if all the local lists are full
  explicit IndexedMemPool(uint32_t capacity)
      : actualCapacity_(maxIndexForCapacity(capacity))
      , size_(0)
      , globalHead_(TaggedPtr{})
  {
    const size_t needed = sizeof(Slot) * (actualCapacity_ + 1);
    size_t pagesize = size_t(sysconf(_SC_PAGESIZE));
    mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize;
    assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize);
    assert((mmapLength_ % pagesize) == 0);

    slots_ = static_cast<Slot*>(mmap(nullptr, mmapLength_,
                                     PROT_READ | PROT_WRITE,
                                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
    if (slots_ == MAP_FAILED) {
      assert(errno == ENOMEM);
      throw std::bad_alloc();
    }
  }

  /// Destroys all of the contained elements
  ~IndexedMemPool() {
    for (uint32_t i = maxAllocatedIndex(); i > 0; --i) {
      Traits::cleanup(&slots_[i].elem);
    }
    munmap(slots_, mmapLength_);
  }

  /// Returns a lower bound on the number of elements that may be
  /// simultaneously allocated and not yet recycled. Because of the
  /// local lists it is possible that more elements than this are returned
  /// successfully.
  uint32_t capacity() {
    return capacityForMaxIndex(actualCapacity_);
  }

  /// Returns the maximum index of elements ever allocated in this pool
  /// including elements that have been recycled.
  uint32_t maxAllocatedIndex() const {
    // Take the minimum since it is possible that size_ > actualCapacity_.
    // This can happen if there are multiple concurrent requests
    // when size_ == actualCapacity_ - 1.
    return std::min(uint32_t(size_), uint32_t(actualCapacity_));
  }

  /// Finds a slot with a non-zero index, emplaces a T there if we're
  /// using the eager recycle lifecycle mode, and returns the index,
  /// or returns 0 if no elements are available. Passes a pointer to
  /// the element to Traits::onAllocate before the slot is marked as
  /// allocated.
  template <typename... Args>
  uint32_t allocIndex(Args&&... args) {
    auto idx = localPop(localHead());
    if (idx != 0) {
      Slot& s = slot(idx);
      Traits::onAllocate(&s.elem, std::forward<Args>(args)...);
      markAllocated(s);
    }
    return idx;
  }

  /// If an element is available, returns a std::unique_ptr to it that will
  /// recycle the element to the pool when it is reclaimed, otherwise returns
  /// a null (falsy) std::unique_ptr. Passes a pointer to the element to
  /// Traits::onAllocate before the slot is marked as allocated.
  template <typename... Args>
  UniquePtr allocElem(Args&&... args) {
    auto idx = allocIndex(std::forward<Args>(args)...);
    T* ptr = idx == 0 ? nullptr : &slot(idx).elem;
    return UniquePtr(ptr, typename UniquePtr::deleter_type(this));
  }
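
  // Illustrative sketch (not from the original header): under the default
  // eager-recycle traits for non-trivial T, allocElem has emplace semantics
  // and the returned unique_ptr recycles the slot when it goes out of scope.
  // `arg1`, `arg2`, and `use` are hypothetical.
  //
  //   auto ptr = pool.allocElem(arg1, arg2);  // forwards to T's constructor
  //   if (ptr) {
  //     use(*ptr);
  //   }  // slot is recycled here by the stateful deleter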

  /// Gives up ownership previously granted by allocIndex()
  void recycleIndex(uint32_t idx) {
    assert(isAllocated(idx));
    Traits::onRecycle(&slot(idx).elem);
    localPush(localHead(), idx);
  }

  /// Provides access to the pooled element referenced by idx
  T& operator[](uint32_t idx) {
    return slot(idx).elem;
  }

  /// Provides access to the pooled element referenced by idx
  const T& operator[](uint32_t idx) const {
    return slot(idx).elem;
  }

  /// If elem == &pool[idx], then pool.locateElem(elem) == idx. Also,
  /// pool.locateElem(nullptr) == 0
  uint32_t locateElem(const T* elem) const {
    if (!elem) {
      return 0;
    }

    static_assert(std::is_standard_layout<Slot>::value, "offsetof needs POD");

    auto slot = reinterpret_cast<const Slot*>(
        reinterpret_cast<const char*>(elem) - offsetof(Slot, elem));
    auto rv = uint32_t(slot - slots_);

    // this assert also tests that rv is in range
    assert(elem == &(*this)[rv]);
    return rv;
  }
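
  // Illustrative round trip (not from the original header): locateElem lets
  // the unique_ptr deleter below recover an index from a raw element pointer.
  //
  //   uint32_t idx = pool.allocIndex();
  //   T* p = &pool[idx];
  //   assert(pool.locateElem(p) == idx);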

  /// Returns true iff idx has been allocIndex()ed and not recycleIndex()ed
  bool isAllocated(uint32_t idx) const {
    return slot(idx).localNext.load(std::memory_order_acquire) == uint32_t(-1);
  }

 private:
  ///////////// types

  struct Slot {
    T elem;
    Atom<uint32_t> localNext;
    Atom<uint32_t> globalNext;

    Slot() : localNext{}, globalNext{} {}
  };

  struct TaggedPtr {
    uint32_t idx;

    // size is bottom 8 bits, tag in top 24. g++'s code generation for
    // bitfields seems to depend on the phase of the moon, plus we can
    // do better because we can rely on other checks to avoid masking
    uint32_t tagAndSize;

    enum : uint32_t {
      SizeBits = 8,
      SizeMask = (1U << SizeBits) - 1,
      TagIncr = 1U << SizeBits,
    };
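
    // Layout sketch (not from the original header): with SizeBits = 8,
    // SizeMask is 0xff and TagIncr is 0x100, so e.g. tagAndSize = 0x305
    // encodes a list of size 5 under tag 3; withIdx() bumps the tag, which
    // is what defeats ABA on the head CAS.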

    uint32_t size() const {
      return tagAndSize & SizeMask;
    }

    TaggedPtr withSize(uint32_t repl) const {
      assert(repl <= LocalListLimit);
      return TaggedPtr{ idx, (tagAndSize & ~SizeMask) | repl };
    }

    TaggedPtr withSizeIncr() const {
      assert(size() < LocalListLimit);
      return TaggedPtr{ idx, tagAndSize + 1 };
    }

    TaggedPtr withSizeDecr() const {
      assert(size() > 0);
      return TaggedPtr{ idx, tagAndSize - 1 };
    }

    TaggedPtr withIdx(uint32_t repl) const {
      return TaggedPtr{ repl, tagAndSize + TagIncr };
    }

    TaggedPtr withEmpty() const {
      return withIdx(0).withSize(0);
    }
  };

  struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING LocalList {
    AtomicStruct<TaggedPtr,Atom> head;

    LocalList() : head(TaggedPtr{}) {}
  };

  ////////// fields

  /// the number of bytes allocated from mmap, which is a multiple of
  /// the page size of the machine
  size_t mmapLength_;

  /// the actual number of slots that we will allocate, to guarantee
  /// that we will satisfy the capacity requested at construction time.
  /// They will be numbered 1..actualCapacity_ (note the 1-based counting),
  /// and occupy slots_[1..actualCapacity_].
  uint32_t actualCapacity_;

  /// this records the number of slots that have actually been constructed.
  /// To allow use of atomic ++ instead of CAS, we let this overflow.
  /// The actual number of constructed elements is min(actualCapacity_,
  /// size_).
  Atom<uint32_t> size_;

  /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
  /// actually constructed. Note that slots_[0] is not constructed or used
  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING Slot* slots_;

  /// use AccessSpreader to find your list. We use stripes instead of
  /// thread-local to avoid the need to grow or shrink on thread start
  /// or join. These are heads of lists chained with localNext
  LocalList local_[NumLocalLists];

  /// this is the head of a list of nodes chained by globalNext, that are
  /// themselves each the head of a list chained by localNext
  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING AtomicStruct<TaggedPtr,Atom> globalHead_;

  ///////////// private methods

  uint32_t slotIndex(uint32_t idx) const {
    assert(0 < idx &&
           idx <= actualCapacity_ &&
           idx <= size_.load(std::memory_order_acquire));
    return idx;
  }

  Slot& slot(uint32_t idx) {
    return slots_[slotIndex(idx)];
  }

  const Slot& slot(uint32_t idx) const {
    return slots_[slotIndex(idx)];
  }

  // localHead references a full list chained by localNext. s should
  // reference slot(localHead), it is passed as a micro-optimization
  void globalPush(Slot& s, uint32_t localHead) {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      s.globalNext.store(gh.idx, std::memory_order_relaxed);
      if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) {
        // success
        return;
      }
    }
  }

  // idx references a single node
  void localPush(AtomicStruct<TaggedPtr,Atom>& head, uint32_t idx) {
    Slot& s = slot(idx);
    TaggedPtr h = head.load(std::memory_order_acquire);
    while (true) {
      s.localNext.store(h.idx, std::memory_order_relaxed);

      if (h.size() == LocalListLimit) {
        // push will overflow local list, steal it instead
        if (head.compare_exchange_strong(h, h.withEmpty())) {
          // steal was successful, put everything in the global list
          globalPush(s, idx);
          return;
        }
      } else {
        // local list has space
        if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) {
          // success
          return;
        }
      }
      // h was updated by failing CAS
    }
  }

  // returns 0 if empty
  uint32_t globalPop() {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      if (gh.idx == 0 ||
          globalHead_.compare_exchange_strong(
              gh,
              gh.withIdx(
                  slot(gh.idx).globalNext.load(std::memory_order_relaxed)))) {
        // global list is empty, or pop was successful
        return gh.idx;
      }
    }
  }
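
  // Note (added for clarity; not in the original header): localPop below
  // tries three sources in order: pop from this stripe's local list, then
  // claim an entire chain from the global stack, and finally construct a
  // never-used slot by bumping size_.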

  // returns 0 if allocation failed
  uint32_t localPop(AtomicStruct<TaggedPtr,Atom>& head) {
    while (true) {
      TaggedPtr h = head.load(std::memory_order_acquire);
      if (h.idx != 0) {
        // local list is non-empty, try to pop
        Slot& s = slot(h.idx);
        auto next = s.localNext.load(std::memory_order_relaxed);
        if (head.compare_exchange_strong(h, h.withIdx(next).withSizeDecr())) {
          // success
          return h.idx;
        }
        continue;
      }

      uint32_t idx = globalPop();
      if (idx == 0) {
        // global list is empty, allocate and construct new slot
        if (size_.load(std::memory_order_relaxed) >= actualCapacity_ ||
            (idx = ++size_) > actualCapacity_) {
          // allocation failed
          return 0;
        }
        Traits::initialize(&slot(idx).elem);
        return idx;
      }

      Slot& s = slot(idx);
      auto next = s.localNext.load(std::memory_order_relaxed);
      if (head.compare_exchange_strong(
              h, h.withIdx(next).withSize(LocalListLimit))) {
        // global list moved to local list, keep head for us
        return idx;
      }
      // local bulk push failed, return idx to the global list and try again
      globalPush(s, idx);
    }
  }

  AtomicStruct<TaggedPtr,Atom>& localHead() {
    auto stripe = detail::AccessSpreader<Atom>::current(NumLocalLists);
    return local_[stripe].head;
  }

  void markAllocated(Slot& slot) {
    slot.localNext.store(uint32_t(-1), std::memory_order_release);
  }
};

namespace detail {

/// This is a stateful Deleter functor, which allows std::unique_ptr
/// to track elements allocated from an IndexedMemPool by tracking the
/// associated pool. See IndexedMemPool::allocElem.
template <typename Pool>
struct IndexedMemPoolRecycler {
  Pool* pool;

  explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {}

  IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs)
      = default;
  IndexedMemPoolRecycler& operator= (const IndexedMemPoolRecycler<Pool>& rhs)
      = default;

  void operator()(typename Pool::value_type* elem) const {
    pool->recycleIndex(pool->locateElem(elem));
  }
};

} // namespace detail
} // namespace folly

FOLLY_POP_WARNING