/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <atomic>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>

#include <boost/noncopyable.hpp>

#include <folly/AtomicStruct.h>
#include <folly/detail/CacheLocality.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>

// Ignore shadowing warnings within this file, so includers can use -Wshadow.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"

namespace folly {

namespace detail {
template <typename Pool>
struct IndexedMemPoolRecycler;
}

/// Instances of IndexedMemPool dynamically allocate and then pool their
/// element type (T), returning 4-byte integer indices that can be passed
/// to the pool's operator[] method to access or obtain pointers to the
/// actual elements.  The memory backing items returned from the pool
/// will always be readable, even if items have been returned to the pool.
/// These two features are useful for lock-free algorithms.  The indexing
/// behavior makes it easy to build tagged pointer-like things, since
/// a large number of elements can be managed using fewer bits than a
/// full pointer.  The access-after-free behavior makes it safe to read
/// from T-s even after they have been recycled, since it is guaranteed
/// that the memory won't have been returned to the OS and unmapped
/// (the algorithm must still use a mechanism to validate that the read
/// was correct, but it doesn't have to worry about page faults), and if
/// the elements use internal sequence numbers it can be guaranteed that
/// there won't be an ABA match due to the element being overwritten with
/// a different type that has the same bit pattern.
///
/// IndexedMemPool has two object lifecycle strategies.  The first
/// is to construct objects when they are allocated from the pool and
/// destroy them when they are recycled.  In this mode allocIndex and
/// allocElem have emplace-like semantics.  In the second mode, objects
/// are default-constructed the first time they are removed from the pool,
/// and destroyed when the pool itself is destroyed.  By default the first
/// mode is used for non-trivial T, and the second is used for trivial T.
///
/// IMPORTANT: Space for extra elements is allocated to account for those
/// that are inaccessible because they are in other local lists, so the
/// actual number of items that can be allocated ranges from capacity to
/// capacity + (NumLocalLists_-1)*LocalListLimit_.  This is important if
/// you are trying to maximize the capacity of the pool while constraining
/// the bit size of the resulting pointers, because the pointers will
/// actually range up to the boosted capacity.  See maxIndexForCapacity
/// and capacityForMaxIndex.
///
/// To avoid contention, NumLocalLists_ free lists of limited (less than
/// or equal to LocalListLimit_) size are maintained, and each thread
/// retrieves and returns entries from its associated local list.  If the
/// local list becomes too large then elements are placed in bulk in a
/// global free list.  This allows items to be efficiently recirculated
/// from consumers to producers.  AccessSpreader is used to access the
/// local lists, so there is no performance advantage to having more
/// local lists than L1 caches.
///
/// The pool mmap-s the entire necessary address space when the pool is
/// constructed, but delays element construction.  This means that only
/// elements that are actually returned to the caller get paged into the
/// process's resident set (RSS).
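///
/// A minimal usage sketch (illustrative only; Widget stands in for any
/// default-constructible element type supplied by the caller):
///
///   folly::IndexedMemPool<Widget> pool(1000);
///   uint32_t idx = pool.allocIndex();      // 0 means the pool is exhausted
///   if (idx != 0) {
///     Widget& w = pool[idx];               // access the element by index
///     assert(pool.locateElem(&w) == idx);  // map a pointer back to its index
///     pool.recycleIndex(idx);              // return the slot to the pool
///   }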
template <typename T,
          int NumLocalLists_ = 32,
          int LocalListLimit_ = 200,
          template <typename> class Atom = std::atomic,
          bool EagerRecycleWhenTrivial = false,
          bool EagerRecycleWhenNotTrivial = true>
struct IndexedMemPool : boost::noncopyable {
  typedef T value_type;

  typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>>
      UniquePtr;

  static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");
  enum {
    NumLocalLists = NumLocalLists_,
    LocalListLimit = LocalListLimit_
  };

  static constexpr bool eagerRecycle() {
    return std::is_trivial<T>::value
        ? EagerRecycleWhenTrivial : EagerRecycleWhenNotTrivial;
  }

  // these are public because clients may need to reason about the number
  // of bits required to hold indices from a pool, given its capacity

  static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) {
    // index of uint32_t(-1) == UINT32_MAX is reserved for isAllocated tracking
    return std::min(uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit,
                    uint64_t(uint32_t(-1) - 1));
  }

  static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) {
    return maxIndex - (NumLocalLists - 1) * LocalListLimit;
  }
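
  // For example, with the default template parameters (NumLocalLists_ = 32,
  // LocalListLimit_ = 200) a pool constructed with capacity 1000 may hand out
  // indices as large as maxIndexForCapacity(1000) == 1000 + 31 * 200 == 7200,
  // so callers packing indices into tagged words need 13 bits rather than 10.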

  /// Constructs a pool that can allocate at least _capacity_ elements,
  /// even if all the local lists are full
  explicit IndexedMemPool(uint32_t capacity)
    : actualCapacity_(maxIndexForCapacity(capacity))
    , size_(0)
    , globalHead_(TaggedPtr{})
  {
    const size_t needed = sizeof(Slot) * (actualCapacity_ + 1);
    size_t pagesize = sysconf(_SC_PAGESIZE);
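    // round the mapping up to a whole number of pages; e.g. with 4096-byte
    // pages a request of 10000 bytes becomes a 12288-byte mapping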
    mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize;
    assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize);
    assert((mmapLength_ % pagesize) == 0);

    slots_ = static_cast<Slot*>(mmap(nullptr, mmapLength_,
                                     PROT_READ | PROT_WRITE,
                                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
    if (slots_ == MAP_FAILED) {
      assert(errno == ENOMEM);
      throw std::bad_alloc();
    }
  }

  /// Destroys all of the contained elements
  ~IndexedMemPool() {
    if (!eagerRecycle()) {
      // only slots 1..min(size_, actualCapacity_) were ever constructed
      const size_t constructed = std::min(size_t(size_), actualCapacity_);
      for (size_t i = constructed; i > 0; --i) {
        slots_[i].~Slot();
      }
    }
    munmap(slots_, mmapLength_);
  }

  /// Returns a lower bound on the number of elements that may be
  /// simultaneously allocated and not yet recycled.  Because of the
  /// local lists it is possible that more elements than this are
  /// returned successfully.
  uint32_t capacity() {
    return capacityForMaxIndex(actualCapacity_);
  }

  /// Finds a slot with a non-zero index, emplaces a T there if we're
  /// using the eager recycle lifecycle mode, and returns the index,
  /// or returns 0 if no elements are available.
  template <typename ...Args>
  uint32_t allocIndex(Args&&... args) {
    static_assert(sizeof...(Args) == 0 || eagerRecycle(),
        "emplace-style allocation requires eager recycle, "
        "which is defaulted only for non-trivial types");
    auto idx = localPop(localHead());
    if (idx != 0 && eagerRecycle()) {
      T* ptr = &slot(idx).elem;
      new (ptr) T(std::forward<Args>(args)...);
    }
    return idx;
  }

  /// If an element is available, returns a std::unique_ptr to it that will
  /// recycle the element to the pool when it is reclaimed, otherwise returns
  /// a null (falsy) std::unique_ptr
  template <typename ...Args>
  UniquePtr allocElem(Args&&... args) {
    auto idx = allocIndex(std::forward<Args>(args)...);
    T* ptr = idx == 0 ? nullptr : &slot(idx).elem;
    return UniquePtr(ptr, typename UniquePtr::deleter_type(this));
  }

  /// Gives up ownership previously granted by allocIndex() or allocElem()
  void recycleIndex(uint32_t idx) {
    assert(isAllocated(idx));
    if (eagerRecycle()) {
      slot(idx).elem.~T();
    }
    localPush(localHead(), idx);
  }

  /// Provides access to the pooled element referenced by idx
  T& operator[](uint32_t idx) {
    return slot(idx).elem;
  }

  /// Provides access to the pooled element referenced by idx
  const T& operator[](uint32_t idx) const {
    return slot(idx).elem;
  }

  /// If elem == &pool[idx], then pool.locateElem(elem) == idx.  Also,
  /// pool.locateElem(nullptr) == 0
  uint32_t locateElem(const T* elem) const {
    if (!elem) {
      return 0;
    }

    static_assert(std::is_standard_layout<Slot>::value, "offsetof needs POD");

    auto slot = reinterpret_cast<const Slot*>(
        reinterpret_cast<const char*>(elem) - offsetof(Slot, elem));
    auto rv = slot - slots_;

    // this assert also tests that rv is in range
    assert(elem == &(*this)[rv]);
    return rv;
  }

  /// Returns true iff idx has been allocated (via allocIndex() or allocElem())
  /// and not yet recycled
  bool isAllocated(uint32_t idx) const {
    return slot(idx).localNext == uint32_t(-1);
  }

 private:
  ///////////// types

  struct Slot {
    T elem;
    uint32_t localNext;   // next slot in a local free list; uint32_t(-1) when allocated
    uint32_t globalNext;  // chains list heads on the global free list

    Slot() : localNext{}, globalNext{} {}
  };

  struct TaggedPtr {
    uint32_t idx;

    // size is bottom 8 bits, tag in top 24.  g++'s code generation for
    // bitfields seems to depend on the phase of the moon, plus we can
    // do better because we can rely on other checks to avoid masking
    uint32_t tagAndSize;

    enum : uint32_t {
      SizeBits = 8,
      SizeMask = (1U << SizeBits) - 1,
      TagIncr = 1U << SizeBits,
    };

    uint32_t size() const {
      return tagAndSize & SizeMask;
    }

    TaggedPtr withSize(uint32_t repl) const {
      assert(repl <= LocalListLimit);
      return TaggedPtr{ idx, (tagAndSize & ~SizeMask) | repl };
    }

    TaggedPtr withSizeIncr() const {
      assert(size() < LocalListLimit);
      return TaggedPtr{ idx, tagAndSize + 1 };
    }

    TaggedPtr withSizeDecr() const {
      assert(size() > 0);
      return TaggedPtr{ idx, tagAndSize - 1 };
    }

    TaggedPtr withIdx(uint32_t repl) const {
      return TaggedPtr{ repl, tagAndSize + TagIncr };
    }

    TaggedPtr withEmpty() const {
      return withIdx(0).withSize(0);
    }
  };
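
  // Every withIdx() bumps the tag in the upper 24 bits, so each CAS that
  // changes a list head also changes the tag.  This is what protects the
  // lock-free local and global lists from ABA when a slot is popped,
  // recycled, and pushed back with the same index.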

  struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING LocalList {
    AtomicStruct<TaggedPtr,Atom> head;

    LocalList() : head(TaggedPtr{}) {}
  };

  /// the actual number of slots that we will allocate, to guarantee
  /// that we will satisfy the capacity requested at construction time.
  /// They will be numbered 1..actualCapacity_ (note the 1-based counting),
  /// and occupy slots_[1..actualCapacity_].
  size_t actualCapacity_;

  /// the number of bytes allocated from mmap, which is a multiple of
  /// the page size of the machine
  size_t mmapLength_;

  /// this records the number of slots that have actually been constructed.
  /// To allow use of atomic ++ instead of CAS, we let this overflow.
  /// The actual number of constructed elements is min(actualCapacity_, size_)
  Atom<uint32_t> size_;

  /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
  /// actually constructed.  Note that slots_[0] is not constructed or used
  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING Slot* slots_;

  /// use AccessSpreader to find your list.  We use stripes instead of
  /// thread-local to avoid the need to grow or shrink on thread start
  /// or join.  These are heads of lists chained with localNext
  LocalList local_[NumLocalLists];

  /// this is the head of a list of nodes chained by globalNext, each of
  /// which is itself the head of a list chained by localNext
  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING AtomicStruct<TaggedPtr,Atom> globalHead_;

  ///////////// private methods

  size_t slotIndex(uint32_t idx) const {
    assert(0 < idx &&
           idx <= actualCapacity_ &&
           idx <= size_.load(std::memory_order_acquire));
    return idx;
  }

  Slot& slot(uint32_t idx) {
    return slots_[slotIndex(idx)];
  }

  const Slot& slot(uint32_t idx) const {
    return slots_[slotIndex(idx)];
  }

  // localHead references a full list chained by localNext.  s should
  // reference slot(localHead), it is passed as a micro-optimization
  void globalPush(Slot& s, uint32_t localHead) {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      s.globalNext = gh.idx;
      if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) {
        // success
        return;
      }
    }
  }

  // idx references a single node
  void localPush(AtomicStruct<TaggedPtr,Atom>& head, uint32_t idx) {
    while (true) {
      TaggedPtr h = head.load(std::memory_order_acquire);
      slot(idx).localNext = h.idx;

      if (h.size() == LocalListLimit) {
        // push will overflow local list, steal it instead
        if (head.compare_exchange_strong(h, h.withEmpty())) {
          // steal was successful, put everything in the global list
          globalPush(slot(idx), idx);
          return;
        }
      } else {
        // local list has space
        if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) {
          // success
          return;
        }
      }
      // h was updated by failing CAS
    }
  }

  // returns 0 if empty
  uint32_t globalPop() {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      if (gh.idx == 0 || globalHead_.compare_exchange_strong(
              gh, gh.withIdx(slot(gh.idx).globalNext))) {
        // global list is empty, or pop was successful
        return gh.idx;
      }
    }
  }
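
  // localPop tries the calling thread's local list first, then attempts to
  // claim an entire chain from the global list, and only constructs a brand
  // new slot when both are empty; it returns 0 once all actualCapacity_
  // slots are already in use.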

  // returns 0 if allocation failed
  uint32_t localPop(AtomicStruct<TaggedPtr,Atom>& head) {
    while (true) {
      TaggedPtr h = head.load(std::memory_order_acquire);
      if (h.idx != 0) {
        // local list is non-empty, try to pop
        Slot& s = slot(h.idx);
        if (head.compare_exchange_strong(
                h, h.withIdx(s.localNext).withSizeDecr())) {
          // success
          s.localNext = uint32_t(-1);
          return h.idx;
        }
        continue;
      }

      uint32_t idx = globalPop();
      if (idx == 0) {
        // global list is empty, allocate and construct new slot
        if (size_.load(std::memory_order_relaxed) >= actualCapacity_ ||
            (idx = ++size_) > actualCapacity_) {
          // allocation failed
          return 0;
        }
        // default-construct it now if we aren't going to construct and
        // destroy on each allocation
        if (!eagerRecycle()) {
          T* ptr = &slot(idx).elem;
          new (ptr) T();
        }
        slot(idx).localNext = uint32_t(-1);
        return idx;
      }

      Slot& s = slot(idx);
      if (head.compare_exchange_strong(
              h, h.withIdx(s.localNext).withSize(LocalListLimit))) {
        // global list moved to local list, keep head for us
        s.localNext = uint32_t(-1);
        return idx;
      }
      // local bulk push failed, return idx to the global list and try again
      globalPush(s, idx);
    }
  }

  AtomicStruct<TaggedPtr,Atom>& localHead() {
    auto stripe = detail::AccessSpreader<Atom>::current(NumLocalLists);
    return local_[stripe].head;
  }
};

namespace detail {

/// This is a stateful Deleter functor, which allows std::unique_ptr
/// to recycle elements allocated from an IndexedMemPool by remembering
/// the associated pool.  See IndexedMemPool::allocElem.
template <typename Pool>
struct IndexedMemPoolRecycler {
  Pool* pool;

  explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {}

  IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs) = default;
  IndexedMemPoolRecycler& operator= (const IndexedMemPoolRecycler<Pool>& rhs)
      = default;

  void operator()(typename Pool::value_type* elem) const {
    pool->recycleIndex(pool->locateElem(elem));
  }
};

} // namespace detail
} // namespace folly

# pragma GCC diagnostic pop