/*
 * Copyright 2014 Facebook, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef FOLLY_INDEXEDMEMPOOL_H
#define FOLLY_INDEXEDMEMPOOL_H

#include <algorithm>
#include <atomic>
#include <memory>
#include <type_traits>
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>
#include <boost/noncopyable.hpp>
#include <folly/AtomicStruct.h>
#include <folly/detail/CacheLocality.h>
template <typename Pool>
struct IndexedMemPoolRecycler;
/// Instances of IndexedMemPool dynamically allocate and then pool
/// their element type (T), returning 4-byte integer indices that can be
/// passed to the pool's operator[] method to access or obtain pointers
/// to the actual elements. Once they are constructed, elements are
/// never destroyed. These two features are useful for lock-free
/// algorithms. The indexing behavior makes it easy to build tagged
/// pointer-like-things, since a large number of elements can be managed
/// using fewer bits than a full pointer. The pooling behavior makes
/// it safe to read from T-s even after they have been recycled, since
/// it is guaranteed that the memory won't have been returned to the OS
/// and unmapped (the algorithm must still use a mechanism to validate
/// that the read was correct, but it doesn't have to worry about page
/// faults), and if the elements use internal sequence numbers it can be
/// guaranteed that there won't be an ABA match due to the element being
/// overwritten with a different type that has the same bit pattern.
///
/// IMPORTANT: Space for extra elements is allocated to account for those
/// that are inaccessible because they are in other local lists, so the
/// actual number of items that can be allocated ranges from capacity to
/// capacity + (NumLocalLists_-1)*LocalListLimit_. This is important if
/// you are trying to maximize the capacity of the pool while constraining
/// the bit size of the resulting pointers, because the pointers will
/// actually range up to the boosted capacity. See maxIndexForCapacity
/// and capacityForMaxIndex.
///
/// To avoid contention, NumLocalLists_ free lists of limited (less than
/// or equal to LocalListLimit_) size are maintained, and each thread
/// retrieves and returns entries from its associated local list. If the
/// local list becomes too large then elements are placed in bulk in a
/// global free list. This allows items to be efficiently recirculated
/// from consumers to producers. AccessSpreader is used to access the
/// local lists, so there is no performance advantage to having more
/// local lists than L1 caches.
///
/// The pool mmap-s the entire necessary address space when the pool is
/// constructed, but delays element construction. This means that only
/// elements that are actually returned to the caller get paged into the
/// process's resident set (RSS).
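///
/// A minimal usage sketch (illustrative only, not part of the original
/// documentation; assumes the default template parameters):
///
///   IndexedMemPool<int> pool(100);        // room for at least 100 elements
///   uint32_t idx = pool.allocIndex();     // 0 means the pool is exhausted
///   if (idx != 0) {
///     pool[idx] = 42;                     // access the pooled element
///     pool.recycleIndex(idx);             // return it for reuse
///   }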
template <typename T,
          int NumLocalLists_ = 32,
          int LocalListLimit_ = 200,
          template<typename> class Atom = std::atomic>
struct IndexedMemPool : boost::noncopyable {
  typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>>
      UniquePtr;
  static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");

    NumLocalLists = NumLocalLists_,
    LocalListLimit = LocalListLimit_
  // these are public because clients may need to reason about the number
  // of bits required to hold indices from a pool, given its capacity
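  //
  // For example (an illustrative calculation, assuming the default
  // NumLocalLists_ = 32 and LocalListLimit_ = 200):
  //   maxIndexForCapacity(1000) == 1000 + 31 * 200 == 7200,
  // so indices for a pool with capacity 1000 need 13 bits rather than 10.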
  static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) {
    // index of uint32_t(-1) == UINT32_MAX is reserved for isAllocated tracking
    return std::min(uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit,
                    uint64_t(uint32_t(-1) - 1));

  static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) {
    return maxIndex - (NumLocalLists - 1) * LocalListLimit;
  /// Constructs a pool that can allocate at least _capacity_ elements,
  /// even if all the local lists are full
  explicit IndexedMemPool(uint32_t capacity)
    : actualCapacity_(maxIndexForCapacity(capacity))
    , globalHead_(TaggedPtr{})
    const size_t needed = sizeof(Slot) * (actualCapacity_ + 1);
    long pagesize = sysconf(_SC_PAGESIZE);
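    // (descriptive note) the next line rounds `needed` up to the smallest
    // multiple of the page size that is >= needed; the asserts that follow
    // double-check that rounding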
    mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize;
    assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize);
    assert((mmapLength_ % pagesize) == 0);
    slots_ = static_cast<Slot*>(mmap(nullptr, mmapLength_,
                                     PROT_READ | PROT_WRITE,
                                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
    if (slots_ == MAP_FAILED) {
      assert(errno == ENOMEM);
      throw std::bad_alloc();
  /// Destroys all of the contained elements
    for (size_t i = size_; i > 0; --i) {

    munmap(slots_, mmapLength_);
  /// Returns a lower bound on the number of elements that may be
  /// simultaneously allocated and not yet recycled. Because of the
  /// local lists it is possible that more elements than this are returned
  /// successfully.
    return capacityForMaxIndex(actualCapacity_);
  /// Grants ownership of (*this)[retval], or returns 0 if no elements
  /// are available
  uint32_t allocIndex() {
    return localPop(localHead());
  /// If an element is available, returns a std::unique_ptr to it that will
  /// recycle the element to the pool when it is reclaimed, otherwise returns
  /// a null (falsy) std::unique_ptr
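  ///
  /// Usage sketch (illustrative, not from the original comments):
  ///
  ///   auto ptr = pool.allocElem();
  ///   if (ptr) {
  ///     *ptr = 42;  // use the element; it is recycled when ptr is destroyed
  ///   }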
  UniquePtr allocElem() {
    auto idx = allocIndex();
    auto ptr = idx == 0 ? nullptr : &slot(idx).elem;
    return UniquePtr(ptr, typename UniquePtr::deleter_type(this));
  /// Gives up ownership previously granted by allocIndex()
  void recycleIndex(uint32_t idx) {
    assert(isAllocated(idx));
    localPush(localHead(), idx);
  /// Provides access to the pooled element referenced by idx
  T& operator[](uint32_t idx) {
    return slot(idx).elem;

  /// Provides access to the pooled element referenced by idx
  const T& operator[](uint32_t idx) const {
    return slot(idx).elem;
  /// If elem == &pool[idx], then pool.locateElem(elem) == idx. Also,
  /// pool.locateElem(nullptr) == 0
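  ///
  /// For instance (an illustrative invariant, not original documentation):
  ///
  ///   uint32_t i = pool.allocIndex();
  ///   assert(i == 0 || pool.locateElem(&pool[i]) == i);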
  uint32_t locateElem(const T* elem) const {

    static_assert(std::is_standard_layout<Slot>::value,
                  "offsetof requires a standard-layout type");

    auto slot = reinterpret_cast<const Slot*>(
        reinterpret_cast<const char*>(elem) - offsetof(Slot, elem));
    auto rv = slot - slots_;

    // this assert also tests that rv is in range
    assert(elem == &(*this)[rv]);
  /// Returns true iff idx has been allocated via allocIndex() or allocElem()
  /// and hasn't yet been recycled
  bool isAllocated(uint32_t idx) const {
    return slot(idx).localNext == uint32_t(-1);
    Slot() : localNext{}, globalNext{} {}
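    // (descriptive note) while a slot is owned by a caller, localNext holds
    // the sentinel uint32_t(-1); isAllocated() tests for that value, and
    // localPop() re-establishes it each time a slot is handed out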
    // size is bottom 8 bits, tag in top 24. g++'s code generation for
    // bitfields seems to depend on the phase of the moon, plus we can
    // do better because we can rely on other checks to avoid masking
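    // (descriptive note) the tag is bumped by TagIncr on every withIdx(),
    // so a head that has been popped and pushed again compares unequal in
    // compare_exchange; that is what protects the free lists from ABA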
      SizeMask = (1U << SizeBits) - 1,
      TagIncr = 1U << SizeBits,

    uint32_t size() const {
      return tagAndSize & SizeMask;

    TaggedPtr withSize(uint32_t repl) const {
      assert(repl <= LocalListLimit);
      return TaggedPtr{ idx, (tagAndSize & ~SizeMask) | repl };

    TaggedPtr withSizeIncr() const {
      assert(size() < LocalListLimit);
      return TaggedPtr{ idx, tagAndSize + 1 };

    TaggedPtr withSizeDecr() const {
      return TaggedPtr{ idx, tagAndSize - 1 };

    TaggedPtr withIdx(uint32_t repl) const {
      return TaggedPtr{ repl, tagAndSize + TagIncr };

    TaggedPtr withEmpty() const {
      return withIdx(0).withSize(0);
  struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING LocalList {
    AtomicStruct<TaggedPtr,Atom> head;

    LocalList() : head(TaggedPtr{}) {}
  /// the actual number of slots that we will allocate, to guarantee
  /// that we will satisfy the capacity requested at construction time.
  /// They will be numbered 1..actualCapacity_ (note the 1-based counting),
  /// and occupy slots_[1..actualCapacity_].
  size_t actualCapacity_;
  /// the number of bytes allocated from mmap, which is a multiple of
  /// the page size of the machine
  /// this records the number of slots that have actually been constructed.
  /// To allow use of atomic ++ instead of CAS, we let this overflow.
  /// The actual number of constructed elements is min(actualCapacity_,
  /// size_).
  std::atomic<uint32_t> size_;
  /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
  /// actually constructed. Note that slots_[0] is not constructed or used
  Slot* FOLLY_ALIGN_TO_AVOID_FALSE_SHARING slots_;
  /// use AccessSpreader to find your list. We use stripes instead of
  /// thread-local to avoid the need to grow or shrink on thread start
  /// or join. These are heads of lists chained with localNext
  LocalList local_[NumLocalLists];
  /// this is the head of a list of nodes chained by globalNext, each of
  /// which is itself the head of a list chained by localNext
  AtomicStruct<TaggedPtr,Atom> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING globalHead_;
  ///////////// private methods
  size_t slotIndex(uint32_t idx) const {
    assert(0 < idx &&
           idx <= actualCapacity_ &&
           idx <= size_.load(std::memory_order_acquire));
  Slot& slot(uint32_t idx) {
    return slots_[slotIndex(idx)];

  const Slot& slot(uint32_t idx) const {
    return slots_[slotIndex(idx)];
  // localHead references a full list chained by localNext. s should
  // reference slot(localHead); it is passed as a micro-optimization
  void globalPush(Slot& s, uint32_t localHead) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      s.globalNext = gh.idx;
      if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) {
  // idx references a single node
  void localPush(AtomicStruct<TaggedPtr,Atom>& head, uint32_t idx) {
    TaggedPtr h = head.load(std::memory_order_acquire);

      if (h.size() == LocalListLimit) {
        // push will overflow local list, steal it instead
        if (head.compare_exchange_strong(h, h.withEmpty())) {
          // steal was successful, put everything in the global list

      // local list has space
      if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) {

      // h was updated by failing CAS
  // returns 0 if empty
  uint32_t globalPop() {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      if (gh.idx == 0 || globalHead_.compare_exchange_strong(
              gh, gh.withIdx(slot(gh.idx).globalNext))) {
        // global list is empty, or pop was successful
  // returns 0 if allocation failed
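  //
  // (descriptive note) allocation proceeds in three stages: pop from the
  // caller's local list if it is non-empty, otherwise pull a whole chain
  // from the global list, and only if that is also empty construct a brand
  // new slot, provided actualCapacity_ has not yet been reached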
  uint32_t localPop(AtomicStruct<TaggedPtr,Atom>& head) {
      TaggedPtr h = head.load(std::memory_order_acquire);

        // local list is non-empty, try to pop
        Slot& s = slot(h.idx);
        if (head.compare_exchange_strong(
                h, h.withIdx(s.localNext).withSizeDecr())) {
          s.localNext = uint32_t(-1);

      uint32_t idx = globalPop();

        // global list is empty, allocate and construct new slot
        if (size_.load(std::memory_order_relaxed) >= actualCapacity_ ||
            (idx = ++size_) > actualCapacity_) {

        new (&slot(idx)) T();
        slot(idx).localNext = uint32_t(-1);

      if (head.compare_exchange_strong(
              h, h.withIdx(s.localNext).withSize(LocalListLimit))) {
        // global list moved to local list, keep head for us
        s.localNext = uint32_t(-1);

      // local bulk push failed, return idx to the global list and try again
  AtomicStruct<TaggedPtr,Atom>& localHead() {
    auto stripe = detail::AccessSpreader<Atom>::current(NumLocalLists);
    return local_[stripe].head;
/// This is a stateful Deleter functor, which allows std::unique_ptr
/// to track elements allocated from an IndexedMemPool by tracking the
/// associated pool. See IndexedMemPool::allocElem.
template <typename Pool>
struct IndexedMemPoolRecycler {

  explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {}
  IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs) = default;
  IndexedMemPoolRecycler& operator= (const IndexedMemPoolRecycler<Pool>& rhs)
      = default;
  void operator()(typename Pool::value_type* elem) const {
    pool->recycleIndex(pool->locateElem(elem));