2 * Copyright 2014 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 // @author: Xin Liu <xliux@fb.com>
19 // A concurrent skip list (CSL) implementation.
20 // Ref: http://www.cs.tau.ac.il/~shanir/nir-pubs-web/Papers/OPODIS2006-BA.pdf
24 This implements a sorted associative container that supports only
25 unique keys. (Similar to std::set.)
29 1. Small memory overhead: ~40% less memory overhead compared with
30 std::set (1.6 words per node versus 3). It has an minimum of 4
31 words (7 words if there nodes got deleted) per-list overhead
34 2. Read accesses (count, find iterator, skipper) are lock-free and
35 mostly wait-free (the only wait a reader may need to do is when
36 the node it is visiting is in a pending stage, i.e. deleting,
37 adding and not fully linked). Write accesses (remove, add) need
38 to acquire locks, but locks are local to the predecessor nodes
39 and/or successor nodes.
41 3. Good high contention performance, comparable single-thread
42 performance. In the multithreaded case (12 workers), CSL tested
43 10x faster than a RWSpinLocked std::set for an averaged sized
46 Comparable read performance to std::set when single threaded,
47 especially when the list size is large, and scales better to
48 larger lists: when the size is small, CSL can be 20-50% slower on
49 find()/contains(). As the size gets large (> 1M elements),
50 find()/contains() can be 30% faster.
52 Iterating through a skiplist is similar to iterating through a
53 linked list, thus is much (2-6x) faster than on a std::set
54 (tree-based). This is especially true for short lists due to
55 better cache locality. Based on that, it's also faster to
56 intersect two skiplists.
58 4. Lazy removal with GC support. The removed nodes get deleted when
59 the last Accessor to the skiplist is destroyed.
63 1. Write operations are usually 30% slower than std::set in a single
66 2. Need to have a head node for each list, which has a 4 word
69 3. When the list is quite small (< 1000 elements), single threaded
70 benchmarks show CSL can be 10x slower than std:set.
72 4. The interface requires using an Accessor to access the skiplist.
75 5. Currently x64 only, due to use of MicroSpinLock.
77 6. Freed nodes will not be reclaimed as long as there are ongoing
82 typedef ConcurrentSkipList<int> SkipListT;
83 shared_ptr<SkipListT> sl(SkipListT::createInstance(init_head_height);
85 // It's usually good practice to hold an accessor only during
86 // its necessary life cycle (but not in a tight loop as
87 // Accessor creation incurs ref-counting overhead).
89 // Holding it longer delays garbage-collecting the deleted
91 SkipListT::Accessor accessor(sl);
94 for (auto &elem : accessor) {
95 // use elem to access data
100 Another useful type is the Skipper accessor. This is useful if you
101 want to skip to locations in the way std::lower_bound() works,
102 i.e. it can be used for going through the list by skipping to the
103 node no less than a specified key. The Skipper keeps its location as
104 state, which makes it convenient for things like implementing
105 intersection of two sets efficiently, as it can start from the last
109 SkipListT::Accessor accessor(sl);
110 SkipListT::Skipper skipper(accessor);
113 CHECK_LE(30, *skipper);
116 // GC may happen when the accessor gets destructed.
120 #ifndef FOLLY_CONCURRENT_SKIP_LIST_H_
121 #define FOLLY_CONCURRENT_SKIP_LIST_H_
127 #include <type_traits>
128 #include <boost/iterator/iterator_facade.hpp>
129 #include <glog/logging.h>
131 #include <folly/ConcurrentSkipList-inl.h>
132 #include <folly/Likely.h>
133 #include <folly/Memory.h>
134 #include <folly/SmallLocks.h>
139 typename Comp = std::less<T>,
140 // All nodes are allocated using provided SimpleAllocator,
141 // it should be thread-safe.
142 typename NodeAlloc = SysAlloc,
144 class ConcurrentSkipList {
145 // MAX_HEIGHT needs to be at least 2 to suppress compiler
146 // warnings/errors (Werror=uninitialized tiggered due to preds_[1]
147 // being treated as a scalar in the compiler).
148 static_assert(MAX_HEIGHT >= 2 && MAX_HEIGHT < 64,
149 "MAX_HEIGHT can only be in the range of [2, 64)");
150 typedef std::unique_lock<folly::MicroSpinLock> ScopedLocker;
151 typedef ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT> SkipListType;
154 typedef detail::SkipListNode<T> NodeType;
155 typedef T value_type;
158 typedef detail::csl_iterator<value_type, NodeType> iterator;
159 typedef detail::csl_iterator<const value_type, const NodeType> const_iterator;
164 explicit ConcurrentSkipList(int height, const NodeAlloc& alloc = NodeAlloc())
166 head_(NodeType::create(recycler_.alloc(), height, value_type(), true)),
169 // Convenient function to get an Accessor to a new instance.
170 static Accessor create(int height = 1, const NodeAlloc& alloc = NodeAlloc()) {
171 return Accessor(createInstance(height, alloc));
174 // Create a shared_ptr skiplist object with initial head height.
175 static std::shared_ptr<SkipListType> createInstance(
176 int height = 1, const NodeAlloc& alloc = NodeAlloc()) {
177 return std::make_shared<ConcurrentSkipList>(height, alloc);
180 //===================================================================
181 // Below are implementation details.
182 // Please see ConcurrentSkipList::Accessor for stdlib-like APIs.
183 //===================================================================
185 ~ConcurrentSkipList() {
186 /* static */ if (NodeType::template destroyIsNoOp<NodeAlloc>()) {
187 // Avoid traversing the list if using arena allocator.
190 for (NodeType* current = head_.load(std::memory_order_relaxed); current; ) {
191 NodeType* tmp = current->skip(0);
192 NodeType::destroy(recycler_.alloc(), current);
198 static bool greater(const value_type &data, const NodeType *node) {
199 return node && Comp()(node->data(), data);
202 static bool less(const value_type &data, const NodeType *node) {
203 return (node == nullptr) || Comp()(data, node->data());
206 static int findInsertionPoint(NodeType *cur, int cur_layer,
207 const value_type &data,
208 NodeType *preds[], NodeType *succs[]) {
210 NodeType *pred = cur;
211 NodeType *foundNode = nullptr;
212 for (int layer = cur_layer; layer >= 0; --layer) {
213 NodeType *node = pred->skip(layer);
214 while (greater(data, node)) {
216 node = node->skip(layer);
218 if (foundLayer == -1 && !less(data, node)) { // the two keys equal
224 // if found, succs[0..foundLayer] need to point to the cached foundNode,
225 // as foundNode might be deleted at the same time thus pred->skip() can
226 // return NULL or another node.
227 succs[layer] = foundNode ? foundNode : node;
232 size_t size() const { return size_.load(std::memory_order_relaxed); }
235 return head_.load(std::memory_order_consume)->height();
238 int maxLayer() const { return height() - 1; }
240 size_t incrementSize(int delta) {
241 return size_.fetch_add(delta, std::memory_order_relaxed) + delta;
244 // Returns the node if found, nullptr otherwise.
245 NodeType* find(const value_type &data) {
246 auto ret = findNode(data);
247 if (ret.second && !ret.first->markedForRemoval()) return ret.first;
251 // lock all the necessary nodes for changing (adding or removing) the list.
252 // returns true if all the lock acquried successfully and the related nodes
253 // are all validate (not in certain pending states), false otherwise.
254 bool lockNodesForChange(int nodeHeight,
255 ScopedLocker guards[MAX_HEIGHT],
256 NodeType *preds[MAX_HEIGHT],
257 NodeType *succs[MAX_HEIGHT],
259 NodeType *pred, *succ, *prevPred = nullptr;
261 for (int layer = 0; valid && layer < nodeHeight; ++layer) {
263 DCHECK(pred != nullptr) << "layer=" << layer << " height=" << height()
264 << " nodeheight=" << nodeHeight;
266 if (pred != prevPred) {
267 guards[layer] = pred->acquireGuard();
270 valid = !pred->markedForRemoval() &&
271 pred->skip(layer) == succ; // check again after locking
273 if (adding) { // when adding a node, the succ shouldn't be going away
274 valid = valid && (succ == nullptr || !succ->markedForRemoval());
281 // Returns a paired value:
282 // pair.first always stores the pointer to the node with the same input key.
283 // It could be either the newly added data, or the existed data in the
284 // list with the same key.
285 // pair.second stores whether the data is added successfully:
286 // 0 means not added, otherwise reutrns the new size.
288 std::pair<NodeType*, size_t> addOrGetData(U &&data) {
289 NodeType *preds[MAX_HEIGHT], *succs[MAX_HEIGHT];
294 int layer = findInsertionPointGetMaxLayer(data, preds, succs, &max_layer);
297 NodeType *nodeFound = succs[layer];
298 DCHECK(nodeFound != nullptr);
299 if (nodeFound->markedForRemoval()) {
300 continue; // if it's getting deleted retry finding node.
302 // wait until fully linked.
303 while (UNLIKELY(!nodeFound->fullyLinked())) {}
304 return std::make_pair(nodeFound, 0);
307 // need to capped at the original height -- the real height may have grown
308 int nodeHeight = detail::SkipListRandomHeight::instance()->
309 getHeight(max_layer + 1);
311 ScopedLocker guards[MAX_HEIGHT];
312 if (!lockNodesForChange(nodeHeight, guards, preds, succs)) {
313 continue; // give up the locks and retry until all valid
316 // locks acquired and all valid, need to modify the links under the locks.
318 NodeType::create(recycler_.alloc(), nodeHeight, std::forward<U>(data));
319 for (int layer = 0; layer < nodeHeight; ++layer) {
320 newNode->setSkip(layer, succs[layer]);
321 preds[layer]->setSkip(layer, newNode);
324 newNode->setFullyLinked();
325 newSize = incrementSize(1);
331 detail::SkipListRandomHeight::instance()->getSizeLimit(hgt);
333 if (hgt < MAX_HEIGHT && newSize > sizeLimit) {
336 CHECK_GT(newSize, 0);
337 return std::make_pair(newNode, newSize);
340 bool remove(const value_type &data) {
341 NodeType *nodeToDelete = nullptr;
342 ScopedLocker nodeGuard;
343 bool isMarked = false;
345 NodeType* preds[MAX_HEIGHT], *succs[MAX_HEIGHT];
349 int layer = findInsertionPointGetMaxLayer(data, preds, succs, &max_layer);
350 if (!isMarked && (layer < 0 || !okToDelete(succs[layer], layer))) {
355 nodeToDelete = succs[layer];
356 nodeHeight = nodeToDelete->height();
357 nodeGuard = nodeToDelete->acquireGuard();
358 if (nodeToDelete->markedForRemoval()) return false;
359 nodeToDelete->setMarkedForRemoval();
363 // acquire pred locks from bottom layer up
364 ScopedLocker guards[MAX_HEIGHT];
365 if (!lockNodesForChange(nodeHeight, guards, preds, succs, false)) {
366 continue; // this will unlock all the locks
369 for (int layer = nodeHeight - 1; layer >= 0; --layer) {
370 preds[layer]->setSkip(layer, nodeToDelete->skip(layer));
376 recycle(nodeToDelete);
380 const value_type *first() const {
381 auto node = head_.load(std::memory_order_consume)->skip(0);
382 return node ? &node->data() : nullptr;
385 const value_type *last() const {
386 NodeType *pred = head_.load(std::memory_order_consume);
387 NodeType *node = nullptr;
388 for (int layer = maxLayer(); layer >= 0; --layer) {
390 node = pred->skip(layer);
391 if (node) pred = node;
392 } while (node != nullptr);
394 return pred == head_.load(std::memory_order_relaxed)
395 ? nullptr : &pred->data();
398 static bool okToDelete(NodeType *candidate, int layer) {
399 DCHECK(candidate != nullptr);
400 return candidate->fullyLinked() &&
401 candidate->maxLayer() == layer &&
402 !candidate->markedForRemoval();
405 // find node for insertion/deleting
406 int findInsertionPointGetMaxLayer(const value_type &data,
407 NodeType *preds[], NodeType *succs[], int *max_layer) const {
408 *max_layer = maxLayer();
409 return findInsertionPoint(head_.load(std::memory_order_consume),
410 *max_layer, data, preds, succs);
413 // Find node for access. Returns a paired values:
414 // pair.first = the first node that no-less than data value
415 // pair.second = 1 when the data value is founded, or 0 otherwise.
416 // This is like lower_bound, but not exact: we could have the node marked for
417 // removal so still need to check that.
418 std::pair<NodeType*, int> findNode(const value_type &data) const {
419 return findNodeDownRight(data);
422 // Find node by first stepping down then stepping right. Based on benchmark
423 // results, this is slightly faster than findNodeRightDown for better
424 // localality on the skipping pointers.
425 std::pair<NodeType*, int> findNodeDownRight(const value_type &data) const {
426 NodeType *pred = head_.load(std::memory_order_consume);
427 int ht = pred->height();
428 NodeType *node = nullptr;
433 for (; ht > 0 && less(data, pred->skip(ht - 1)); --ht) {}
434 if (ht == 0) return std::make_pair(pred->skip(0), 0); // not found
436 node = pred->skip(--ht); // node <= data now
438 while (greater(data, node)) {
440 node = node->skip(ht);
442 found = !less(data, node);
444 return std::make_pair(node, found);
447 // find node by first stepping right then stepping down.
448 // We still keep this for reference purposes.
449 std::pair<NodeType*, int> findNodeRightDown(const value_type &data) const {
450 NodeType *pred = head_.load(std::memory_order_consume);
451 NodeType *node = nullptr;
452 auto top = maxLayer();
454 for (int layer = top; !found && layer >= 0; --layer) {
455 node = pred->skip(layer);
456 while (greater(data, node)) {
458 node = node->skip(layer);
460 found = !less(data, node);
462 return std::make_pair(node, found);
465 NodeType* lower_bound(const value_type &data) const {
466 auto node = findNode(data).first;
467 while (node != nullptr && node->markedForRemoval()) {
468 node = node->skip(0);
473 void growHeight(int height) {
474 NodeType* oldHead = head_.load(std::memory_order_consume);
475 if (oldHead->height() >= height) { // someone else already did this
480 NodeType::create(recycler_.alloc(), height, value_type(), true);
482 { // need to guard the head node in case others are adding/removing
483 // nodes linked to the head.
484 ScopedLocker g = oldHead->acquireGuard();
485 newHead->copyHead(oldHead);
486 NodeType* expected = oldHead;
487 if (!head_.compare_exchange_strong(expected, newHead,
488 std::memory_order_release)) {
489 // if someone has already done the swap, just return.
490 NodeType::destroy(recycler_.alloc(), newHead);
493 oldHead->setMarkedForRemoval();
498 void recycle(NodeType *node) {
502 detail::NodeRecycler<NodeType, NodeAlloc> recycler_;
503 std::atomic<NodeType*> head_;
504 std::atomic<size_t> size_;
507 template<typename T, typename Comp, typename NodeAlloc, int MAX_HEIGHT>
508 class ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT>::Accessor {
509 typedef detail::SkipListNode<T> NodeType;
510 typedef ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT> SkipListType;
512 typedef T value_type;
514 typedef T& reference;
516 typedef const T& const_reference;
517 typedef const T* const_pointer;
518 typedef size_t size_type;
519 typedef Comp key_compare;
520 typedef Comp value_compare;
522 typedef typename SkipListType::iterator iterator;
523 typedef typename SkipListType::const_iterator const_iterator;
524 typedef typename SkipListType::Skipper Skipper;
526 explicit Accessor(std::shared_ptr<ConcurrentSkipList> skip_list)
527 : slHolder_(std::move(skip_list))
529 sl_ = slHolder_.get();
530 DCHECK(sl_ != nullptr);
531 sl_->recycler_.addRef();
534 // Unsafe initializer: the caller assumes the responsibility to keep
535 // skip_list valid during the whole life cycle of the Acessor.
536 explicit Accessor(ConcurrentSkipList *skip_list) : sl_(skip_list) {
537 DCHECK(sl_ != nullptr);
538 sl_->recycler_.addRef();
541 Accessor(const Accessor &accessor) :
543 slHolder_(accessor.slHolder_) {
544 sl_->recycler_.addRef();
547 Accessor& operator=(const Accessor &accessor) {
548 if (this != &accessor) {
549 slHolder_ = accessor.slHolder_;
550 sl_->recycler_.releaseRef();
552 sl_->recycler_.addRef();
558 sl_->recycler_.releaseRef();
561 bool empty() const { return sl_->size() == 0; }
562 size_t size() const { return sl_->size(); }
563 size_type max_size() const { return std::numeric_limits<size_type>::max(); }
565 // returns end() if the value is not in the list, otherwise returns an
566 // iterator pointing to the data, and it's guaranteed that the data is valid
567 // as far as the Accessor is hold.
568 iterator find(const key_type &value) { return iterator(sl_->find(value)); }
569 const_iterator find(const key_type &value) const {
570 return iterator(sl_->find(value));
572 size_type count(const key_type &data) const { return contains(data); }
574 iterator begin() const {
575 NodeType* head = sl_->head_.load(std::memory_order_consume);
576 return iterator(head->next());
578 iterator end() const { return iterator(nullptr); }
579 const_iterator cbegin() const { return begin(); }
580 const_iterator cend() const { return end(); }
583 typename=typename std::enable_if<std::is_convertible<U, T>::value>::type>
584 std::pair<iterator, bool> insert(U&& data) {
585 auto ret = sl_->addOrGetData(std::forward<U>(data));
586 return std::make_pair(iterator(ret.first), ret.second);
588 size_t erase(const key_type &data) { return remove(data); }
590 iterator lower_bound(const key_type &data) const {
591 return iterator(sl_->lower_bound(data));
594 size_t height() const { return sl_->height(); }
596 // first() returns pointer to the first element in the skiplist, or
599 // last() returns the pointer to the last element in the skiplist,
600 // nullptr if list is empty.
602 // Note: As concurrent writing can happen, first() is not
603 // guaranteed to be the min_element() in the list. Similarly
604 // last() is not guaranteed to be the max_element(), and both of them can
605 // be invalid (i.e. nullptr), so we name them differently from front() and
607 const key_type *first() const { return sl_->first(); }
608 const key_type *last() const { return sl_->last(); }
610 // Try to remove the last element in the skip list.
612 // Returns true if we removed it, false if either the list is empty
613 // or a race condition happened (i.e. the used-to-be last element
614 // was already removed by another thread).
616 auto last = sl_->last();
617 return last ? sl_->remove(*last) : false;
620 std::pair<key_type*, bool> addOrGetData(const key_type &data) {
621 auto ret = sl_->addOrGetData(data);
622 return std::make_pair(&ret.first->data(), ret.second);
625 SkipListType* skiplist() const { return sl_; }
628 // TODO:(xliu) remove these.
629 // Returns true if the node is added successfully, false if not, i.e. the
630 // node with the same key already existed in the list.
631 bool contains(const key_type &data) const { return sl_->find(data); }
632 bool add(const key_type &data) { return sl_->addOrGetData(data).second; }
633 bool remove(const key_type &data) { return sl_->remove(data); }
637 std::shared_ptr<SkipListType> slHolder_;
640 // implements forward iterator concept.
641 template<typename ValT, typename NodeT>
642 class detail::csl_iterator :
643 public boost::iterator_facade<csl_iterator<ValT, NodeT>,
644 ValT, boost::forward_traversal_tag> {
646 typedef ValT value_type;
647 typedef value_type& reference;
648 typedef value_type* pointer;
649 typedef ptrdiff_t difference_type;
651 explicit csl_iterator(NodeT* node = nullptr) : node_(node) {}
653 template<typename OtherVal, typename OtherNode>
654 csl_iterator(const csl_iterator<OtherVal, OtherNode> &other,
655 typename std::enable_if<std::is_convertible<OtherVal, ValT>::value>::type*
656 = 0) : node_(other.node_) {}
658 size_t nodeSize() const {
659 return node_ == nullptr ? 0 :
660 node_->height() * sizeof(NodeT*) + sizeof(*this);
663 bool good() const { return node_ != nullptr; }
666 friend class boost::iterator_core_access;
667 template<class,class> friend class csl_iterator;
669 void increment() { node_ = node_->next(); };
670 bool equal(const csl_iterator& other) const { return node_ == other.node_; }
671 value_type& dereference() const { return node_->data(); }
677 template<typename T, typename Comp, typename NodeAlloc, int MAX_HEIGHT>
678 class ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT>::Skipper {
679 typedef detail::SkipListNode<T> NodeType;
680 typedef ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT> SkipListType;
681 typedef typename SkipListType::Accessor Accessor;
684 typedef T value_type;
685 typedef T& reference;
687 typedef ptrdiff_t difference_type;
689 Skipper(const std::shared_ptr<SkipListType>& skipList) :
690 accessor_(skipList) {
694 Skipper(const Accessor& accessor) : accessor_(accessor) {
699 // need to cache the head node
700 NodeType* head_node = head();
701 headHeight_ = head_node->height();
702 for (int i = 0; i < headHeight_; ++i) {
703 preds_[i] = head_node;
704 succs_[i] = head_node->skip(i);
706 int max_layer = maxLayer();
707 for (int i = 0; i < max_layer; ++i) {
710 hints_[max_layer] = max_layer;
713 // advance to the next node in the list.
714 Skipper& operator ++() {
715 preds_[0] = succs_[0];
716 succs_[0] = preds_[0]->skip(0);
717 int height = curHeight();
718 for (int i = 1; i < height && preds_[0] == succs_[i]; ++i) {
719 preds_[i] = succs_[i];
720 succs_[i] = preds_[i]->skip(i);
725 bool good() const { return succs_[0] != nullptr; }
727 int maxLayer() const { return headHeight_ - 1; }
729 int curHeight() const {
730 // need to cap the height to the cached head height, as the current node
731 // might be some newly inserted node and also during the time period the
732 // head height may have grown.
733 return succs_[0] ? std::min(headHeight_, succs_[0]->height()) : 0;
736 const value_type &data() const {
737 DCHECK(succs_[0] != nullptr);
738 return succs_[0]->data();
741 value_type &operator *() const {
742 DCHECK(succs_[0] != nullptr);
743 return succs_[0]->data();
746 value_type *operator->() {
747 DCHECK(succs_[0] != nullptr);
748 return &succs_[0]->data();
752 * Skip to the position whose data is no less than the parameter.
753 * (I.e. the lower_bound).
755 * Returns true if the data is found, false otherwise.
757 bool to(const value_type &data) {
758 int layer = curHeight() - 1;
759 if (layer < 0) return false; // reaches the end of the list
761 int lyr = hints_[layer];
762 int max_layer = maxLayer();
763 while (SkipListType::greater(data, succs_[lyr]) && lyr < max_layer) {
766 hints_[layer] = lyr; // update the hint
768 int foundLayer = SkipListType::
769 findInsertionPoint(preds_[lyr], lyr, data, preds_, succs_);
770 if (foundLayer < 0) return false;
772 DCHECK(succs_[0] != nullptr) << "lyr=" << lyr
773 << "; max_layer=" << max_layer;
774 return !succs_[0]->markedForRemoval();
778 NodeType* head() const {
779 return accessor_.skiplist()->head_.load(std::memory_order_consume);
784 NodeType *succs_[MAX_HEIGHT], *preds_[MAX_HEIGHT];
785 uint8_t hints_[MAX_HEIGHT];
790 #endif // FOLLY_CONCURRENT_SKIP_LIST_H_