2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 // @author: Xin Liu <xliux@fb.com>
19 // A concurrent skip list (CSL) implementation.
20 // Ref: http://www.cs.tau.ac.il/~shanir/nir-pubs-web/Papers/OPODIS2006-BA.pdf
24 This implements a sorted associative container that supports only
25 unique keys. (Similar to std::set.)
29 1. Small memory overhead: ~40% less memory overhead compared with
30 std::set (1.6 words per node versus 3). It has a minimum of 4
31 words (7 words if any nodes were deleted) per-list overhead
34 2. Read accesses (count, find iterator, skipper) are lock-free and
35 mostly wait-free (the only wait a reader may need to do is when
36 the node it is visiting is in a pending stage, i.e. deleting,
37 adding and not fully linked). Write accesses (remove, add) need
38 to acquire locks, but locks are local to the predecessor nodes
39 and/or successor nodes.
41 3. Good high contention performance, comparable single-thread
42 performance. In the multithreaded case (12 workers), CSL tested
43 10x faster than a RWSpinLocked std::set for an averaged sized
46 Comparable read performance to std::set when single threaded,
47 especially when the list size is large, and scales better to
48 larger lists: when the size is small, CSL can be 20-50% slower on
49 find()/contains(). As the size gets large (> 1M elements),
50 find()/contains() can be 30% faster.
52 Iterating through a skiplist is similar to iterating through a
53 linked list, thus is much (2-6x) faster than on a std::set
54 (tree-based). This is especially true for short lists due to
55 better cache locality. Based on that, it's also faster to
56 intersect two skiplists.
58 4. Lazy removal with GC support. The removed nodes get deleted when
59 the last Accessor to the skiplist is destroyed.
63 1. Write operations are usually 30% slower than std::set in a single
66 2. Need to have a head node for each list, which has a 4 word
69 3. When the list is quite small (< 1000 elements), single threaded
70 benchmarks show CSL can be 10x slower than std::set.
72 4. The interface requires using an Accessor to access the skiplist.
75 5. Currently x64 only, due to use of MicroSpinLock.
77 6. Freed nodes will not be reclaimed as long as there are ongoing
82 typedef ConcurrentSkipList<int> SkipListT;
83 shared_ptr<SkipListT> sl(SkipListT::createInstance(init_head_height));
85 // It's usually good practice to hold an accessor only during
86 // its necessary life cycle (but not in a tight loop as
87 // Accessor creation incurs ref-counting overhead).
89 // Holding it longer delays garbage-collecting the deleted
91 SkipListT::Accessor accessor(sl);
94 for (auto &elem : accessor) {
95 // use elem to access data
100 Another useful type is the Skipper accessor. This is useful if you
101 want to skip to locations in the way std::lower_bound() works,
102 i.e. it can be used for going through the list by skipping to the
103 node no less than a specified key. The Skipper keeps its location as
104 state, which makes it convenient for things like implementing
105 intersection of two sets efficiently, as it can start from the last
109 SkipListT::Accessor accessor(sl);
110 SkipListT::Skipper skipper(accessor);
113 CHECK_LE(30, *skipper);
116 // GC may happen when the accessor gets destructed.
126 #include <type_traits>
128 #include <boost/iterator/iterator_facade.hpp>
129 #include <glog/logging.h>
131 #include <folly/ConcurrentSkipList-inl.h>
132 #include <folly/Likely.h>
133 #include <folly/Memory.h>
134 #include <folly/MicroSpinLock.h>
140 typename Comp = std::less<T>,
141 // All nodes are allocated using provided SimpleAllocator,
142 // it should be thread-safe.
143 typename NodeAlloc = SysAlloc,
145 class ConcurrentSkipList {
146 // MAX_HEIGHT needs to be at least 2 to suppress compiler
147 // warnings/errors (Werror=uninitialized tiggered due to preds_[1]
148 // being treated as a scalar in the compiler).
149 static_assert(MAX_HEIGHT >= 2 && MAX_HEIGHT < 64,
150 "MAX_HEIGHT can only be in the range of [2, 64)");
151 typedef std::unique_lock<folly::MicroSpinLock> ScopedLocker;
152 typedef ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT> SkipListType;
155 typedef detail::SkipListNode<T> NodeType;
156 typedef T value_type;
159 typedef detail::csl_iterator<value_type, NodeType> iterator;
160 typedef detail::csl_iterator<const value_type, const NodeType> const_iterator;
165 explicit ConcurrentSkipList(int height, const NodeAlloc& alloc)
167 head_(NodeType::create(recycler_.alloc(), height, value_type(), true)),
170 explicit ConcurrentSkipList(int height)
172 head_(NodeType::create(recycler_.alloc(), height, value_type(), true)),
175 // Convenient function to get an Accessor to a new instance.
176 static Accessor create(int height, const NodeAlloc& alloc) {
177 return Accessor(createInstance(height, alloc));
180 static Accessor create(int height = 1) {
181 return Accessor(createInstance(height));
184 // Create a shared_ptr skiplist object with initial head height.
185 static std::shared_ptr<SkipListType> createInstance(int height,
186 const NodeAlloc& alloc) {
187 return std::make_shared<ConcurrentSkipList>(height, alloc);
190 static std::shared_ptr<SkipListType> createInstance(int height = 1) {
191 return std::make_shared<ConcurrentSkipList>(height);
194 //===================================================================
195 // Below are implementation details.
196 // Please see ConcurrentSkipList::Accessor for stdlib-like APIs.
197 //===================================================================
199 ~ConcurrentSkipList() {
200 /* static */ if (NodeType::template DestroyIsNoOp<NodeAlloc>::value) {
201 // Avoid traversing the list if using arena allocator.
204 for (NodeType* current = head_.load(std::memory_order_relaxed); current; ) {
205 NodeType* tmp = current->skip(0);
206 NodeType::destroy(recycler_.alloc(), current);
212 static bool greater(const value_type &data, const NodeType *node) {
213 return node && Comp()(node->data(), data);
216 static bool less(const value_type &data, const NodeType *node) {
217 return (node == nullptr) || Comp()(data, node->data());
220 static int findInsertionPoint(NodeType *cur, int cur_layer,
221 const value_type &data,
222 NodeType *preds[], NodeType *succs[]) {
224 NodeType *pred = cur;
225 NodeType *foundNode = nullptr;
226 for (int layer = cur_layer; layer >= 0; --layer) {
227 NodeType *node = pred->skip(layer);
228 while (greater(data, node)) {
230 node = node->skip(layer);
232 if (foundLayer == -1 && !less(data, node)) { // the two keys equal
238 // if found, succs[0..foundLayer] need to point to the cached foundNode,
239 // as foundNode might be deleted at the same time thus pred->skip() can
240 // return nullptr or another node.
241 succs[layer] = foundNode ? foundNode : node;
246 size_t size() const { return size_.load(std::memory_order_relaxed); }
249 return head_.load(std::memory_order_consume)->height();
252 int maxLayer() const { return height() - 1; }
254 size_t incrementSize(int delta) {
255 return size_.fetch_add(delta, std::memory_order_relaxed) + delta;
258 // Returns the node if found, nullptr otherwise.
259 NodeType* find(const value_type &data) {
260 auto ret = findNode(data);
261 if (ret.second && !ret.first->markedForRemoval()) return ret.first;
265 // lock all the necessary nodes for changing (adding or removing) the list.
266 // returns true if all the lock acquried successfully and the related nodes
267 // are all validate (not in certain pending states), false otherwise.
268 bool lockNodesForChange(int nodeHeight,
269 ScopedLocker guards[MAX_HEIGHT],
270 NodeType *preds[MAX_HEIGHT],
271 NodeType *succs[MAX_HEIGHT],
273 NodeType *pred, *succ, *prevPred = nullptr;
275 for (int layer = 0; valid && layer < nodeHeight; ++layer) {
277 DCHECK(pred != nullptr) << "layer=" << layer << " height=" << height()
278 << " nodeheight=" << nodeHeight;
280 if (pred != prevPred) {
281 guards[layer] = pred->acquireGuard();
284 valid = !pred->markedForRemoval() &&
285 pred->skip(layer) == succ; // check again after locking
287 if (adding) { // when adding a node, the succ shouldn't be going away
288 valid = valid && (succ == nullptr || !succ->markedForRemoval());
295 // Returns a paired value:
296 // pair.first always stores the pointer to the node with the same input key.
297 // It could be either the newly added data, or the existed data in the
298 // list with the same key.
299 // pair.second stores whether the data is added successfully:
300 // 0 means not added, otherwise reutrns the new size.
301 template <typename U>
302 std::pair<NodeType*, size_t> addOrGetData(U &&data) {
303 NodeType *preds[MAX_HEIGHT], *succs[MAX_HEIGHT];
308 int layer = findInsertionPointGetMaxLayer(data, preds, succs, &max_layer);
311 NodeType *nodeFound = succs[layer];
312 DCHECK(nodeFound != nullptr);
313 if (nodeFound->markedForRemoval()) {
314 continue; // if it's getting deleted retry finding node.
316 // wait until fully linked.
317 while (UNLIKELY(!nodeFound->fullyLinked())) {}
318 return std::make_pair(nodeFound, 0);
321 // need to capped at the original height -- the real height may have grown
322 int nodeHeight = detail::SkipListRandomHeight::instance()->
323 getHeight(max_layer + 1);
325 ScopedLocker guards[MAX_HEIGHT];
326 if (!lockNodesForChange(nodeHeight, guards, preds, succs)) {
327 continue; // give up the locks and retry until all valid
330 // locks acquired and all valid, need to modify the links under the locks.
332 NodeType::create(recycler_.alloc(), nodeHeight, std::forward<U>(data));
333 for (int k = 0; k < nodeHeight; ++k) {
334 newNode->setSkip(k, succs[k]);
335 preds[k]->setSkip(k, newNode);
338 newNode->setFullyLinked();
339 newSize = incrementSize(1);
345 detail::SkipListRandomHeight::instance()->getSizeLimit(hgt);
347 if (hgt < MAX_HEIGHT && newSize > sizeLimit) {
350 CHECK_GT(newSize, 0);
351 return std::make_pair(newNode, newSize);
354 bool remove(const value_type &data) {
355 NodeType *nodeToDelete = nullptr;
356 ScopedLocker nodeGuard;
357 bool isMarked = false;
359 NodeType* preds[MAX_HEIGHT], *succs[MAX_HEIGHT];
363 int layer = findInsertionPointGetMaxLayer(data, preds, succs, &max_layer);
364 if (!isMarked && (layer < 0 || !okToDelete(succs[layer], layer))) {
369 nodeToDelete = succs[layer];
370 nodeHeight = nodeToDelete->height();
371 nodeGuard = nodeToDelete->acquireGuard();
372 if (nodeToDelete->markedForRemoval()) return false;
373 nodeToDelete->setMarkedForRemoval();
377 // acquire pred locks from bottom layer up
378 ScopedLocker guards[MAX_HEIGHT];
379 if (!lockNodesForChange(nodeHeight, guards, preds, succs, false)) {
380 continue; // this will unlock all the locks
383 for (int k = nodeHeight - 1; k >= 0; --k) {
384 preds[k]->setSkip(k, nodeToDelete->skip(k));
390 recycle(nodeToDelete);
394 const value_type *first() const {
395 auto node = head_.load(std::memory_order_consume)->skip(0);
396 return node ? &node->data() : nullptr;
399 const value_type *last() const {
400 NodeType *pred = head_.load(std::memory_order_consume);
401 NodeType *node = nullptr;
402 for (int layer = maxLayer(); layer >= 0; --layer) {
404 node = pred->skip(layer);
405 if (node) pred = node;
406 } while (node != nullptr);
408 return pred == head_.load(std::memory_order_relaxed)
409 ? nullptr : &pred->data();
412 static bool okToDelete(NodeType *candidate, int layer) {
413 DCHECK(candidate != nullptr);
414 return candidate->fullyLinked() &&
415 candidate->maxLayer() == layer &&
416 !candidate->markedForRemoval();
419 // find node for insertion/deleting
420 int findInsertionPointGetMaxLayer(const value_type &data,
421 NodeType *preds[], NodeType *succs[], int *max_layer) const {
422 *max_layer = maxLayer();
423 return findInsertionPoint(head_.load(std::memory_order_consume),
424 *max_layer, data, preds, succs);
427 // Find node for access. Returns a paired values:
428 // pair.first = the first node that no-less than data value
429 // pair.second = 1 when the data value is founded, or 0 otherwise.
430 // This is like lower_bound, but not exact: we could have the node marked for
431 // removal so still need to check that.
432 std::pair<NodeType*, int> findNode(const value_type &data) const {
433 return findNodeDownRight(data);
436 // Find node by first stepping down then stepping right. Based on benchmark
437 // results, this is slightly faster than findNodeRightDown for better
438 // localality on the skipping pointers.
439 std::pair<NodeType*, int> findNodeDownRight(const value_type &data) const {
440 NodeType *pred = head_.load(std::memory_order_consume);
441 int ht = pred->height();
442 NodeType *node = nullptr;
447 for (; ht > 0 && less(data, node = pred->skip(ht - 1)); --ht) {}
448 if (ht == 0) return std::make_pair(node, 0); // not found
449 // node <= data now, but we need to fix up ht
453 while (greater(data, node)) {
455 node = node->skip(ht);
457 found = !less(data, node);
459 return std::make_pair(node, found);
462 // find node by first stepping right then stepping down.
463 // We still keep this for reference purposes.
464 std::pair<NodeType*, int> findNodeRightDown(const value_type &data) const {
465 NodeType *pred = head_.load(std::memory_order_consume);
466 NodeType *node = nullptr;
467 auto top = maxLayer();
469 for (int layer = top; !found && layer >= 0; --layer) {
470 node = pred->skip(layer);
471 while (greater(data, node)) {
473 node = node->skip(layer);
475 found = !less(data, node);
477 return std::make_pair(node, found);
480 NodeType* lower_bound(const value_type &data) const {
481 auto node = findNode(data).first;
482 while (node != nullptr && node->markedForRemoval()) {
483 node = node->skip(0);
488 void growHeight(int height) {
489 NodeType* oldHead = head_.load(std::memory_order_consume);
490 if (oldHead->height() >= height) { // someone else already did this
495 NodeType::create(recycler_.alloc(), height, value_type(), true);
497 { // need to guard the head node in case others are adding/removing
498 // nodes linked to the head.
499 ScopedLocker g = oldHead->acquireGuard();
500 newHead->copyHead(oldHead);
501 NodeType* expected = oldHead;
502 if (!head_.compare_exchange_strong(expected, newHead,
503 std::memory_order_release)) {
504 // if someone has already done the swap, just return.
505 NodeType::destroy(recycler_.alloc(), newHead);
508 oldHead->setMarkedForRemoval();
513 void recycle(NodeType *node) {
517 detail::NodeRecycler<NodeType, NodeAlloc> recycler_;
518 std::atomic<NodeType*> head_;
519 std::atomic<size_t> size_;
522 template <typename T, typename Comp, typename NodeAlloc, int MAX_HEIGHT>
523 class ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT>::Accessor {
524 typedef detail::SkipListNode<T> NodeType;
525 typedef ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT> SkipListType;
527 typedef T value_type;
529 typedef T& reference;
531 typedef const T& const_reference;
532 typedef const T* const_pointer;
533 typedef size_t size_type;
534 typedef Comp key_compare;
535 typedef Comp value_compare;
537 typedef typename SkipListType::iterator iterator;
538 typedef typename SkipListType::const_iterator const_iterator;
539 typedef typename SkipListType::Skipper Skipper;
541 explicit Accessor(std::shared_ptr<ConcurrentSkipList> skip_list)
542 : slHolder_(std::move(skip_list))
544 sl_ = slHolder_.get();
545 DCHECK(sl_ != nullptr);
546 sl_->recycler_.addRef();
549 // Unsafe initializer: the caller assumes the responsibility to keep
550 // skip_list valid during the whole life cycle of the Acessor.
551 explicit Accessor(ConcurrentSkipList *skip_list) : sl_(skip_list) {
552 DCHECK(sl_ != nullptr);
553 sl_->recycler_.addRef();
556 Accessor(const Accessor &accessor) :
558 slHolder_(accessor.slHolder_) {
559 sl_->recycler_.addRef();
562 Accessor& operator=(const Accessor &accessor) {
563 if (this != &accessor) {
564 slHolder_ = accessor.slHolder_;
565 sl_->recycler_.releaseRef();
567 sl_->recycler_.addRef();
573 sl_->recycler_.releaseRef();
576 bool empty() const { return sl_->size() == 0; }
577 size_t size() const { return sl_->size(); }
578 size_type max_size() const { return std::numeric_limits<size_type>::max(); }
580 // returns end() if the value is not in the list, otherwise returns an
581 // iterator pointing to the data, and it's guaranteed that the data is valid
582 // as far as the Accessor is hold.
583 iterator find(const key_type &value) { return iterator(sl_->find(value)); }
584 const_iterator find(const key_type &value) const {
585 return iterator(sl_->find(value));
587 size_type count(const key_type &data) const { return contains(data); }
589 iterator begin() const {
590 NodeType* head = sl_->head_.load(std::memory_order_consume);
591 return iterator(head->next());
593 iterator end() const { return iterator(nullptr); }
594 const_iterator cbegin() const { return begin(); }
595 const_iterator cend() const { return end(); }
600 typename std::enable_if<std::is_convertible<U, T>::value>::type>
601 std::pair<iterator, bool> insert(U&& data) {
602 auto ret = sl_->addOrGetData(std::forward<U>(data));
603 return std::make_pair(iterator(ret.first), ret.second);
605 size_t erase(const key_type &data) { return remove(data); }
607 iterator lower_bound(const key_type &data) const {
608 return iterator(sl_->lower_bound(data));
611 size_t height() const { return sl_->height(); }
613 // first() returns pointer to the first element in the skiplist, or
616 // last() returns the pointer to the last element in the skiplist,
617 // nullptr if list is empty.
619 // Note: As concurrent writing can happen, first() is not
620 // guaranteed to be the min_element() in the list. Similarly
621 // last() is not guaranteed to be the max_element(), and both of them can
622 // be invalid (i.e. nullptr), so we name them differently from front() and
624 const key_type *first() const { return sl_->first(); }
625 const key_type *last() const { return sl_->last(); }
627 // Try to remove the last element in the skip list.
629 // Returns true if we removed it, false if either the list is empty
630 // or a race condition happened (i.e. the used-to-be last element
631 // was already removed by another thread).
633 auto last = sl_->last();
634 return last ? sl_->remove(*last) : false;
637 std::pair<key_type*, bool> addOrGetData(const key_type &data) {
638 auto ret = sl_->addOrGetData(data);
639 return std::make_pair(&ret.first->data(), ret.second);
642 SkipListType* skiplist() const { return sl_; }
645 // TODO:(xliu) remove these.
646 // Returns true if the node is added successfully, false if not, i.e. the
647 // node with the same key already existed in the list.
648 bool contains(const key_type &data) const { return sl_->find(data); }
649 bool add(const key_type &data) { return sl_->addOrGetData(data).second; }
650 bool remove(const key_type &data) { return sl_->remove(data); }
654 std::shared_ptr<SkipListType> slHolder_;
657 // implements forward iterator concept.
658 template <typename ValT, typename NodeT>
659 class detail::csl_iterator :
660 public boost::iterator_facade<csl_iterator<ValT, NodeT>,
661 ValT, boost::forward_traversal_tag> {
663 typedef ValT value_type;
664 typedef value_type& reference;
665 typedef value_type* pointer;
666 typedef ptrdiff_t difference_type;
668 explicit csl_iterator(NodeT* node = nullptr) : node_(node) {}
670 template <typename OtherVal, typename OtherNode>
671 csl_iterator(const csl_iterator<OtherVal, OtherNode> &other,
672 typename std::enable_if<std::is_convertible<OtherVal, ValT>::value>::type*
673 = 0) : node_(other.node_) {}
675 size_t nodeSize() const {
676 return node_ == nullptr ? 0 :
677 node_->height() * sizeof(NodeT*) + sizeof(*this);
680 bool good() const { return node_ != nullptr; }
683 friend class boost::iterator_core_access;
684 template <class, class> friend class csl_iterator;
686 void increment() { node_ = node_->next(); }
687 bool equal(const csl_iterator& other) const { return node_ == other.node_; }
688 value_type& dereference() const { return node_->data(); }
694 template <typename T, typename Comp, typename NodeAlloc, int MAX_HEIGHT>
695 class ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT>::Skipper {
696 typedef detail::SkipListNode<T> NodeType;
697 typedef ConcurrentSkipList<T, Comp, NodeAlloc, MAX_HEIGHT> SkipListType;
698 typedef typename SkipListType::Accessor Accessor;
701 typedef T value_type;
702 typedef T& reference;
704 typedef ptrdiff_t difference_type;
706 Skipper(const std::shared_ptr<SkipListType>& skipList) :
707 accessor_(skipList) {
711 Skipper(const Accessor& accessor) : accessor_(accessor) {
716 // need to cache the head node
717 NodeType* head_node = head();
718 headHeight_ = head_node->height();
719 for (int i = 0; i < headHeight_; ++i) {
720 preds_[i] = head_node;
721 succs_[i] = head_node->skip(i);
723 int max_layer = maxLayer();
724 for (int i = 0; i < max_layer; ++i) {
725 hints_[i] = uint8_t(i + 1);
727 hints_[max_layer] = max_layer;
730 // advance to the next node in the list.
731 Skipper& operator ++() {
732 preds_[0] = succs_[0];
733 succs_[0] = preds_[0]->skip(0);
734 int height = curHeight();
735 for (int i = 1; i < height && preds_[0] == succs_[i]; ++i) {
736 preds_[i] = succs_[i];
737 succs_[i] = preds_[i]->skip(i);
742 bool good() const { return succs_[0] != nullptr; }
744 int maxLayer() const { return headHeight_ - 1; }
746 int curHeight() const {
747 // need to cap the height to the cached head height, as the current node
748 // might be some newly inserted node and also during the time period the
749 // head height may have grown.
750 return succs_[0] ? std::min(headHeight_, succs_[0]->height()) : 0;
753 const value_type &data() const {
754 DCHECK(succs_[0] != nullptr);
755 return succs_[0]->data();
758 value_type &operator *() const {
759 DCHECK(succs_[0] != nullptr);
760 return succs_[0]->data();
763 value_type *operator->() {
764 DCHECK(succs_[0] != nullptr);
765 return &succs_[0]->data();
769 * Skip to the position whose data is no less than the parameter.
770 * (I.e. the lower_bound).
772 * Returns true if the data is found, false otherwise.
774 bool to(const value_type &data) {
775 int layer = curHeight() - 1;
776 if (layer < 0) return false; // reaches the end of the list
778 int lyr = hints_[layer];
779 int max_layer = maxLayer();
780 while (SkipListType::greater(data, succs_[lyr]) && lyr < max_layer) {
783 hints_[layer] = lyr; // update the hint
785 int foundLayer = SkipListType::
786 findInsertionPoint(preds_[lyr], lyr, data, preds_, succs_);
787 if (foundLayer < 0) return false;
789 DCHECK(succs_[0] != nullptr) << "lyr=" << lyr
790 << "; max_layer=" << max_layer;
791 return !succs_[0]->markedForRemoval();
795 NodeType* head() const {
796 return accessor_.skiplist()->head_.load(std::memory_order_consume);
801 NodeType *succs_[MAX_HEIGHT], *preds_[MAX_HEIGHT];
802 uint8_t hints_[MAX_HEIGHT];