18 #include "ndb_type_traits.h"
26 #include "small_vector.h"
27 #include "ownership_checker.h"
// u64manip: policy-parameterized manipulation of a 64-bit node-version word.
// The policy P supplies the HDR_* masks and Is*() predicates (see
// btree_version_manip below). Only the two partial specializations are
// defined: plain uint64_t (single-threaded) and std::atomic<uint64_t>.
// NOTE(review): this excerpt elides some original lines (braces, a few
// statements); the code shown is unchanged from the listing.
30 template <typename T, typename P> struct u64manip;
// Single-threaded specialization: no atomicity or fences are needed, so
// Load/Store are plain reads/writes and Lock simply sets the lock bit.
32 struct u64manip<uint64_t, P> {
33 static inline uint64_t Load(uint64_t t) { return t; }
34 static inline void Store(uint64_t &t, uint64_t v) { t = v; }
35 static inline uint64_t
// Lock(): asserts the word is not already locked, then sets the lock bit.
38 #ifdef CHECK_INVARIANTS
39 INVARIANT(!P::IsLocked(t));
40 t |= P::HDR_LOCKED_MASK;
45 static inline uint64_t
46 LockWithSpinCount(uint64_t &t, unsigned &spins)
// Single-threaded lock can never contend; presumably reports zero spins
// (the assignment to `spins` is in an elided line — TODO confirm).
48 const uint64_t ret = Lock(t);
// Unlock(): clears both the locked and modifying bits in one step.
55 #ifdef CHECK_INVARIANTS
56 INVARIANT(P::IsLocked(t));
57 t &= ~(P::HDR_LOCKED_MASK | P::HDR_MODIFYING_MASK);
60 static inline uint64_t
61 StableVersion(uint64_t t)
// With one thread, a version can never be observed mid-modification.
63 INVARIANT(!P::IsModifying(t));
// CheckVersion(): versions match iff equal modulo the lock bit.
67 CheckVersion(uint64_t t, uint64_t stablev)
69 INVARIANT(!P::IsModifying(stablev));
70 INVARIANT((t & ~P::HDR_LOCKED_MASK) == (stablev & ~P::HDR_LOCKED_MASK));
// Concurrent specialization: the version word is a std::atomic<uint64_t>.
// Loads are acquire, stores are release, and locking is a CAS spinloop with
// optional exponential backoff (SPINLOCK_BACKOFF).
// NOTE(review): several original lines are elided from this excerpt.
75 struct u64manip<std::atomic<uint64_t>, P> {
76 static inline uint64_t
77 Load(const std::atomic<uint64_t> &t)
79 return t.load(std::memory_order_acquire);
82 Store(std::atomic<uint64_t> &t, uint64_t v)
84 t.store(v, std::memory_order_release);
86 static inline uint64_t
87 Lock(std::atomic<uint64_t> &t)
89 #ifdef SPINLOCK_BACKOFF
90 uint64_t backoff_shift = 0;
// Spin while the lock bit is observed set, or until our CAS to set it
// succeeds. On CAS failure, compare_exchange_strong refreshes v with the
// currently observed value.
93 while ((v & P::HDR_LOCKED_MASK) ||
94 !t.compare_exchange_strong(v, v | P::HDR_LOCKED_MASK)) {
95 #ifdef SPINLOCK_BACKOFF
// Exponential backoff: double the spin budget each retry, capped at 2^63.
96 if (backoff_shift < 63)
98 uint64_t spins = (1UL << backoff_shift) * BACKOFF_SPINS_FACTOR;
108 COMPILER_MEMORY_FENCE;
111 static inline uint64_t
// Same acquisition loop as Lock(), but also reports how many spin
// iterations were needed (used for the lock-acquire event counters below).
112 LockWithSpinCount(std::atomic<uint64_t> &t, unsigned &spins)
114 #ifdef SPINLOCK_BACKOFF
115 uint64_t backoff_shift = 0;
118 uint64_t v = Load(t);
119 while ((v & P::HDR_LOCKED_MASK) ||
120 !t.compare_exchange_strong(v, v | P::HDR_LOCKED_MASK)) {
121 #ifdef SPINLOCK_BACKOFF
122 if (backoff_shift < 63)
124 uint64_t backoff_spins = (1UL << backoff_shift) * BACKOFF_SPINS_FACTOR;
125 while (backoff_spins) {
135 COMPILER_MEMORY_FENCE;
// Unlock(): if this critical section modified or is deleting the node,
// bump the version counter so concurrent readers' CheckVersion() fails;
// then clear the locked+modifying bits and publish.
139 Unlock(std::atomic<uint64_t> &v)
141 INVARIANT(P::IsLocked(v));
142 const uint64_t oldh = Load(v);
145 if ((h & P::HDR_MODIFYING_MASK) ||
146 (h & P::HDR_DELETING_MASK)) {
// Increment the version field in place (wraps within HDR_VERSION_MASK).
148 const uint64_t n = (h & P::HDR_VERSION_MASK) >> P::HDR_VERSION_SHIFT;
149 h &= ~P::HDR_VERSION_MASK;
150 h |= (((n + 1) << P::HDR_VERSION_SHIFT) & P::HDR_VERSION_MASK);
152 // clear locked + modifying bits
153 h &= ~(P::HDR_LOCKED_MASK | P::HDR_MODIFYING_MASK);
// Sanity: the new header must be version-distinct from the old one.
155 INVARIANT(!CheckVersion(oldh, h));
156 INVARIANT(!(h & P::HDR_LOCKED_MASK));
157 INVARIANT(!(h & P::HDR_MODIFYING_MASK));
158 COMPILER_MEMORY_FENCE;
161 static inline uint64_t
// StableVersion(): spin-read until the modifying bit is clear; the value
// returned may still have the locked bit set (lock bit is ignored by
// CheckVersion).
162 StableVersion(const std::atomic<uint64_t> &t)
164 uint64_t v = Load(t);
165 while ((v & P::HDR_MODIFYING_MASK)) {
169 COMPILER_MEMORY_FENCE;
// CheckVersion(): true iff t and stablev agree modulo the lock bit.
173 CheckVersion(uint64_t t, uint64_t stablev)
175 INVARIANT(!(stablev & P::HDR_MODIFYING_MASK));
176 COMPILER_MEMORY_FENCE;
177 return (t & ~P::HDR_LOCKED_MASK) ==
178 (stablev & ~P::HDR_LOCKED_MASK);
184 * manipulates a btree version
186 * hdr bits: layout is (actual bytes depend on the NKeysPerNode parameter)
189 * [type | key_slots_used | locked | is_root | modifying | deleting | version ]
190 * [0:1 | 1:5 | 5:6 | 6:7 | 7:8 | 8:9 | 9:64 ]
193 * 1) modifying => locked
194 * 2) deleting => locked
196 * WARNING: the correctness of our concurrency scheme relies on being able
197 * to do memory reads/writes from/to hdr atomically. x86 architectures
198 * guarantee that aligned writes are atomic (see intel spec)
// btree_version_manip: defines the header-word bit layout (type, key-slot
// count, locked, is_root, modifying, deleting, version — see the layout
// comment above) and exposes typed accessors/mutators over it, delegating
// the concurrency-sensitive operations to the matching u64manip
// specialization. NKeysPerNode determines how many bits the key-slot-count
// field needs. NOTE(review): some original lines are elided in this excerpt.
200 template <typename VersionType, unsigned NKeysPerNode>
201 class btree_version_manip {
204 typename private_::typeutil<VersionType>::func_param_type
210 btree_version_manip<VersionType, NKeysPerNode>>
// LowMask(nbits): a mask of the low `nbits` bits.
// NOTE(review): shifts by 64 would be UB; presumably nbits < 64 always
// holds for the HDR_* field widths used below — confirm.
213 static inline constexpr uint64_t
214 LowMask(uint64_t nbits)
216 return (1UL << nbits) - 1UL;
// Field layout: each field's shift is the previous field's shift + width,
// so the fields tile the 64-bit word with no gaps or overlaps (checked by
// the static_asserts below).
221 static const uint64_t HDR_TYPE_BITS = 1;
222 static const uint64_t HDR_TYPE_MASK = 0x1;
224 static const uint64_t HDR_KEY_SLOTS_SHIFT = HDR_TYPE_BITS;
225 static const uint64_t HDR_KEY_SLOTS_BITS = ceil_log2_const(NKeysPerNode);
226 static const uint64_t HDR_KEY_SLOTS_MASK = LowMask(HDR_KEY_SLOTS_BITS) << HDR_KEY_SLOTS_SHIFT;
228 static const uint64_t HDR_LOCKED_SHIFT = HDR_KEY_SLOTS_SHIFT + HDR_KEY_SLOTS_BITS;
229 static const uint64_t HDR_LOCKED_BITS = 1;
230 static const uint64_t HDR_LOCKED_MASK = LowMask(HDR_LOCKED_BITS) << HDR_LOCKED_SHIFT;
232 static const uint64_t HDR_IS_ROOT_SHIFT = HDR_LOCKED_SHIFT + HDR_LOCKED_BITS;
233 static const uint64_t HDR_IS_ROOT_BITS = 1;
234 static const uint64_t HDR_IS_ROOT_MASK = LowMask(HDR_IS_ROOT_BITS) << HDR_IS_ROOT_SHIFT;
236 static const uint64_t HDR_MODIFYING_SHIFT = HDR_IS_ROOT_SHIFT + HDR_IS_ROOT_BITS;
237 static const uint64_t HDR_MODIFYING_BITS = 1;
238 static const uint64_t HDR_MODIFYING_MASK = LowMask(HDR_MODIFYING_BITS) << HDR_MODIFYING_SHIFT;
240 static const uint64_t HDR_DELETING_SHIFT = HDR_MODIFYING_SHIFT + HDR_MODIFYING_BITS;
241 static const uint64_t HDR_DELETING_BITS = 1;
242 static const uint64_t HDR_DELETING_MASK = LowMask(HDR_DELETING_BITS) << HDR_DELETING_SHIFT;
// The version counter occupies all remaining high bits.
244 static const uint64_t HDR_VERSION_SHIFT = HDR_DELETING_SHIFT + HDR_DELETING_BITS;
245 static const uint64_t HDR_VERSION_MASK = ((uint64_t)-1) << HDR_VERSION_SHIFT;
248 static_assert(NKeysPerNode >= 1, "XX");
// All masks together must cover exactly the full 64-bit word...
250 static_assert(std::numeric_limits<uint64_t>::max() == (
// ...and adjacent fields must be pairwise disjoint.
260 static_assert( !(HDR_TYPE_MASK & HDR_KEY_SLOTS_MASK) , "XX");
261 static_assert( !(HDR_KEY_SLOTS_MASK & HDR_LOCKED_MASK) , "XX");
262 static_assert( !(HDR_LOCKED_MASK & HDR_IS_ROOT_MASK) , "XX");
263 static_assert( !(HDR_IS_ROOT_MASK & HDR_MODIFYING_MASK) , "XX");
264 static_assert( !(HDR_MODIFYING_MASK & HDR_DELETING_MASK) , "XX");
265 static_assert( !(HDR_DELETING_MASK & HDR_VERSION_MASK) , "XX");
// --- read-side accessors: all go through U64Manip::Load ---
269 static inline uint64_t
270 Load(LoadVersionType v)
272 return U64Manip::Load(v);
276 Store(VersionType &t, uint64_t v)
278 U64Manip::Store(t, v);
// Type bit: 0 = leaf, non-zero = internal node.
284 IsLeafNode(LoadVersionType v)
286 return (Load(v) & HDR_TYPE_MASK) == 0;
290 IsInternalNode(LoadVersionType v)
292 return !IsLeafNode(v);
296 KeySlotsUsed(LoadVersionType v)
298 return (Load(v) & HDR_KEY_SLOTS_MASK) >> HDR_KEY_SLOTS_SHIFT;
302 IsLocked(LoadVersionType v)
304 return (Load(v) & HDR_LOCKED_MASK);
308 IsRoot(LoadVersionType v)
310 return (Load(v) & HDR_IS_ROOT_MASK);
314 IsModifying(LoadVersionType v)
316 return (Load(v) & HDR_MODIFYING_MASK);
320 IsDeleting(LoadVersionType v)
322 return (Load(v) & HDR_DELETING_MASK);
325 static inline uint64_t
326 Version(LoadVersionType v)
328 return (Load(v) & HDR_VERSION_MASK) >> HDR_VERSION_SHIFT;
// Debug helper: human-readable rendering of a header word.
332 VersionInfoStr(LoadVersionType v)
334 std::ostringstream buf;
343 buf << KeySlotsUsed(v) << " | ";
// --- mutators: all require the caller to already hold the lock and (for
// slot-count changes) to have set the modifying bit ---
378 SetKeySlotsUsed(VersionType &v, size_t n)
380 INVARIANT(n <= NKeysPerNode);
381 INVARIANT(IsModifying(v));
382 uint64_t h = Load(v);
383 h &= ~HDR_KEY_SLOTS_MASK;
384 h |= (n << HDR_KEY_SLOTS_SHIFT);
389 IncKeySlotsUsed(VersionType &v)
391 SetKeySlotsUsed(v, KeySlotsUsed(v) + 1);
395 DecKeySlotsUsed(VersionType &v)
397 INVARIANT(KeySlotsUsed(v) > 0);
398 SetKeySlotsUsed(v, KeySlotsUsed(v) - 1);
402 SetRoot(VersionType &v)
404 INVARIANT(IsLocked(v));
405 INVARIANT(!IsRoot(v));
406 uint64_t h = Load(v);
407 h |= HDR_IS_ROOT_MASK;
412 ClearRoot(VersionType &v)
414 INVARIANT(IsLocked(v));
415 INVARIANT(IsRoot(v));
416 uint64_t h = Load(v);
417 h &= ~HDR_IS_ROOT_MASK;
// MarkModifying: readers' CheckVersion will fail after Unlock bumps the
// version; MarkDeleting additionally marks the node for reclamation.
422 MarkModifying(VersionType &v)
424 INVARIANT(IsLocked(v));
425 INVARIANT(!IsModifying(v));
426 uint64_t h = Load(v);
427 h |= HDR_MODIFYING_MASK;
432 MarkDeleting(VersionType &v)
434 INVARIANT(IsLocked(v));
435 INVARIANT(!IsDeleting(v));
436 uint64_t h = Load(v);
437 h |= HDR_DELETING_MASK;
441 // concurrency control
// These forward to the u64manip specialization chosen by VersionType.
443 static inline uint64_t
444 StableVersion(LoadVersionType v)
446 return U64Manip::StableVersion(v);
449 static inline uint64_t
450 UnstableVersion(LoadVersionType v)
456 CheckVersion(LoadVersionType v, uint64_t stablev)
458 return U64Manip::CheckVersion(Load(v), stablev);
461 static inline uint64_t
464 return U64Manip::Lock(v);
467 static inline uint64_t
468 LockWithSpinCount(VersionType &v, unsigned &spins)
470 return U64Manip::LockWithSpinCount(v, spins);
// Compile-time btree configuration. base_btree_config fixes the fanout and
// whether the caller is responsible for RCU region management; the two
// derived traits select the version-word representation (atomic for
// concurrent use, plain uint64_t for single-threaded use).
480 struct base_btree_config {
481 static const unsigned int NKeysPerNode = 15;
482 static const bool RcuRespCaller = true;
485 struct concurrent_btree_traits : public base_btree_config {
486 typedef std::atomic<uint64_t> VersionType;
489 struct single_threaded_btree_traits : public base_btree_config {
490 typedef uint64_t VersionType;
494 * A concurrent, variable key length b+-tree, optimized for read heavy
497 * This b+-tree maps uninterpreted binary strings (key_type) of arbitrary
498 * length to a single pointer (value_type). Binary string values are copied
499 * into the b+-tree, so the caller does not have to worry about preserving
500 * memory for key values.
502 * This b+-tree does not manage the memory pointed to by value_type. The
503 * pointer is treated completely opaquely.
505 * So far, this b+-tree has only been tested on 64-bit intel x86 processors.
506 * Its correctness, as it is implemented (not conceptually), requires the
507 * semantics of total store order (TSO) for correctness. To fix this, we would
508 * change compiler fences into actual memory fences, at the very least.
// btree<P>: the tree template itself; P is one of the *_btree_traits above.
// NOTE(review): the class-head line itself is elided from this excerpt.
510 template <typename P>
512 template <template <typename> class, typename>
513 friend class base_txn_btree;
// Public vocabulary types: keys are opaque byte strings (varkey views /
// std::string copies); values are opaque pointers the tree never derefs.
515 typedef varkey key_type;
516 typedef std::string string_type;
517 typedef uint64_t key_slice;
518 typedef uint8_t* value_type;
519 typedef typename std::conditional<!P::RcuRespCaller,
521 disabled_rcu_region>::type rcu_region;
523 // public to assist in testing
524 static const unsigned int NKeysPerNode = P::NKeysPerNode;
525 static const unsigned int NMinKeysPerNode = P::NKeysPerNode / 2;
// key_search_ret = (matching index or -1, number of slots used at the time
// of the search) — see key_search()/key_lower_bound_search() below.
529 typedef std::pair<ssize_t, size_t> key_search_ret;
// VersionManip operates on the traits' VersionType; RawVersionManip
// (plain uint64_t) is for already-loaded header snapshots.
532 btree_version_manip<typename P::VersionType, NKeysPerNode>
535 btree_version_manip<uint64_t, NKeysPerNode>
// node header word; all state bits live here (see layout comment above).
540 typename P::VersionType hdr_;
542 #ifdef BTREE_LOCK_OWNERSHIP_CHECKING
543 std::thread::id lock_owner_;
544 #endif /* BTREE_LOCK_OWNERSHIP_CHECKING */
547 * Keys are assumed to be stored in contiguous sorted order, so that all
548 * the used slots are grouped together. That is, elems in positions
549 * [0, key_slots_used) are valid, and elems in positions
550 * [key_slots_used, NKeysPerNode) are empty
552 key_slice keys_[NKeysPerNode];
// node members: thin wrappers that apply VersionManip to this node's hdr_.
// NOTE(review): several signatures/braces are elided from this excerpt.
556 #ifdef BTREE_LOCK_OWNERSHIP_CHECKING
// (dtor, presumably) a node must be unlocked and marked deleting before it
// is destroyed — TODO confirm against the elided signature.
562 INVARIANT(!is_locked());
563 INVARIANT(is_deleting());
569 return VersionManip::IsLeafNode(hdr_);
573 is_internal_node() const
575 return !is_leaf_node();
579 key_slots_used() const
581 return VersionManip::KeySlotsUsed(hdr_);
585 set_key_slots_used(size_t n)
587 VersionManip::SetKeySlotsUsed(hdr_, n);
593 VersionManip::IncKeySlotsUsed(hdr_);
599 VersionManip::DecKeySlotsUsed(hdr_);
605 return VersionManip::IsLocked(hdr_);
608 #ifdef BTREE_LOCK_OWNERSHIP_CHECKING
// Debug-only ownership query; without the checking build it presumably
// returns a constant (body elided).
610 is_lock_owner() const
612 return std::this_thread::get_id() == lock_owner_;
616 is_lock_owner() const
620 #endif /* BTREE_LOCK_OWNERSHIP_CHECKING */
// lock(): optionally records average spin counts per node kind via
// function-local static event counters.
625 #ifdef ENABLE_EVENT_COUNTERS
626 static event_avg_counter
627 evt_avg_btree_leaf_node_lock_acquire_spins(
628 util::cxx_typename<btree<P>>::value() +
629 std::string("_avg_btree_leaf_node_lock_acquire_spins"));
630 static event_avg_counter
631 evt_avg_btree_internal_node_lock_acquire_spins(
632 util::cxx_typename<btree<P>>::value() +
633 std::string("_avg_btree_internal_node_lock_acquire_spins"));
635 const uint64_t ret = VersionManip::LockWithSpinCount(hdr_, spins);
637 evt_avg_btree_leaf_node_lock_acquire_spins.offer(spins);
639 evt_avg_btree_internal_node_lock_acquire_spins.offer(spins);
641 const uint64_t ret = VersionManip::Lock(hdr_);
643 #ifdef BTREE_LOCK_OWNERSHIP_CHECKING
644 lock_owner_ = std::this_thread::get_id();
645 AddNodeToLockRegion(this);
646 INVARIANT(is_lock_owner());
// unlock(): clear the debug owner id before releasing the version lock.
654 #ifdef BTREE_LOCK_OWNERSHIP_CHECKING
655 lock_owner_ = std::thread::id();
656 INVARIANT(!is_lock_owner());
658 VersionManip::Unlock(hdr_);
664 return VersionManip::IsRoot(hdr_);
670 VersionManip::SetRoot(hdr_);
676 VersionManip::ClearRoot(hdr_);
682 return VersionManip::IsModifying(hdr_);
688 VersionManip::MarkModifying(hdr_);
694 return VersionManip::IsDeleting(hdr_);
700 VersionManip::MarkDeleting(hdr_);
704 unstable_version() const
706 return VersionManip::UnstableVersion(hdr_);
710 * spin until we get a version which is not modifying (but can be locked)
713 stable_version() const
715 return VersionManip::StableVersion(hdr_);
719 check_version(uint64_t version) const
721 return VersionManip::CheckVersion(hdr_, version);
725 version_info_str() const
727 return VersionManip::VersionInfoStr(hdr_);
730 void base_invariant_unique_keys_check() const;
732 // [min_key, max_key)
734 base_invariant_checker(const key_slice *min_key,
735 const key_slice *max_key,
738 /** manually simulated virtual function (so we don't make node virtual) */
740 invariant_checker(const key_slice *min_key,
741 const key_slice *max_key,
742 const node *left_sibling,
743 const node *right_sibling,
746 /** another manually simulated virtual function */
747 inline void prefetch() const;
// leaf_node: holds up to NKeysPerNode (key slice, value-or-next-layer)
// pairs plus per-slot length/type bytes; key bytes beyond the 8-byte slice
// live in the lazily-allocated suffixes_ array.
// NOTE(review): several lines (braces, member decls) are elided here.
750 struct leaf_node : public node {
751 union value_or_node_ptr {
756 key_slice min_key_; // really is min_key's key slice
757 value_or_node_ptr values_[NKeysPerNode];
// per-slot metadata byte:
760 // [ slice_length | type | unused ]
761 // [ 0:4 | 4:5 | 5:8 ]
762 uint8_t lengths_[NKeysPerNode];
767 // starts out empty- once set, doesn't get freed until dtor (even if all
768 // keys w/ suffixes get removed)
771 inline ALWAYS_INLINE varkey
772 suffix(size_t i) const
774 return suffixes_ ? varkey(suffixes_[i]) : varkey();
777 //static event_counter g_evt_suffixes_array_created;
// (alloc_suffixes, presumably) allocates the suffix array exactly once;
// requires the modifying bit so readers revalidate.
782 INVARIANT(this->is_modifying());
783 INVARIANT(!suffixes_);
784 suffixes_ = new imstring[NKeysPerNode];
785 //++g_evt_suffixes_array_created;
791 INVARIANT(this->is_modifying());
// lengths_ field masks: low nibble = slice length, bit 4 = "value is a
// next-layer pointer" flag.
799 static const uint64_t LEN_LEN_MASK = 0xf;
801 static const uint64_t LEN_TYPE_SHIFT = 4;
802 static const uint64_t LEN_TYPE_MASK = 0x1 << LEN_TYPE_SHIFT;
807 #ifdef BTREE_NODE_PREFETCH
808 prefetch_object(this);
813 keyslice_length(size_t n) const
815 INVARIANT(n < NKeysPerNode);
816 return lengths_[n] & LEN_LEN_MASK;
820 keyslice_set_length(size_t n, size_t len, bool layer)
822 INVARIANT(n < NKeysPerNode);
823 INVARIANT(this->is_modifying());
// a "layer" slot must carry the sentinel length 9 (full slice + more).
825 INVARIANT(!layer || len == 9);
826 lengths_[n] = (len | (layer ? LEN_TYPE_MASK : 0));
830 value_is_layer(size_t n) const
832 INVARIANT(n < NKeysPerNode);
833 return lengths_[n] & LEN_TYPE_MASK;
837 value_set_layer(size_t n)
839 INVARIANT(n < NKeysPerNode);
840 INVARIANT(this->is_modifying());
841 INVARIANT(keyslice_length(n) == 9);
842 INVARIANT(!value_is_layer(n));
843 lengths_[n] |= LEN_TYPE_MASK;
847 * keys[key_search(k).first] == k if key_search(k).first != -1
848 * key does not exist otherwise. considers key length also
// Binary search over (slice, length) pairs ordered lexicographically.
850 inline key_search_ret
851 key_search(key_slice k, size_t len) const
853 size_t n = this->key_slots_used();
856 while (lower < upper) {
857 ssize_t i = (lower + upper) / 2;
858 key_slice k0 = this->keys_[i];
859 size_t len0 = this->keyslice_length(i);
860 if (k0 < k || (k0 == k && len0 < len))
862 else if (k0 == k && len0 == len)
863 return key_search_ret(i, n);
867 return key_search_ret(-1, n);
871 * tightest lower bound key, -1 if no such key exists. operates only
872 * on key slices (internal nodes have unique key slices)
874 inline key_search_ret
875 key_lower_bound_search(key_slice k, size_t len) const
878 size_t n = this->key_slots_used();
881 while (lower < upper) {
882 ssize_t i = (lower + upper) / 2;
883 key_slice k0 = this->keys_[i];
884 size_t len0 = this->keyslice_length(i);
885 if (k0 < k || (k0 == k && len0 < len)) {
888 } else if (k0 == k && len0 == len) {
889 return key_search_ret(i, n);
894 return key_search_ret(ret, n);
898 invariant_checker_impl(const key_slice *min_key,
899 const key_slice *max_key,
900 const node *left_sibling,
901 const node *right_sibling,
// Allocation goes through the RCU allocator so freed nodes are only
// reclaimed after concurrent readers have quiesced.
904 static inline leaf_node*
907 void * const p = rcu::s_instance.alloc(LeafNodeAllocSize);
909 return new (p) leaf_node;
915 leaf_node *n = (leaf_node *) p;
916 INVARIANT(n->is_deleting());
917 INVARIANT(!n->is_locked());
919 rcu::s_instance.dealloc(p, LeafNodeAllocSize);
// release(): defer destruction until the RCU grace period ends.
923 release(leaf_node *n)
928 rcu::s_instance.free_with_fn(n, deleter);
// internal_node: routing node; children_[i] covers [keys[i-1], keys[i]).
// NOTE(review): several lines are elided from this excerpt.
933 struct internal_node : public node {
935 * child at position child_idx is responsible for keys
936 * [keys[child_idx - 1], keys[child_idx])
938 * in the case where child_idx == 0 or child_idx == this->key_slots_used(), then
939 * the responsibility value of the min/max key, respectively, is determined
942 node *children_[NKeysPerNode + 1];
950 #ifdef BTREE_NODE_PREFETCH
951 prefetch_object(this);
956 * keys[key_search(k).first] == k if key_search(k).first != -1
957 * key does not exist otherwise. operates only on key slices
958 * (internal nodes have unique key slices)
// Plain binary search on unique key slices (no lengths at this level).
960 inline key_search_ret
961 key_search(key_slice k) const
963 size_t n = this->key_slots_used();
966 while (lower < upper) {
967 ssize_t i = (lower + upper) / 2;
968 key_slice k0 = this->keys_[i];
970 return key_search_ret(i, n);
976 return key_search_ret(-1, n);
980 * tightest lower bound key, -1 if no such key exists. operates only
981 * on key slices (internal nodes have unique key slices)
983 inline key_search_ret
984 key_lower_bound_search(key_slice k) const
987 size_t n = this->key_slots_used();
990 while (lower < upper) {
991 ssize_t i = (lower + upper) / 2;
992 key_slice k0 = this->keys_[i];
994 return key_search_ret(i, n);
1002 return key_search_ret(ret, n);
1006 invariant_checker_impl(const key_slice *min_key,
1007 const key_slice *max_key,
1008 const node *left_sibling,
1009 const node *right_sibling,
1010 bool is_root) const;
1012 // XXX: alloc(), deleter(), and release() are copied from leaf_node-
1013 // we should templatize them to avoid code duplication
1015 static inline internal_node*
1018 void * const p = rcu::s_instance.alloc(InternalNodeAllocSize);
1020 return new (p) internal_node;
1026 internal_node *n = (internal_node *) p;
1027 INVARIANT(n->is_deleting());
1028 INVARIANT(!n->is_locked());
1029 n->~internal_node();
1030 rcu::s_instance.dealloc(p, InternalNodeAllocSize);
// release(): RCU-deferred destruction, as with leaf_node::release().
1034 release(internal_node *n)
1039 rcu::s_instance.free_with_fn(n, deleter);
// Debug lock-region bookkeeping (compiled in only with
// BTREE_LOCK_OWNERSHIP_CHECKING): delegates to the ownership_checker
// utility keyed by this btree instantiation.
1044 #ifdef BTREE_LOCK_OWNERSHIP_CHECKING
1047 NodeLockRegionBegin()
1049 ownership_checker<btree<P>, typename btree<P>::node>::NodeLockRegionBegin();
1052 AssertAllNodeLocksReleased()
1054 ownership_checker<btree<P>, typename btree<P>::node>::AssertAllNodeLocksReleased();
1058 AddNodeToLockRegion(const node *n)
1060 ownership_checker<btree<P>, typename btree<P>::node>::AddNodeToLockRegion(n);
// Node allocation sizes: optionally rounded up to a cache-line multiple.
1064 #ifdef BTREE_NODE_ALLOC_CACHE_ALIGNED
1065 static const size_t LeafNodeAllocSize = util::round_up<size_t, LG_CACHELINE_SIZE>(sizeof(leaf_node));
1066 static const size_t InternalNodeAllocSize = util::round_up<size_t, LG_CACHELINE_SIZE>(sizeof(internal_node));
1068 static const size_t LeafNodeAllocSize = sizeof(leaf_node);
1069 static const size_t InternalNodeAllocSize = sizeof(internal_node);
// Checked downcasts: As* assert the node type (debug builds only);
// As*Check return NULL when the node is absent or the wrong type. The
// uint64_t-v overloads test a previously loaded header snapshot instead of
// re-reading hdr_, so they stay consistent with an optimistic read.
1072 static inline leaf_node*
1075 INVARIANT(!n || n->is_leaf_node());
1076 return static_cast<leaf_node *>(n);
1079 static inline const leaf_node*
1080 AsLeaf(const node *n)
1082 return AsLeaf(const_cast<node *>(n));
1085 static inline internal_node*
1088 INVARIANT(!n || n->is_internal_node());
1089 return static_cast<internal_node *>(n);
1092 static inline const internal_node*
1093 AsInternal(const node *n)
1095 return AsInternal(const_cast<node *>(n));
1098 static inline leaf_node *
1099 AsLeafCheck(node *n, uint64_t v)
1101 return likely(n) && RawVersionManip::IsLeafNode(v) ?
1102 static_cast<leaf_node *>(n) : NULL;
1105 static inline leaf_node*
1106 AsLeafCheck(node *n)
1108 return likely(n) && n->is_leaf_node() ?
1109 static_cast<leaf_node *>(n) : NULL;
1112 static inline const leaf_node*
1113 AsLeafCheck(const node *n, uint64_t v)
1115 return AsLeafCheck(const_cast<node *>(n), v);
1118 static inline const leaf_node*
1119 AsLeafCheck(const node *n)
1121 return AsLeafCheck(const_cast<node *>(n));
1124 static inline internal_node*
1125 AsInternalCheck(node *n)
1127 return likely(n) && n->is_internal_node() ? static_cast<internal_node *>(n) : NULL;
1130 static inline const internal_node*
1131 AsInternalCheck(const node *n)
1133 return AsInternalCheck(const_cast<node *>(n));
// UnlockNodes/UnlockAndReturn: bulk-release every lock taken during an
// insert/remove attempt; UnlockAndReturn lets callers release-and-return
// a status in one expression.
1137 UnlockNodes(typename util::vec<node *>::type &locked_nodes)
1139 for (auto it = locked_nodes.begin(); it != locked_nodes.end(); ++it)
1141 locked_nodes.clear();
1144 template <typename T>
1146 UnlockAndReturn(typename util::vec<node *>::type &locked_nodes, T t)
1148 UnlockNodes(locked_nodes);
1153 CheckVersion(uint64_t a, uint64_t b)
1155 return VersionManip::CheckVersion(a, b);
1159 * Is not thread safe, and does not use RCU to free memory
1161 * Should only be called when there are no outstanding operations on
1162 * any nodes reachable from n
1164 static void recursive_delete(node *n);
// Tree root. volatile-qualified pointer; swapped when the root splits.
// NOTE(review): volatile is not a synchronization primitive — presumably
// the version protocol provides the actual ordering guarantees.
1166 node *volatile root_;
1170 // XXX(stephentu): trying out a very opaque node API for now
1171 typedef struct node node_opaque_t;
1172 typedef std::pair< const node_opaque_t *, uint64_t > versioned_node_t;
// insert_info_t: reports which node an insert touched and its version
// transition, for callers doing optimistic validation.
1173 struct insert_info_t {
1174 const node_opaque_t* node;
1175 uint64_t old_version;
1176 uint64_t new_version;
// ctor: the tree starts as a single empty leaf root.
1179 btree() : root_(leaf_node::alloc())
1182 NKeysPerNode > (sizeof(key_slice) + 2), "XX"); // so we can always do a split
// the key-slot-count field must be wide enough to represent NKeysPerNode.
1185 (VersionManip::HDR_KEY_SLOTS_MASK >> VersionManip::HDR_KEY_SLOTS_SHIFT), "XX");
1187 #ifdef CHECK_INVARIANTS
1193 #endif /* CHECK_INVARIANTS */
1198 // NOTE: it is assumed on deletion time there are no
1199 // outstanding requests to the btree, so deletion proceeds
1200 // in a non-threadsafe manner
1201 recursive_delete(root_);
// (clear, presumably): drop all contents and reset to one empty leaf.
1211 recursive_delete(root_);
1212 root_ = leaf_node::alloc();
1213 #ifdef CHECK_INVARIANTS
1219 #endif /* CHECK_INVARIANTS */
1222 /** Note: invariant checking is not thread safe */
1224 invariant_checker() const
1226 root_->invariant_checker(NULL, NULL, NULL, NULL, true);
1229 /** NOTE: the public interface assumes that the caller has taken care
1230 * of setting up RCU */
// search(): point lookup; fills v on hit, optionally reports the
// (node, version) it read from via search_info.
1233 search(const key_type &k, value_type &v,
1234 versioned_node_t *search_info = nullptr) const
1237 typename util::vec<leaf_node *>::type ns;
1238 return search_impl(k, v, ns, search_info);
1242 * The low level callback interface is as follows:
1244 * Consider a scan in the range [a, b):
1245 * 1) on_resp_node() is called at least once per node which
1246 * has a responsibility range that overlaps with the scan range
1247 * 2) invoke() is called per <k, v>-pair such that k is in [a, b)
1249 * The order of calling on_resp_node() and invoke() is up to the implementation.
1251 class low_level_search_range_callback {
1253 virtual ~low_level_search_range_callback() {}
1256 * This node lies within the search range (at version v)
1258 virtual void on_resp_node(const node_opaque_t *n, uint64_t version) = 0;
1261 * This key/value pair was read from node n @ version
1263 virtual bool invoke(const string_type &k, value_type v,
1264 const node_opaque_t *n, uint64_t version) = 0;
1268 * A higher level interface if you don't care about node and version numbers
1270 class search_range_callback : public low_level_search_range_callback {
// Discards node/version details and forwards to the simpler invoke().
1273 on_resp_node(const node_opaque_t *n, uint64_t version)
1278 invoke(const string_type &k, value_type v,
1279 const node_opaque_t *n, uint64_t version)
1281 return invoke(k, v);
1284 virtual bool invoke(const string_type &k, value_type v) = 0;
// Adapts any functor with bool operator()(k, v) to the callback interface.
1288 template <typename T>
1289 class type_callback_wrapper : public search_range_callback {
1291 type_callback_wrapper(T *callback) : callback_(callback) {}
1293 invoke(const string_type &k, value_type v)
1295 return callback_->operator()(k, v);
// leaf_kvinfo: snapshot of one leaf entry taken during a range scan; keeps
// both host- and big-endian forms of the slice so entries sort bytewise.
1301 struct leaf_kvinfo {
1302 key_slice key_; // in host endian
1303 key_slice key_big_endian_;
1304 typename leaf_node::value_or_node_ptr vn_;
1308 leaf_kvinfo() {} // for STL
1309 leaf_kvinfo(key_slice key,
1310 typename leaf_node::value_or_node_ptr vn,
1313 const varkey &suffix)
1314 : key_(key), key_big_endian_(util::big_endian_trfm<key_slice>()(key)),
1315 vn_(vn), layer_(layer), length_(length), suffix_(suffix)
// (keyslice, presumably) exposes the big-endian bytes for comparison.
1321 return (const char *) &key_big_endian_;
// Scan worker for a single key-prefix layer; prefix accumulates the bytes
// consumed by enclosing layers.
1325 bool search_range_at_layer(leaf_node *leaf,
1326 string_type &prefix,
1327 const key_type &lower,
1329 const key_type *upper,
1330 low_level_search_range_callback &callback) const;
1335 * For all keys in [lower, *upper), invoke callback in ascending order.
1336 * If upper is NULL, then there is no upper bound
1339 * This function by default provides a weakly consistent view of the b-tree. For
1340 * instance, consider the following tree, where n = 3 is the max number of
1348 * [A|B|C]<->[D|E|F]<->[G|H|I]
1350 * Suppose we want to scan [A, inf), so we traverse to the leftmost leaf node
1351 * and start a left-to-right walk. Suppose we have emitted keys A, B, and C,
1352 * and we are now just about to scan the middle leaf node. Now suppose
1353 * another thread concurrently does delete(A), followed by a delete(H). Now
1354 * the scanning thread resumes and emits keys D, E, F, G, and I, omitting H
1355 * because H was deleted. This is an inconsistent view of the b-tree, since
1356 * the scanning thread has observed the deletion of H but did not observe the
1357 * deletion of A, but we know that delete(A) happens before delete(H).
1359 * The weakly consistent guarantee provided is the following: all keys
1360 * which, at the time of invocation, are known to exist in the btree
1361 * will be discovered on a scan (provided the key falls within the scan's range),
1362 * and provided there are no concurrent modifications/removals of that key
1364 * Note that scans within a single node are consistent
1366 * XXX: add other modes which provide better consistency:
1368 * B) optimistic validation mode
1370 * the last string parameter is an optional string buffer to use:
1371 * if null, a stack allocated string will be used. if not null, must
1373 * A) buf->empty() at the beginning
1374 * B) no concurrent mutation of string
1375 * note that string contents upon return are arbitrary
1378 search_range_call(const key_type &lower,
1379 const key_type *upper,
1380 low_level_search_range_callback &callback,
1381 string_type *buf = nullptr) const;
// Reverse scan is declared but not implemented.
1385 rsearch_range_call(const key_type &upper,
1386 const key_type *lower,
1387 low_level_search_range_callback &callback,
1388 std::string *buf = nullptr) const
1390 NDB_UNIMPLEMENTED("rsearch_range_call");
1394 * Callback is expected to implement bool operator()(key_slice k, value_type v),
1395 * where the callback returns true if it wants to keep going, false otherwise
// Convenience wrapper: adapts a functor and forwards to search_range_call.
1398 template <typename T>
1400 search_range(const key_type &lower,
1401 const key_type *upper,
1403 string_type *buf = nullptr) const
1405 type_callback_wrapper<T> w(&callback);
1406 search_range_call(lower, upper, w, buf);
1409 template <typename F>
1411 rsearch_range(const key_type &upper,
1412 const key_type *lower,
1414 std::string *buf = nullptr) const
1416 NDB_UNIMPLEMENTED("rsearch_range");
1420 * returns true if key k did not already exist, false otherwise
1421 * If k exists with a different mapping, still returns false
1423 * If false and old_v is not NULL, then the overwritten value of v
1424 * is written into old_v
1427 insert(const key_type &k, value_type v,
1428 value_type *old_v = NULL,
1429 insert_info_t *insert_info = NULL)
1432 return insert_stable_location((node **) &root_, k, v, false, old_v, insert_info);
1436 * Only puts k=>v if k does not exist in map. returns true
1437 * if k inserted, false otherwise (k exists already)
1440 insert_if_absent(const key_type &k, value_type v,
1441 insert_info_t *insert_info = NULL)
1444 return insert_stable_location((node **) &root_, k, v, true, NULL, insert_info);
1448 * return true if a value was removed, false otherwise.
1450 * if true and old_v is not NULL, then the removed value of v
1451 * is written into old_v
1454 remove(const key_type &k, value_type *old_v = NULL)
1457 return remove_stable_location((node **) &root_, k, old_v);
// Private workers: retry the operation until the root pointer read at the
// start is still the root (a "stable location") — see definitions in the
// corresponding .cc/inline file.
1462 insert_stable_location(node **root_location, const key_type &k, value_type v,
1463 bool only_if_absent, value_type *old_v,
1464 insert_info_t *insert_info);
1467 remove_stable_location(node **root_location, const key_type &k, value_type *old_v);
1472 * The tree walk API is a bit strange, due to the optimistic nature of the
1475 * The way it works is that, on_node_begin() is first called. In
1476 * on_node_begin(), a callback function should read (but not modify) the
1477 * values it is interested in, and save them.
1479 * Then, either one of on_node_success() or on_node_failure() is called. If
1480 * on_node_success() is called, then the previous values read in
1481 * on_node_begin() are indeed valid. If on_node_failure() is called, then
1482 * the previous values are not valid and should be discarded.
1484 class tree_walk_callback {
1486 virtual ~tree_walk_callback() {}
1487 virtual void on_node_begin(const node_opaque_t *n) = 0;
1488 virtual void on_node_success() = 0;
1489 virtual void on_node_failure() = 0;
1492 void tree_walk(tree_walk_callback &callback) const;
// size_walk_callback: counts keys via the optimistic walk; spec_size_ is
// presumably the speculative count, committed into size_ on success.
1495 class size_walk_callback : public tree_walk_callback {
1497 size_walk_callback() : spec_size_(0), size_(0) {}
1498 virtual void on_node_begin(const node_opaque_t *n);
1499 virtual void on_node_success();
1500 virtual void on_node_failure();
1501 inline size_t get_size() const { return size_; }
1509 * Is thread-safe, but not really designed to perform well with concurrent
1510 * modifications. also the value returned is not consistent given concurrent
// (size(), presumably) runs a full tree walk with a size_walk_callback.
1516 size_walk_callback c;
1518 return c.get_size();
1521 static inline uint64_t
1522 ExtractVersionNumber(const node_opaque_t *n)
1524 // XXX(stephentu): I think we must use stable_version() for
1525 // correctness, but I am not 100% sure. It's definitely correct to use it,
1526 // but maybe we can get away with unstable_version()?
1527 return RawVersionManip::Version(n->stable_version());
1530 // [value, has_suffix]
1531 static std::vector< std::pair<value_type, bool> >
1532 ExtractValues(const node_opaque_t *n);
1538 * Not well defined if n is being concurrently modified, just for debugging
1541 NodeStringify(const node_opaque_t *n);
// Node-size reporters (testing/diagnostics).
1543 static inline size_t
1546 return sizeof(internal_node);
1549 static inline size_t
1552 return sizeof(leaf_node);
1558 * Move the array slice from [p, n) to the right by k positions, occupying [p + k, n + k),
1559 * leaving the values of array[p] to array[p + k -1] undefined. Has no effect if p >= n
1561 * Note: Assumes that array[n] is valid memory.
// Copies back-to-front so overlapping source/destination ranges are safe.
1563 template <typename T>
1564 static inline ALWAYS_INLINE void
1565 sift_right(T *array, size_t p, size_t n, size_t k = 1)
1569 for (size_t i = n + k - 1; i > p + k - 1; i--)
1570 array[i] = array[i - k];
1573 // use swap() to do moves, for efficiency- avoid this
1574 // variant when using primitive arrays
1575 template <typename T>
1576 static inline ALWAYS_INLINE void
1577 sift_swap_right(T *array, size_t p, size_t n, size_t k = 1)
1581 for (size_t i = n + k - 1; i > p + k - 1; i--)
1582 array[i].swap(array[i - k]);
1586 * Move the array slice from [p + k, n) to the left by k positions, occupying [p, n - k),
1587 * overwriting array[p]..array[p+k-1] Has no effect if p + k >= n
// Copies front-to-back, which is the safe direction for a left shift.
1589 template <typename T>
1590 static inline ALWAYS_INLINE void
1591 sift_left(T *array, size_t p, size_t n, size_t k = 1)
1593 if (unlikely(p + k >= n))
1595 for (size_t i = p; i < n - k; i++)
1596 array[i] = array[i + k];
1599 template <typename T>
1600 static inline ALWAYS_INLINE void
1601 sift_swap_left(T *array, size_t p, size_t n, size_t k = 1)
1603 if (unlikely(p + k >= n))
1605 for (size_t i = p; i < n - k; i++)
1606 array[i].swap(array[i + k]);
1610 * Copy [p, n) from source into dest. Has no effect if p >= n
1612 template <typename T>
1613 static inline ALWAYS_INLINE void
1614 copy_into(T *dest, T *source, size_t p, size_t n)
1616 for (size_t i = p; i < n; i++)
1617 *dest++ = source[i];
// swap_with: like copy_into but exchanges elements via swap() — used for
// types (e.g. imstring) where swapping is cheaper than copying.
1620 template <typename T>
1621 static inline ALWAYS_INLINE void
1622 swap_with(T *dest, T *source, size_t p, size_t n)
1624 for (size_t i = p; i < n; i++)
1625 (dest++)->swap(source[i]);
1628 leaf_node *leftmost_descend_layer(node *n) const;
1631 * Assumes RCU region scope is held
1633 bool search_impl(const key_type &k, value_type &v,
1634 typename util::vec<leaf_node *>::type &leaf_nodes,
1635 versioned_node_t *search_info = nullptr) const;
1639 leaf_node *leaf, uint64_t kslice, uint64_t &version);
1642 * traverses the lower leaf levels for a leaf node resp for kslice such that
1643 * version is stable and not deleting. resp info is given via idxmatch +
1646 * if idxmatch != -1, then ignore idxlowerbound
1648 * note to actually use the info, you still need to validate it (the info is
1649 * tentative as of the version)
1652 FindRespLeafLowerBound(
1653 leaf_node *leaf, uint64_t kslice,
1654 size_t kslicelen, uint64_t &version,
1655 size_t &n, ssize_t &idxmatch, ssize_t &idxlowerbound);
1659 leaf_node *leaf, uint64_t kslice,
1660 size_t kslicelen, uint64_t &version,
1661 size_t &n, ssize_t &idxmatch);
1663 typedef std::pair<node *, uint64_t> insert_parent_entry;
1665 enum insert_status {
1666 I_NONE_NOMOD, // no nodes split nor modified
1667 I_NONE_MOD, // no nodes split, but modified
1669 I_SPLIT, // node(s) split
1673 * insert k=>v into node n. if this insert into n causes it to split into two
1674 * nodes, return the new node (upper half of keys). in this case, min_key is set to the
1675 * smallest key that the new node is responsible for. otherwise return null, in which
1676 * case min_key's value is not defined.
1678 * NOTE: our implementation of insert0() is not as efficient as possible, in favor
1685 bool only_if_absent,
1687 insert_info_t *insert_info,
1690 typename util::vec<insert_parent_entry>::type &parents,
1691 typename util::vec<node *>::type &locked_nodes);
1693 enum remove_status {
// Remove the entry at index `pos` from `leaf`, which currently holds `n`
// keys: close the hole in each parallel array and decrement the count.
// NOTE(review): this chunk is an elided view — the opening brace, the
// #endif lines closing both CHECK_INVARIANTS regions, and the closing
// brace of the if-block live on lines not shown here. Comments only;
// code left byte-identical.
1704 inline ALWAYS_INLINE void
1705 remove_pos_from_leaf_node(leaf_node *leaf, size_t pos, size_t n)
1707 INVARIANT(leaf->key_slots_used() == n);
// If the slot holds a nested layer (sub-tree root) rather than a plain
// value, that child must be an empty leaf: mark it deleting and release.
1709 if (leaf->value_is_layer(pos)) {
1710 #ifdef CHECK_INVARIANTS
// Lock taken only in invariant-checking builds, so mark_deleting()'s
// debug assertions see a held lock.
1711 leaf->values_[pos].n_->lock();
1713 leaf->values_[pos].n_->mark_deleting();
1714 INVARIANT(leaf->values_[pos].n_->is_leaf_node());
1715 INVARIANT(leaf->values_[pos].n_->key_slots_used() == 0);
// Safe to release: the layer leaf is empty (asserted just above).
1716 leaf_node::release((leaf_node *) leaf->values_[pos].n_);
1717 #ifdef CHECK_INVARIANTS
1718 leaf->values_[pos].n_->unlock();
// Shift the tails of the parallel arrays left by one to fill the hole.
1721 sift_left(leaf->keys_, pos, n);
1722 sift_left(leaf->values_, pos, n);
1723 sift_left(leaf->lengths_, pos, n);
1724 if (leaf->suffixes_)
// Suffixes are moved with swap() rather than copied, per the
// sift_swap_left efficiency note above.
1725 sift_swap_left(leaf->suffixes_, pos, n);
1726 leaf->dec_key_slots_used();
// Remove the key at `key_pos` and the child pointer at `child_pos` from
// an internal node currently holding `n` keys (hence n + 1 children).
// NOTE(review): elided view — the function's braces live on lines not
// shown here. Comments only; code left byte-identical.
1729 inline ALWAYS_INLINE void
1730 remove_pos_from_internal_node(
1731 internal_node *internal, size_t key_pos, size_t child_pos, size_t n)
1733 INVARIANT(internal->key_slots_used() == n);
1734 INVARIANT(key_pos < n);
// An internal node with n keys has n + 1 children, so child_pos <= n.
1735 INVARIANT(child_pos < n + 1);
1736 sift_left(internal->keys_, key_pos, n);
// Children array is one longer than the key array.
1737 sift_left(internal->children_, child_pos, n + 1);
1738 internal->dec_key_slots_used();
// Snapshot of a parent node, its left/right siblings, and the version
// observed at descent time, recorded so a remove can later re-validate
// and lock the ancestor chain.
// NOTE(review): elided view — the `parent_` field declaration and the
// constructor bodies sit on lines not shown here (both constructors'
// init lists initialize parent_). Comments only; code byte-identical.
1741 struct remove_parent_entry {
1742 // non-const members for STL
1744 node *parent_left_sibling_;
1745 node *parent_right_sibling_;
// Version word of parent_ captured when the entry was recorded.
1746 uint64_t parent_version_;
1748 // default ctor for STL
1749 remove_parent_entry()
1750 : parent_(NULL), parent_left_sibling_(NULL),
1751 parent_right_sibling_(NULL), parent_version_(0)
// Full ctor: capture parent, both siblings, and the observed version.
1754 remove_parent_entry(node *parent,
1755 node *parent_left_sibling,
1756 node *parent_right_sibling,
1757 uint64_t parent_version)
1758 : parent_(parent), parent_left_sibling_(parent_left_sibling),
1759 parent_right_sibling_(parent_right_sibling),
1760 parent_version_(parent_version)
1773 node *&replace_node,
1774 typename util::vec<remove_parent_entry>::type &parents,
1775 typename util::vec<node *>::type &locked_nodes);
// Prefetch this node's cache lines, dispatching to the leaf or internal
// node's own prefetch().
// NOTE(review): elided view — the return type, braces, and the
// presumed is_leaf_node() if/else guard around these two calls are on
// lines not shown here; confirm against the full source.
1778 template <typename P>
1780 btree<P>::node::prefetch()
1783 AsLeaf(this)->prefetch();
1785 AsInternal(this)->prefetch();
1788 extern void TestConcurrentBtreeFast();
1789 extern void TestConcurrentBtreeSlow();
1792 typedef btree<concurrent_btree_traits> concurrent_btree;
1793 typedef btree<single_threaded_btree_traits> single_threaded_btree;