// -*- c-basic-offset: 2 -*-
#include "ndb_type_traits.h"
#include "small_vector.h"
#include "ownership_checker.h"

#include "masstree/masstree_scan.hh"
#include "masstree/masstree_insert.hh"
#include "masstree/masstree_remove.hh"
#include "masstree/masstree_print.hh"
#include "masstree/timestamp.hh"
#include "masstree/mtcounters.hh"
#include "masstree/circular_int.hh"

class simple_threadinfo {
    virtual void operator()(simple_threadinfo& ti) = 0;
  static inline void rcu_callback_function(void* p) {
    static_cast<rcu_callback*>(p)->operator()(ti);
  // XXX Correct node timestamps are needed for recovery, but for no other
  kvtimestamp_t operation_timestamp() const {
  kvtimestamp_t update_timestamp() const {
  kvtimestamp_t update_timestamp(kvtimestamp_t x) const {
    if (circular_int<kvtimestamp_t>::less_equal(ts_, x))
      // x might be a marker timestamp; ensure result is not
  kvtimestamp_t update_timestamp(kvtimestamp_t x, kvtimestamp_t y) const {
    if (circular_int<kvtimestamp_t>::less(x, y))
    if (circular_int<kvtimestamp_t>::less_equal(ts_, x))
      // x might be a marker timestamp; ensure result is not
  void increment_timestamp() {
  void advance_timestamp(kvtimestamp_t x) {
    if (circular_int<kvtimestamp_t>::less(ts_, x))
  void mark(threadcounter) {
  void mark(threadcounter, int64_t) {
  bool has_counter(threadcounter) const {
  uint64_t counter(threadcounter ci) const {

  /** @brief Return a function object that calls mark(ci); relax_fence().
   * This function object can be used to count the number of relax_fence()s
  relax_fence_function accounting_relax_fence(threadcounter) {
    return relax_fence_function();

  class accounting_relax_fence_function {
    template <typename V>

  /** @brief Return a function object that calls mark(ci); relax_fence().
   * This function object can be used to count the number of relax_fence()s
  accounting_relax_fence_function stable_fence() {
    return accounting_relax_fence_function();

  relax_fence_function lock_fence(threadcounter) {
    return relax_fence_function();
  void* allocate(size_t sz, memtag) {
    return rcu::s_instance.alloc(sz);
  void deallocate(void* p, size_t sz, memtag) {
    // in C++ allocators, 'p' must be nonnull
    rcu::s_instance.dealloc(p, sz);
  void deallocate_rcu(void *p, size_t sz, memtag) {
    rcu::s_instance.dealloc_rcu(p, sz);
  void* pool_allocate(size_t sz, memtag) {
    int nl = (sz + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE;
    return rcu::s_instance.alloc(nl * CACHE_LINE_SIZE);
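    // Worked example (illustrative, assuming a 64-byte CACHE_LINE_SIZE):
    // sz == 100 gives nl == (100 + 63) / 64 == 2, so the pool hands back
    // 128 bytes; pool (de)allocations always cover whole cache lines.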
  void pool_deallocate(void* p, size_t sz, memtag) {
    int nl = (sz + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE;
    rcu::s_instance.dealloc(p, nl * CACHE_LINE_SIZE);
  void pool_deallocate_rcu(void* p, size_t sz, memtag) {
    int nl = (sz + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE;
    rcu::s_instance.dealloc_rcu(p, nl * CACHE_LINE_SIZE);

  void rcu_register(rcu_callback *cb) {
    scoped_rcu_base<false> guard;
    rcu::s_instance.free_with_fn(cb, rcu_callback_function);
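  // Example (illustrative sketch, not part of the original interface):
  // deferring work until after an RCU grace period by subclassing
  // rcu_callback. The name `deferred_delete` is hypothetical, and the
  // lifetime of the callback object itself follows free_with_fn's contract.
  //
  //   struct deferred_delete : public rcu_callback {
  //     uint8_t* obj_;
  //     explicit deferred_delete(uint8_t* obj) : obj_(obj) {}
  //     void operator()(simple_threadinfo&) { /* reclaim obj_ here */ }
  //   };
  //   ...
  //   ti.rcu_register(new deferred_delete(p)); // invoked after the current epoch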
  mutable kvtimestamp_t ts_;

struct masstree_params : public Masstree::nodeparams<> {
  typedef uint8_t* value_type;
  typedef Masstree::value_print<value_type> value_print_type;
  typedef simple_threadinfo threadinfo_type;
  enum { RcuRespCaller = true };

struct masstree_single_threaded_params : public masstree_params {
  static constexpr bool concurrent = false;
template <typename P>
  typedef Masstree::node_base<P> node_base_type;
  typedef Masstree::internode<P> internode_type;
  typedef Masstree::leaf<P> leaf_type;
  typedef Masstree::leaf<P> node_type;
  typedef typename node_base_type::nodeversion_type nodeversion_type;

  typedef varkey key_type;
  typedef lcdf::Str string_type;
  typedef uint64_t key_slice;
  typedef typename P::value_type value_type;
  typedef typename P::threadinfo_type threadinfo;
  typedef typename std::conditional<!P::RcuRespCaller,
                                    disabled_rcu_region>::type rcu_region;

  // public to assist in testing
  static const unsigned int NKeysPerNode = P::leaf_width;
  static const unsigned int NMinKeysPerNode = P::leaf_width / 2;

  // XXX(stephentu): trying out a very opaque node API for now
  typedef node_type node_opaque_t;
  typedef std::pair< const node_opaque_t *, uint64_t > versioned_node_t;
  struct insert_info_t {
    const node_opaque_t* node;
    uint64_t old_version;
    uint64_t new_version;

  void invariant_checker() {} // stub for now
#ifdef BTREE_LOCK_OWNERSHIP_CHECKING
  NodeLockRegionBegin()
    ALWAYS_ASSERT(false);
    //ownership_checker<mbtree<P>, node_base_type>::NodeLockRegionBegin();
  AssertAllNodeLocksReleased()
    ALWAYS_ASSERT(false);
    //ownership_checker<mbtree<P>, node_base_type>::AssertAllNodeLocksReleased();
  AddNodeToLockRegion(const node_base_type *n)
    ALWAYS_ASSERT(false);
    //ownership_checker<mbtree<P>, node_base_type>::AddNodeToLockRegion(n);

    table_.initialize(ti);

  inline void clear() {
    table_.initialize(ti);

  /** Note: invariant checking is not thread safe */
  inline void invariant_checker() const {

  /** NOTE: the public interface assumes that the caller has taken care
   * of setting up RCU */

  inline bool search(const key_type &k, value_type &v,
                     versioned_node_t *search_info = nullptr) const;
   * The low level callback interface is as follows:
   * Consider a scan in the range [a, b):
   *   1) on_resp_node() is called at least once per node which
   *      has a responsibility range that overlaps with the scan range
   *   2) invoke() is called per <k, v>-pair such that k is in [a, b)
   * The order of calling on_resp_node() and invoke() is up to the implementation.
  class low_level_search_range_callback {
    virtual ~low_level_search_range_callback() {}
     * This node lies within the search range (at version v)
    virtual void on_resp_node(const node_opaque_t *n, uint64_t version) = 0;
     * This key/value pair was read from node n @ version
    virtual bool invoke(const string_type &k, value_type v,
                        const node_opaque_t *n, uint64_t version) = 0;
   * For all keys in [lower, *upper), invoke callback in ascending order.
   * If upper is NULL, then there is no upper bound
   * This function by default provides a weakly consistent view of the b-tree. For
   * instance, consider the following tree, where n = 3 is the max number of
   *   [A|B|C]<->[D|E|F]<->[G|H|I]
   * Suppose we want to scan [A, inf), so we traverse to the leftmost leaf node
   * and start a left-to-right walk. Suppose we have emitted keys A, B, and C,
   * and we are now just about to scan the middle leaf node. Now suppose
   * another thread concurrently does delete(A), followed by a delete(H). Now
   * the scanning thread resumes and emits keys D, E, F, G, and I, omitting H
   * because H was deleted. This is an inconsistent view of the b-tree, since
   * the scanning thread has observed the deletion of H but did not observe the
   * deletion of A, but we know that delete(A) happens before delete(H).
   * The weakly consistent guarantee provided is the following: all keys
   * which, at the time of invocation, are known to exist in the btree
   * will be discovered on a scan (provided the key falls within the scan's range),
   * and provided there are no concurrent modifications/removals of that key
   * Note that scans within a single node are consistent
   * XXX: add other modes which provide better consistency:
   *   B) optimistic validation mode
   * The last string parameter is an optional string buffer to use:
   * if null, a stack-allocated string will be used; if not null, the caller must ensure:
   *   A) buf->empty() at the beginning
   *   B) no concurrent mutation of string
   * note that string contents upon return are arbitrary
  search_range_call(const key_type &lower,
                    const key_type *upper,
                    low_level_search_range_callback &callback,
                    std::string *buf = nullptr) const;

  rsearch_range_call(const key_type &upper,
                     const key_type *lower,
                     low_level_search_range_callback &callback,
                     std::string *buf = nullptr) const;
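  // Example (illustrative sketch): a low_level_search_range_callback that
  // simply collects every key/value pair seen in [lower, *upper). The names
  // `collect_callback`, `btr`, and `lo` are hypothetical, and the caller is
  // assumed to already be inside an RCU region.
  //
  //   struct collect_callback : public low_level_search_range_callback {
  //     std::vector<std::pair<std::string, value_type>> kvs;
  //     void on_resp_node(const node_opaque_t *, uint64_t) {}
  //     bool invoke(const string_type &k, value_type v,
  //                 const node_opaque_t *, uint64_t) {
  //       kvs.emplace_back(std::string(k.data(), k.length()), v);
  //       return true; // keep scanning
  //     }
  //   };
  //
  //   collect_callback cb;
  //   btr.search_range_call(varkey(lo), nullptr, cb); // scan [lo, +inf)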
  class search_range_callback : public low_level_search_range_callback {
    on_resp_node(const node_opaque_t *n, uint64_t version)
    invoke(const string_type &k, value_type v,
           const node_opaque_t *n, uint64_t version)
    virtual bool invoke(const string_type &k, value_type v) = 0;
   * Callback is expected to implement bool operator()(const string_type &k, value_type v),
   * where the callback returns true if it wants to keep going, false otherwise
   * (these are the two arguments forwarded by low_level_search_range_callback_wrapper::invoke() below)
  template <typename F>
  search_range(const key_type &lower,
               const key_type *upper,
               std::string *buf = nullptr) const;

   * Callback is expected to implement bool operator()(const string_type &k, value_type v),
   * where the callback returns true if it wants to keep going, false otherwise
  template <typename F>
  rsearch_range(const key_type &upper,
                const key_type *lower,
                std::string *buf = nullptr) const;
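  // Example (illustrative sketch): using the functor-based search_range with
  // a counting callback. `count_values`, `btr`, and `lo` are hypothetical
  // names; the caller is assumed to be inside an RCU region.
  //
  //   struct count_values {
  //     size_t n = 0;
  //     bool operator()(const string_type &, value_type) { ++n; return true; }
  //   };
  //
  //   count_values f;
  //   btr.search_range(varkey(lo), nullptr, f); // counts keys in [lo, +inf)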
   * Returns true if key k did not already exist, false otherwise.
   * If k exists with a different mapping, still returns false.
   * If false and old_v is not NULL, then the value that was overwritten
   * is written into old_v.
  insert(const key_type &k, value_type v,
         value_type *old_v = NULL,
         insert_info_t *insert_info = NULL);

   * Only puts k=>v if k does not exist in map. Returns true
   * if k was inserted, false otherwise (k already exists).
  insert_if_absent(const key_type &k, value_type v,
                   insert_info_t *insert_info = NULL);

   * Returns true if a value was removed, false otherwise.
   * If true and old_v is not NULL, then the removed value
   * is written into old_v.
  remove(const key_type &k, value_type *old_v = NULL);
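  // Example (illustrative sketch): an overwrite-aware insert followed by a
  // remove that recovers the old value. `btr`, `k`, and `v` are hypothetical
  // names, and the caller is assumed to be inside an RCU region.
  //
  //   value_type old = NULL;
  //   if (!btr.insert(k, v, &old)) {
  //     // k already existed; `old` now holds the value that was overwritten
  //   }
  //   value_type removed = NULL;
  //   if (btr.remove(k, &removed)) {
  //     // `removed` holds the value that was unlinked from the tree
  //   }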
   * The tree walk API is a bit strange, due to the optimistic nature of the
   * The way it works is that on_node_begin() is called first. In
   * on_node_begin(), a callback function should read (but not modify) the
   * values it is interested in, and save them.
   * Then, either on_node_success() or on_node_failure() is called. If
   * on_node_success() is called, then the previous values read in
   * on_node_begin() are indeed valid. If on_node_failure() is called, then
   * the previous values are not valid and should be discarded.
  class tree_walk_callback {
    virtual ~tree_walk_callback() {}
    virtual void on_node_begin(const node_opaque_t *n) = 0;
    virtual void on_node_success() = 0;
    virtual void on_node_failure() = 0;

  void tree_walk(tree_walk_callback &callback) const;
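  // Example (illustrative sketch): a callback that counts keys under the
  // optimistic protocol described above. `key_counter` is a hypothetical
  // name; size_walk_callback below implements essentially the same idea.
  //
  //   class key_counter : public tree_walk_callback {
  //    public:
  //     void on_node_begin(const node_opaque_t *n) {
  //       spec_ = 0;
  //       // read (but do not modify) n's entries, accumulating into spec_
  //     }
  //     void on_node_success() { total_ += spec_; } // speculative reads were valid
  //     void on_node_failure() { spec_ = 0; }       // discard speculative reads
  //     size_t total() const { return total_; }
  //    private:
  //     size_t spec_ = 0, total_ = 0;
  //   };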
   * Is thread-safe, but not really designed to perform well with concurrent
   * modifications. Also, the value returned is not consistent given concurrent
   * modifications.
  inline size_t size() const;
  static inline uint64_t
  ExtractVersionNumber(const node_opaque_t *n) {
    // XXX(stephentu): I think we must use stable_version() for
    // correctness, but I am not 100% sure. It's definitely correct to use it,
    // but maybe we can get away with unstable_version()?
    return n->full_version_value();

  // [value, has_suffix]
  static std::vector< std::pair<value_type, bool> >
  ExtractValues(const node_opaque_t *n);

   * Not well defined if n is being concurrently modified, just for debugging
  NodeStringify(const node_opaque_t *n);

  static inline size_t InternalNodeSize() {
    return sizeof(internode_type);

  static inline size_t LeafNodeSize() {
    return sizeof(leaf_type);

  Masstree::basic_table<P> table_;

  static leaf_type* leftmost_descend_layer(node_base_type* n);
  class size_walk_callback;
  template <bool Reverse> class search_range_scanner_base;
  template <bool Reverse> class low_level_search_range_scanner;
  template <typename F> class low_level_search_range_callback_wrapper;
// Descend to the leftmost leaf of the (sub)tree rooted at n by repeatedly
// following child_[0], re-reading a node if its version changed underneath us.
template <typename P>
typename mbtree<P>::leaf_type *
mbtree<P>::leftmost_descend_layer(node_base_type *n)
  node_base_type *cur = n;
      return static_cast<leaf_type*>(cur);
    internode_type *in = static_cast<internode_type*>(cur);
    nodeversion_type version = cur->stable();
    node_base_type *child = in->child_[0];
    if (unlikely(in->has_changed(version)))
template <typename P>
void mbtree<P>::tree_walk(tree_walk_callback &callback) const {
  INVARIANT(rcu::s_instance.in_rcu_region());
  std::vector<node_base_type *> q, layers;
  q.push_back(table_.root());
    node_base_type *cur = q.back();
    leaf_type *leaf = leftmost_descend_layer(cur);
      auto version = leaf->stable();
      auto perm = leaf->permutation();
      for (int i = 0; i != perm.size(); ++i)
        if (leaf->is_layer(perm[i]))
          layers.push_back(leaf->lv_[perm[i]].layer());
      leaf_type *next = leaf->safe_next();
      callback.on_node_begin(leaf);
      if (unlikely(leaf->has_changed(version))) {
        callback.on_node_failure();
      callback.on_node_success();
      if (!layers.empty()) {
        q.insert(q.end(), layers.begin(), layers.end());
template <typename P>
class mbtree<P>::size_walk_callback : public tree_walk_callback {
  virtual void on_node_begin(const node_opaque_t *n);
  virtual void on_node_success();
  virtual void on_node_failure();

template <typename P>
mbtree<P>::size_walk_callback::on_node_begin(const node_opaque_t *n)
  auto perm = n->permutation();
  for (int i = 0; i != perm.size(); ++i)
    if (!n->is_layer(perm[i]))

template <typename P>
mbtree<P>::size_walk_callback::on_node_success()

template <typename P>
mbtree<P>::size_walk_callback::on_node_failure()

template <typename P>
inline size_t mbtree<P>::size() const
  size_walk_callback c;
template <typename P>
inline bool mbtree<P>::search(const key_type &k, value_type &v,
                              versioned_node_t *search_info) const
  Masstree::unlocked_tcursor<P> lp(table_, k.data(), k.length());
  bool found = lp.find_unlocked(ti);
    *search_info = versioned_node_t(lp.node(), lp.full_version_value());

template <typename P>
inline bool mbtree<P>::insert(const key_type &k, value_type v,
                              insert_info_t *insert_info)
  Masstree::tcursor<P> lp(table_, k.data(), k.length());
  bool found = lp.find_insert(ti);
    ti.advance_timestamp(lp.node_timestamp());
    insert_info->node = lp.node();
    insert_info->old_version = lp.previous_full_version_value();
    insert_info->new_version = lp.next_full_version_value(1);

template <typename P>
inline bool mbtree<P>::insert_if_absent(const key_type &k, value_type v,
                                        insert_info_t *insert_info)
  Masstree::tcursor<P> lp(table_, k.data(), k.length());
  bool found = lp.find_insert(ti);
    ti.advance_timestamp(lp.node_timestamp());
    insert_info->node = lp.node();
    insert_info->old_version = lp.previous_full_version_value();
    insert_info->new_version = lp.next_full_version_value(1);
  lp.finish(!found, ti);
 * Returns true if a value was removed, false otherwise.
 * If true and old_v is not NULL, then the removed value
 * is written into old_v.
template <typename P>
inline bool mbtree<P>::remove(const key_type &k, value_type *old_v)
  Masstree::tcursor<P> lp(table_, k.data(), k.length());
  bool found = lp.find_locked(ti);
  lp.finish(found ? -1 : 0, ti);
template <typename P>
template <bool Reverse>
class mbtree<P>::search_range_scanner_base {
  search_range_scanner_base(const key_type* boundary)
    : boundary_(boundary), boundary_compar_(false) {
  void check(const Masstree::scanstackelt<P>& iter,
             const Masstree::key<uint64_t>& key) {
    int min = std::min(boundary_->length(), key.prefix_length());
    int cmp = memcmp(boundary_->data(), key.full_string().data(), min);
    if (cmp < 0 || (cmp == 0 && boundary_->length() <= key.prefix_length()))
      boundary_compar_ = true;
      uint64_t last_ikey = iter.node()->ikey0_[iter.permutation()[iter.permutation().size() - 1]];
      boundary_compar_ = boundary_->slice_at(key.prefix_length()) <= last_ikey;
      boundary_compar_ = true;

  const key_type* boundary_;
  bool boundary_compar_;
template <typename P>
template <bool Reverse>
class mbtree<P>::low_level_search_range_scanner
  : public search_range_scanner_base<Reverse> {
  low_level_search_range_scanner(const key_type* boundary,
                                 low_level_search_range_callback& callback)
    : search_range_scanner_base<Reverse>(boundary), callback_(callback) {
  void visit_leaf(const Masstree::scanstackelt<P>& iter,
                  const Masstree::key<uint64_t>& key, threadinfo&) {
    this->n_ = iter.node();
    this->v_ = iter.full_version_value();
    callback_.on_resp_node(this->n_, this->v_);
      this->check(iter, key);
  bool visit_value(const Masstree::key<uint64_t>& key,
                   value_type value, threadinfo&) {
    if (this->boundary_compar_) {
      lcdf::Str bs(this->boundary_->data(), this->boundary_->size());
      if ((!Reverse && bs <= key.full_string()) ||
          ( Reverse && bs >= key.full_string()))
    return callback_.invoke(key.full_string(), value, this->n_, this->v_);

  Masstree::leaf<P>* n_;
  low_level_search_range_callback& callback_;
template <typename P>
template <typename F>
class mbtree<P>::low_level_search_range_callback_wrapper :
  public mbtree<P>::low_level_search_range_callback {
  low_level_search_range_callback_wrapper(F& callback) : callback_(callback) {}
  void on_resp_node(const node_opaque_t *n, uint64_t version) OVERRIDE {}
  invoke(const string_type &k, value_type v,
         const node_opaque_t *n, uint64_t version) OVERRIDE
    return callback_(k, v);
template <typename P>
inline void mbtree<P>::search_range_call(const key_type &lower,
                                         const key_type *upper,
                                         low_level_search_range_callback &callback,
                                         std::string*) const {
  low_level_search_range_scanner<false> scanner(upper, callback);
  table_.scan(lcdf::Str(lower.data(), lower.length()), true, scanner, ti);

template <typename P>
inline void mbtree<P>::rsearch_range_call(const key_type &upper,
                                          const key_type *lower,
                                          low_level_search_range_callback &callback,
                                          std::string*) const {
  low_level_search_range_scanner<true> scanner(lower, callback);
  table_.rscan(lcdf::Str(upper.data(), upper.length()), true, scanner, ti);

template <typename P> template <typename F>
inline void mbtree<P>::search_range(const key_type &lower,
                                    const key_type *upper,
                                    std::string*) const {
  low_level_search_range_callback_wrapper<F> wrapper(callback);
  low_level_search_range_scanner<false> scanner(upper, wrapper);
  table_.scan(lcdf::Str(lower.data(), lower.length()), true, scanner, ti);

template <typename P> template <typename F>
inline void mbtree<P>::rsearch_range(const key_type &upper,
                                     const key_type *lower,
                                     std::string*) const {
  low_level_search_range_callback_wrapper<F> wrapper(callback);
  low_level_search_range_scanner<true> scanner(lower, wrapper);
  table_.rscan(lcdf::Str(upper.data(), upper.length()), true, scanner, ti);
template <typename P>
std::string mbtree<P>::NodeStringify(const node_opaque_t *n)
  std::ostringstream b;
  b << "node[v=" << n->version_value() << "]";

template <typename P>
std::vector<std::pair<typename mbtree<P>::value_type, bool>>
mbtree<P>::ExtractValues(const node_opaque_t *n)
  std::vector< std::pair<value_type, bool> > ret;
  auto perm = n->permutation();
  for (int i = 0; i != perm.size(); ++i) {
    int keylenx = n->keylenx_[perm[i]];
    if (!n->keylenx_is_layer(keylenx))
      ret.emplace_back(n->lv_[perm[i]].value(), n->keylenx_has_ksuf(keylenx));

template <typename P>
void mbtree<P>::print() {

typedef mbtree<masstree_params> concurrent_btree;
typedef mbtree<masstree_single_threaded_params> single_threaded_btree;
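// Example (illustrative sketch): basic use of the concurrent wrapper. All
// variable names are hypothetical, and the caller is assumed to have set up
// an RCU region, per the note on the public interface above.
//
//   concurrent_btree btr;
//   concurrent_btree::value_type v =
//       reinterpret_cast<concurrent_btree::value_type>(some_record);
//   btr.insert_if_absent(varkey(key_str), v);
//   concurrent_btree::value_type out = nullptr;
//   if (btr.search(varkey(key_str), out)) {
//     // `out` now holds the stored value pointer
//   }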