16 #include <type_traits>
19 #include <unordered_map>
22 #include "btree_choice.h"
31 #include "small_unordered_map.h"
32 #include "static_unordered_map.h"
33 #include "static_vector.h"
36 #include "scopedperf.hh"
37 #include "marked_ptr.h"
38 #include "ndb_type_traits.h"
// NOTE(review): this region is a sparse excerpt; the declaration introduced by
// the template header below is elided from view — confirm against the full file.
41 template <template <typename> class Transaction, typename P>
// Thrown when a transaction object is used in a state where it cannot
// legally perform operations (exact throw sites not visible in this excerpt).
44 class transaction_unusable_exception {};
// Thrown when a txn marked TXN_FLAG_READ_ONLY attempts a write
// (see the TXN_FLAG_READ_ONLY comment in transaction_base below).
45 class transaction_read_only_exception {};
// Pluggable hook for rendering a tid_t as a human-readable string; used by
// the read_record_t operator<< printer below to format the tid that was read.
48 extern std::string (*g_proto_version_str)(uint64_t v);
50 // base class with very simple definitions- nothing too exciting yet
// NOTE(review): many interior lines of this class are elided in this excerpt
// (closing braces, some member bodies). Comments below describe only what is
// visible; confirm details against the full header.
51 class transaction_base {
52 template <template <typename> class T, typename P>
53 friend class base_txn_btree;
// Convenience aliases re-exported from dbtuple.
56 typedef dbtuple::tid_t tid_t;
57 typedef dbtuple::size_type size_type;
58 typedef dbtuple::string_type string_type;
60 // TXN_EMBRYO - the transaction object has been allocated but has not
61 // done any operations yet
62 enum txn_state { TXN_EMBRYO, TXN_ACTIVE, TXN_COMMITED, TXN_ABRT, };
// Bit flags passed in at construction time (stored in `flags` below).
65 // use the low-level scan protocol for checking scan consistency,
66 // instead of keeping track of absent ranges
67 TXN_FLAG_LOW_LEVEL_SCAN = 0x1,
69 // true to mark a read-only transaction- if a txn marked read-only
70 // does a write, a transaction_read_only_exception is thrown and the
72 TXN_FLAG_READ_ONLY = 0x2,
74 // XXX: more flags in the future, things like consistency levels
// X-macro listing every abort reason once; expanded below to generate the
// enum-to-string function and one event counter per reason.
77 #define ABORT_REASONS(x) \
78 x(ABORT_REASON_NONE) \
79 x(ABORT_REASON_USER) \
80 x(ABORT_REASON_UNSTABLE_READ) \
81 x(ABORT_REASON_FUTURE_TID_READ) \
82 x(ABORT_REASON_NODE_SCAN_WRITE_VERSION_CHANGED) \
83 x(ABORT_REASON_NODE_SCAN_READ_VERSION_CHANGED) \
84 x(ABORT_REASON_WRITE_NODE_INTERFERENCE) \
85 x(ABORT_REASON_INSERT_NODE_INTERFERENCE) \
86 x(ABORT_REASON_READ_NODE_INTEREFERENCE) \
87 x(ABORT_REASON_READ_ABSENCE_INTEREFERENCE)
// Maps an abort_reason to its enumerator name via the x-macro; falls through
// to ALWAYS_ASSERT(false) for out-of-range values.
96 AbortReasonStr(abort_reason reason)
99 #define CASE_X(x) case x: return #x;
100 ABORT_REASONS(CASE_X)
105 ALWAYS_ASSERT(false);
// Constructor body/initializer list partially elided; reason starts at NONE.
109 transaction_base(uint64_t flags)
111 reason(ABORT_REASON_NONE),
// Transactions are neither copyable nor movable.
114 transaction_base(const transaction_base &) = delete;
115 transaction_base(transaction_base &&) = delete;
116 transaction_base &operator=(const transaction_base &) = delete;
// One static event counter per abort reason, generated from the x-macro.
119 #define EVENT_COUNTER_DEF_X(x) \
120 static event_counter g_ ## x ## _ctr;
121 ABORT_REASONS(EVENT_COUNTER_DEF_X)
122 #undef EVENT_COUNTER_DEF_X
// Returns the counter corresponding to a given abort reason.
124 static event_counter *
125 AbortReasonCounter(abort_reason reason)
128 #define EVENT_COUNTER_CASE_X(x) case x: return &g_ ## x ## _ctr;
129 ABORT_REASONS(EVENT_COUNTER_CASE_X)
130 #undef EVENT_COUNTER_CASE_X
134 ALWAYS_ASSERT(false);
140 // only fires during invariant checking
144 if (state == TXN_EMBRYO)
146 INVARIANT(state == TXN_ACTIVE);
157 // the read set is a mapping from (tuple -> tid_read).
158 // "write_set" is used to indicate if this read tuple
159 // also belongs in the write set.
160 struct read_record_t {
161 constexpr read_record_t() : tuple(), t() {}
162 constexpr read_record_t(const dbtuple *tuple, tid_t t)
163 : tuple(tuple), t(t) {}
164 inline const dbtuple *
175 const dbtuple *tuple;
179 friend std::ostream &
180 operator<<(std::ostream &o, const read_record_t &r);
182 // the write set is logically a mapping from (tuple -> value_to_write).
183 struct write_record_t {
// Flag bits stored in the marked_ptr<concurrent_btree> below
// (FLAGS_INSERT is presumably 0x1; its definition is elided — confirm).
186 FLAGS_DOWRITE = 0x1 << 1,
189 constexpr inline write_record_t()
190 : tuple(), k(), r(), w(), btr()
193 // all inputs are assumed to be stable
194 inline write_record_t(dbtuple *tuple,
195 const string_type *k,
197 dbtuple::tuple_writer_t w,
198 concurrent_btree *btr,
206 this->btr.set_flags(insert ? FLAGS_INSERT : 0);
213 inline const dbtuple *
// Accessors read the flag bits packed into the btr marked_ptr.
221 return btr.get_flags() & FLAGS_INSERT;
226 return btr.get_flags() & FLAGS_DOWRITE;
// One-way transition: mark this record as needing a physical write.
231 INVARIANT(!do_write());
232 btr.or_flags(FLAGS_DOWRITE);
234 inline concurrent_btree *
239 inline const string_type &
249 inline dbtuple::tuple_writer_t
256 const string_type *k;
258 dbtuple::tuple_writer_t w;
259 marked_ptr<concurrent_btree> btr; // first bit for inserted, 2nd for dowrite
262 friend std::ostream &
263 operator<<(std::ostream &o, const write_record_t &r);
265 // the absent set is a mapping from (btree_node -> version_number).
266 struct absent_record_t { uint64_t version; };
268 friend std::ostream &
269 operator<<(std::ostream &o, const absent_record_t &r);
// Per-tuple bookkeeping used at commit time; packs LOCKED/INSERT flags into
// the low bits of the marked_ptr<dbtuple>.
271 struct dbtuple_write_info {
274 FLAGS_INSERT = 0x1 << 1,
276 dbtuple_write_info() : tuple(), entry(nullptr), pos() {}
277 dbtuple_write_info(dbtuple *tuple, write_record_t *entry,
278 bool is_insert, size_t pos)
279 : tuple(tuple), entry(entry), pos(pos)
// Visible branch sets both flags at once; the is_insert==false path is
// elided from this excerpt.
282 this->tuple.set_flags(FLAGS_LOCKED | FLAGS_INSERT);
284 // XXX: for searching only
285 explicit dbtuple_write_info(const dbtuple *tuple)
286 : tuple(const_cast<dbtuple *>(tuple)), entry(), pos() {}
292 inline const dbtuple *
297 inline ALWAYS_INLINE void
300 INVARIANT(!is_locked());
301 tuple.or_flags(FLAGS_LOCKED);
302 INVARIANT(is_locked());
304 inline ALWAYS_INLINE bool
307 return tuple.get_flags() & FLAGS_LOCKED;
309 inline ALWAYS_INLINE bool
312 return tuple.get_flags() & FLAGS_INSERT;
// Lexicographic order on [tuple, !is_insert, pos]: for the same tuple,
// inserts sort before non-inserts, then by position in the write set.
315 bool operator<(const dbtuple_write_info &o) const
317 // the unique key is [tuple, !is_insert, pos]
318 return tuple < o.tuple ||
319 (tuple == o.tuple && !is_insert() < !o.is_insert()) ||
320 (tuple == o.tuple && !is_insert() == !o.is_insert() && pos < o.pos);
322 marked_ptr<dbtuple> tuple;
323 write_record_t *entry;
// Global event counters for instrumentation.
328 static event_counter g_evt_read_logical_deleted_node_search;
329 static event_counter g_evt_read_logical_deleted_node_scan;
330 static event_counter g_evt_dbtuple_write_search_failed;
331 static event_counter g_evt_dbtuple_write_insert_failed;
333 static event_counter evt_local_search_lookups;
334 static event_counter evt_local_search_write_set_hits;
335 static event_counter evt_dbtuple_latest_replacement;
// Cycle-counter probes for profiling distinct phases of commit.
337 CLASS_STATIC_COUNTER_DECL(scopedperf::tsc_ctr, g_txn_commit_probe0, g_txn_commit_probe0_cg);
338 CLASS_STATIC_COUNTER_DECL(scopedperf::tsc_ctr, g_txn_commit_probe1, g_txn_commit_probe1_cg);
339 CLASS_STATIC_COUNTER_DECL(scopedperf::tsc_ctr, g_txn_commit_probe2, g_txn_commit_probe2_cg);
340 CLASS_STATIC_COUNTER_DECL(scopedperf::tsc_ctr, g_txn_commit_probe3, g_txn_commit_probe3_cg);
341 CLASS_STATIC_COUNTER_DECL(scopedperf::tsc_ctr, g_txn_commit_probe4, g_txn_commit_probe4_cg);
342 CLASS_STATIC_COUNTER_DECL(scopedperf::tsc_ctr, g_txn_commit_probe5, g_txn_commit_probe5_cg);
343 CLASS_STATIC_COUNTER_DECL(scopedperf::tsc_ctr, g_txn_commit_probe6, g_txn_commit_probe6_cg);
// Construction-time flag bits (TXN_FLAG_*); immutable for the txn's lifetime.
347 const uint64_t flags;
350 // type specializations
// Mark the transaction record types as trivially destructible — presumably so
// the small/static container types (small_vector, static_vector,
// small_unordered_map) can skip per-element destructor calls.
// NOTE(review): the enclosing namespace and `template <>` headers for these
// specializations are elided from this excerpt — confirm against
// ndb_type_traits.h for the trait's declaring scope.
353 struct is_trivially_destructible<transaction_base::read_record_t> {
354 static const bool value = true;
358 struct is_trivially_destructible<transaction_base::write_record_t> {
359 static const bool value = true;
363 struct is_trivially_destructible<transaction_base::absent_record_t> {
364 static const bool value = true;
368 struct is_trivially_destructible<transaction_base::dbtuple_write_info> {
369 static const bool value = true;
// Debug printer for read_record_t: prints the tuple *contents* (dereferences
// the pointer) and formats the tid via the g_proto_version_str hook.
373 inline ALWAYS_INLINE std::ostream &
374 operator<<(std::ostream &o, const transaction_base::read_record_t &r)
376 //o << "[tuple=" << util::hexify(r.get_tuple())
377 o << "[tuple=" << *r.get_tuple()
378 << ", tid_read=" << g_proto_version_str(r.get_tid())
// Debug printer for write_record_t: unlike the read printer above, this
// prints the tuple pointer itself, plus hexified key/value and flag bits.
383 inline ALWAYS_INLINE std::ostream &
386 const transaction_base::write_record_t &r)
388 o << "[tuple=" << r.get_tuple()
389 << ", key=" << util::hexify(r.get_key())
390 << ", value=" << util::hexify(r.get_value())
391 << ", insert=" << r.is_insert()
392 << ", do_write=" << r.do_write()
393 << ", btree=" << r.get_btree()
// Debug printer for absent_record_t: just the observed node version.
398 inline ALWAYS_INLINE std::ostream &
399 operator<<(std::ostream &o, const transaction_base::absent_record_t &r)
401 o << "[v=" << r.version << "]";
// Default compile-time tuning knobs for the transaction template below.
// Expected-size constants select the inline capacity of the small/static
// container typedefs; hard_expected_sizes switches to the static (fixed
// capacity) variants.
405 struct default_transaction_traits {
406 static const size_t read_set_expected_size = SMALL_SIZE_MAP;
407 static const size_t absent_set_expected_size = EXTRA_SMALL_SIZE_MAP;
408 static const size_t write_set_expected_size = SMALL_SIZE_MAP;
409 static const bool stable_input_memory = false;
410 static const bool hard_expected_sizes = false; // true if the expected sizes are hard maximums
411 static const bool read_own_writes = true; // if we read a key which we previously put(), are we guaranteed
412 // to read our latest (uncommitted) values? this comes at a
413 // performance penalty [you should not need this behavior to
414 // write txns, since you *know* the values you inserted]
416 typedef util::default_string_allocator StringAllocator;
// Same as the defaults, but promises the caller keeps key/value memory stable
// for the duration of the txn (stable_input_memory = true).
419 struct default_stable_transaction_traits : public default_transaction_traits {
420 static const bool stable_input_memory = true;
// CRTP-style transaction template: Protocol<Traits> derives from
// transaction<Protocol, Traits> and supplies the protocol-specific overrides
// listed under "expected public/protected overrides" below.
// NOTE(review): large portions of this class are elided in this excerpt.
423 template <template <typename> class Protocol, typename Traits>
424 class transaction : public transaction_base {
425 // XXX: weaker than necessary
426 template <template <typename> class, typename>
427 friend class base_txn_btree;
428 friend Protocol<Traits>;
432 // KeyWriter is expected to implement:
433 // [1-arg constructor]
434 // KeyWriter(const Key *)
435 // [fully materialize]
436 // template <typename StringAllocator>
437 // const std::string * fully_materialize(bool, StringAllocator &)
439 // ValueWriter is expected to implement:
440 // [1-arg constructor]
441 // ValueWriter(const Value *, ValueInfo)
442 // [compute new size from old value]
443 // size_t compute_needed(const uint8_t *, size_t)
444 // [fully materialize]
445 // template <typename StringAllocator>
446 // const std::string * fully_materialize(bool, StringAllocator &)
448 // void operator()(uint8_t *, size_t)
450 // ValueWriter does not have to be move/copy constructable. The value passed
451 // into the ValueWriter constructor is guaranteed to be valid throughout the
452 // lifetime of a ValueWriter instance.
454 // KeyReader Interface
456 // KeyReader is a simple transformation from (const std::string &) => const Key &.
457 // The input is guaranteed to be stable, so it has a simple interface:
459 // const Key &operator()(const std::string &)
461 // The KeyReader is expected to preserve the following property: After a call
462 // to operator(), but before the next, the returned value is guaranteed to be
463 // valid and remain stable.
465 // ValueReader Interface
467 // ValueReader is a more complex transformation from (const uint8_t *, size_t) => Value &.
468 // The input is not guaranteed to be stable, so it has a more complex interface:
470 // template <typename StringAllocator>
471 // bool operator()(const uint8_t *, size_t, StringAllocator &)
473 // This interface returns false if there was not enough buffer space to
474 // finish the read, true otherwise. Note that this interface returning true
475 // does NOT mean that a read was stable, but it just means there were enough
476 // bytes in the buffer to perform the tentative read.
478 // Note that ValueReader also exposes a dup interface
480 // template <typename StringAllocator>
481 // void dup(const Value &, StringAllocator &)
483 // ValueReader also exposes a means to fetch results:
487 // The ValueReader is expected to preserve the following property: After a
488 // call to operator(), if it returns true, then the value returned from
489 // results() should remain valid and stable until the next call to
492 //typedef typename P::Key key_type;
493 //typedef typename P::Value value_type;
494 //typedef typename P::ValueInfo value_info_type;
496 //typedef typename P::KeyWriter key_writer_type;
497 //typedef typename P::ValueWriter value_writer_type;
499 //typedef typename P::KeyReader key_reader_type;
500 //typedef typename P::SingleValueReader single_value_reader_type;
501 //typedef typename P::ValueReader value_reader_type;
503 typedef Traits traits_type;
504 typedef typename Traits::StringAllocator string_allocator_type;
// CRTP downcasts to the concrete protocol implementation.
509 inline ALWAYS_INLINE Protocol<Traits> *
512 return static_cast<Protocol<Traits> *>(this);
515 inline ALWAYS_INLINE const Protocol<Traits> *
518 return static_cast<const Protocol<Traits> *>(this);
521 // XXX: we have baked in b-tree into the protocol- other indexes are possible
522 // but we would need to abstract it away. we don't bother for now.
// Container typedefs: "small" variants spill to the heap past the expected
// size, "static" variants are fixed-capacity; std::conditional picks per
// traits_type::hard_expected_sizes. The #else branch (plain std::vector)
// applies when USE_SMALL_CONTAINER_OPT is not defined.
524 #ifdef USE_SMALL_CONTAINER_OPT
525 // XXX: use parameterized typedef to avoid duplication
528 typedef small_vector<
530 traits_type::read_set_expected_size> read_set_map_small;
531 typedef small_vector<
533 traits_type::write_set_expected_size> write_set_map_small;
534 typedef small_unordered_map<
535 const typename concurrent_btree::node_opaque_t *, absent_record_t,
536 traits_type::absent_set_expected_size> absent_set_map_small;
539 typedef static_vector<
541 traits_type::read_set_expected_size> read_set_map_static;
542 typedef static_vector<
544 traits_type::write_set_expected_size> write_set_map_static;
545 typedef static_unordered_map<
546 const typename concurrent_btree::node_opaque_t *, absent_record_t,
547 traits_type::absent_set_expected_size> absent_set_map_static;
549 // helper types for log writing
550 typedef small_vector<
552 traits_type::write_set_expected_size> write_set_u32_vec_small;
553 typedef static_vector<
555 traits_type::write_set_expected_size> write_set_u32_vec_static;
557 // use static types if the expected sizes are guarantees
559 typename std::conditional<
560 traits_type::hard_expected_sizes,
561 read_set_map_static, read_set_map_small>::type read_set_map;
563 typename std::conditional<
564 traits_type::hard_expected_sizes,
565 write_set_map_static, write_set_map_small>::type write_set_map;
567 typename std::conditional<
568 traits_type::hard_expected_sizes,
569 absent_set_map_static, absent_set_map_small>::type absent_set_map;
571 typename std::conditional<
572 traits_type::hard_expected_sizes,
573 write_set_u32_vec_static, write_set_u32_vec_small>::type write_set_u32_vec;
576 typedef std::vector<read_record_t> read_set_map;
577 typedef std::vector<write_record_t> write_set_map;
578 typedef std::vector<absent_record_t> absent_set_map;
579 typedef std::vector<uint32_t> write_set_u32_vec;
// Alias template: a vector of T sized to the write set's expected size,
// static or growable per hard_expected_sizes.
582 template <typename T>
583 using write_set_sized_vec =
584 typename std::conditional<
585 traits_type::hard_expected_sizes,
586 static_vector<T, traits_type::write_set_expected_size>,
587 typename util::vec<T, traits_type::write_set_expected_size>::type
593 dbtuple_write_info, traits_type::write_set_expected_size>::type
594 dbtuple_write_info_vec_small;
599 dbtuple_write_info, traits_type::write_set_expected_size>
600 dbtuple_write_info_vec_static;
604 typename std::conditional<
605 traits_type::hard_expected_sizes,
606 dbtuple_write_info_vec_static, dbtuple_write_info_vec_small>::type
607 dbtuple_write_info_vec;
// Membership test on a dbtuples vector already sorted by get_tuple()
// (the comparator here orders by tuple pointer only, unlike
// dbtuple_write_info::operator< which also uses insert-flag and pos).
610 sorted_dbtuples_contains(
611 const dbtuple_write_info_vec &dbtuples,
612 const dbtuple *tuple)
614 // XXX: skip binary search for small-sized dbtuples?
615 return std::binary_search(
616 dbtuples.begin(), dbtuples.end(),
617 dbtuple_write_info(tuple),
618 [](const dbtuple_write_info &lhs, const dbtuple_write_info &rhs)
619 { return lhs.get_tuple() < rhs.get_tuple(); });
624 inline transaction(uint64_t flags, string_allocator_type &sa);
625 inline ~transaction();
627 // returns TRUE on successful commit, FALSE on abort
628 // if doThrow, signals success by returning true, and
629 // failure by throwing an abort exception
630 bool commit(bool doThrow = false);
632 // abort() always succeeds
636 abort_impl(ABORT_REASON_USER);
639 void dump_debug_info() const;
// Records why we are aborting: bumps the per-reason counter and (in this
// variant) stashes the reason for dump_debug_info(). A second overload below
// only bumps the counter — presumably the debug vs. non-debug build variants;
// the surrounding #ifdef lines are elided from this excerpt.
643 abort_trap(abort_reason reason)
645 AbortReasonCounter(reason)->inc();
646 this->reason = reason; // for dump_debug_info() to see
651 inline ALWAYS_INLINE void
652 abort_trap(abort_reason reason)
654 AbortReasonCounter(reason)->inc();
658 std::map<std::string, uint64_t> get_txn_counters() const;
662 inline ALWAYS_INLINE bool
663 return get_flags() & TXN_FLAG_READ_ONLY;
666 // for debugging purposes only
667 inline const read_set_map &
673 inline const write_set_map &
674 get_write_set() const
679 inline const absent_set_map &
680 get_absent_set() const
686 inline void abort_impl(abort_reason r);
688 // assumes lock on marker is held on marker by caller, and marker is the
689 // latest: removes marker from tree, and clears latest
690 void cleanup_inserted_tuple_marker(
691 dbtuple *marker, const std::string &key,
692 concurrent_btree *btr);
694 // low-level API for txn_btree
696 // try to insert a new "tentative" tuple into the underlying
697 // btree associated with the given context.
699 // if return.first is not null, then this function will
700 // 1) mutate the transaction such that the absent_set is aware of any
701 // mutating changes made to the underlying btree.
702 // 2) add the new tuple to the write_set
704 // if return.second is true, then this txn should abort, because a conflict
705 // was detected w/ the absent_set.
707 // if return.first is not null, the returned tuple is locked()!
709 // if the return.first is null, then this function has no side effects.
711 // NOTE: !ret.first => !ret.second
712 // NOTE: assumes key/value are stable
713 std::pair< dbtuple *, bool >
714 try_insert_new_tuple(
715 concurrent_btree &btr,
716 const std::string *key,
718 dbtuple::tuple_writer_t writer);
720 // reads the contents of tuple into v
721 // within this transaction context
722 template <typename ValueReader>
724 do_tuple_read(const dbtuple *tuple, ValueReader &value_reader);
// Validates/records a btree node read at the given version (return type
// elided from this excerpt).
727 do_node_read(const typename concurrent_btree::node_opaque_t *n, uint64_t version);
730 // expected public overrides
733 * Can we overwrite prev with cur?
735 bool can_overwrite_record_tid(tid_t prev, tid_t cur) const;
737 inline string_allocator_type &
744 // expected protected overrides
747 * create a new, unique TID for a txn. at the point which gen_commit_tid(),
748 * it still has not been decided whether or not this txn will commit
751 tid_t gen_commit_tid(const dbtuple_write_info_vec &write_tuples);
753 bool can_read_tid(tid_t t) const;
755 // For GC handlers- note that on_dbtuple_spill() is called
756 // with the lock on ln held, to simplify GC code
758 // Is also called within an RCU read region
759 void on_dbtuple_spill(dbtuple *tuple_ahead, dbtuple *tuple);
761 // Called when the latest value written to ln is an empty
762 // (delete) marker. The protocol can then decide how to schedule
763 // the logical node for actual deletion
764 void on_logical_delete(dbtuple *tuple, const std::string &key, concurrent_btree *btr);
766 // if gen_commit_tid() is called, then on_tid_finish() will be called
767 // with the commit tid. before on_tid_finish() is called, state is updated
768 // with the resolution (committed, aborted) of this txn
769 void on_tid_finish(tid_t commit_tid);
771 void on_post_rcu_region_completion();
776 // SLOW accessor methods- used for invariant checking
778 typename read_set_map::iterator
779 find_read_set(const dbtuple *tuple)
781 // linear scan- returns the *first* entry found
782 // (a tuple can exist in the read_set more than once)
783 typename read_set_map::iterator it = read_set.begin();
784 typename read_set_map::iterator it_end = read_set.end();
785 for (; it != it_end; ++it)
786 if (it->get_tuple() == tuple)
// const overload delegates to the non-const version via const_cast.
791 inline typename read_set_map::const_iterator
792 find_read_set(const dbtuple *tuple) const
794 return const_cast<transaction *>(this)->find_read_set(tuple);
797 typename write_set_map::iterator
798 find_write_set(dbtuple *tuple)
800 // linear scan- returns the *first* entry found
801 // (a tuple can exist in the write_set more than once)
802 typename write_set_map::iterator it = write_set.begin();
803 typename write_set_map::iterator it_end = write_set.end();
804 for (; it != it_end; ++it)
805 if (it->get_tuple() == tuple)
810 inline typename write_set_map::const_iterator
811 find_write_set(const dbtuple *tuple) const
813 return const_cast<transaction *>(this)->find_write_set(tuple);
817 handle_last_tuple_in_group(
818 dbtuple_write_info &info, bool did_group_insert);
// Per-transaction state: the three tracking sets, the caller-provided string
// allocator (held by pointer), and an RCU region guard.
820 read_set_map read_set;
821 write_set_map write_set;
822 absent_set_map absent_set;
824 string_allocator_type *sa;
826 unmanaged<scoped_rcu_region> rcu_guard_;
// Exception carrying the abort_reason of a failed commit; what() (partially
// elided here) renders the reason via transaction_base::AbortReasonStr.
829 class transaction_abort_exception : public std::exception {
831 transaction_abort_exception(transaction_base::abort_reason r)
833 inline transaction_base::abort_reason
841 return transaction_base::AbortReasonStr(r);
844 transaction_base::abort_reason r;
847 // XXX(stephentu): stupid hacks
848 // XXX(stephentu): txn_epoch_sync is a misnomer
// Default no-op epoch/persistence hooks, specialized per transaction protocol
// elsewhere; all members are static so this is a pure customization point.
849 template <template <typename> class Transaction>
850 struct txn_epoch_sync {
851 // block until the next epoch
852 static inline void sync() {}
853 // finish any async jobs
854 static inline void finish() {}
855 // run this code when a benchmark worker finishes
856 static inline void thread_end() {}
857 // how many txns have we persisted in total, from
858 // the last reset invocation?
859 static inline std::pair<uint64_t, double>
860 compute_ntxn_persisted() { return {0, 0.0}; }
861 // reset the persisted counters
862 static inline void reset_ntxn_persisted() {}
865 #endif /* _NDB_TXN_H_ */