From 53e6886f69ae6ee428c9a72643213c80bd75c793 Mon Sep 17 00:00:00 2001
From: Nathan Bronson
Date: Mon, 15 Jun 2015 08:29:44 -0700
Subject: [PATCH] Move AtomicUnorderedInsertMap to folly.

Summary: AtomicUnorderedInsertMap is a concurrent hash table that sits
firmly at the performance end of the generality <-> performance spectrum.
If you don't need updates (or can use your own concurrency control when
overwriting values), you never need to delete, and you can predict your
capacity perfectly, then you will get wait-free reads, lock-free inserts,
safe concurrent iteration, and excellent cache and performance outlier
behavior. Arbitrary key and value types are supported.

Reviewed By: @yfeldblum

Differential Revision: D2145281
---
 folly/AtomicUnorderedMap.h             | 490 +++++++++++++++++++++++++
 folly/Makefile.am                      |   2 +
 folly/detail/AtomicUnorderedMapUtils.h |  52 +++
 folly/test/AtomicUnorderedMapTest.cpp  | 316 ++++++++++++++++
 4 files changed, 860 insertions(+)
 create mode 100644 folly/AtomicUnorderedMap.h
 create mode 100644 folly/detail/AtomicUnorderedMapUtils.h
 create mode 100644 folly/test/AtomicUnorderedMapTest.cpp

diff --git a/folly/AtomicUnorderedMap.h b/folly/AtomicUnorderedMap.h
new file mode 100644
index 00000000..a2c4ac73
--- /dev/null
+++ b/folly/AtomicUnorderedMap.h
@@ -0,0 +1,490 @@
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef FOLLY_ATOMICUNORDEREDMAP_H
+#define FOLLY_ATOMICUNORDEREDMAP_H
+
+#include <algorithm>
+#include <atomic>
+#include <functional>
+#include <limits>
+#include <stdexcept>
+#include <system_error>
+#include <type_traits>
+#include <stdint.h>
+#include <string.h>
+
+#include <boost/type_traits/has_trivial_destructor.hpp>
+
+#include <folly/Bits.h>
+#include <folly/Conv.h>
+#include <folly/Likely.h>
+#include <folly/Random.h>
+#include <folly/detail/AtomicUnorderedMapUtils.h>
+
+namespace folly {
+
+/// You're probably reading this because you are looking for an
+/// AtomicUnorderedMap that is fully general, highly concurrent (for
+/// reads, writes, and iteration), and makes no performance compromises.
+/// We haven't figured that one out yet.  What you will find here is a
+/// hash table implementation that sacrifices generality so that it can
+/// give you all of the other things.
+///
+/// LIMITATIONS:
+///
+/// * Insert only (*) - the only write operation supported directly by
+///   AtomicUnorderedInsertMap is findOrConstruct.  There is a (*) because
+///   values aren't moved, so you can roll your own concurrency control for
+///   in-place updates of values (see MutableData and MutableAtom below),
+///   but the hash table itself doesn't help you.
+///
+/// * No resizing - you must specify the capacity up front, and once
+///   the hash map gets full you won't be able to insert.  Insert
+///   performance will degrade once the load factor is high.  Insert is
+///   O(1/(1-actual_load_factor)) (see the worked example at the end of
+///   this list).  Note that this is a pretty strong limitation, because
+///   you can't remove existing keys.
+///
+/// * 2^30 maximum capacity - you'll need to use something else if you
+///   have more than a billion entries.  If this limit bothers you, it
+///   wouldn't be too hard to parameterize the internal indexes between
+///   uint32_t and uint64_t.
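+///
+///   (Worked example of the insert cost above: at an actual load factor
+///   of 0.5 an insert expects to probe about 1/(1-0.5) = 2 slots, while
+///   at 0.9 it expects about 1/(1-0.9) = 10, so leave yourself headroom
+///   when choosing maxSize.)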
+///
+/// WHAT YOU GET IN EXCHANGE:
+///
+/// * Arbitrary key and value types - any K and V that can be used in a
+///   std::unordered_map can be used here.  In fact, the key and value
+///   types don't even have to be copyable or moveable!
+///
+/// * Keys and values in the map won't be moved - it is safe to keep
+///   pointers or references to the keys and values in the map, because
+///   they are never moved or destroyed (until the map itself is destroyed).
+///
+/// * Iterators are never invalidated - writes don't invalidate iterators,
+///   so you can scan and insert in parallel.
+///
+/// * Fast wait-free reads - reads are usually only a single cache miss,
+///   even when the hash table is very large.  Wait-freedom means that
+///   you won't see latency outliers even in the face of concurrent writes.
+///
+/// * Lock-free insert - writes proceed in parallel.  If a thread in the
+///   middle of a write is unlucky and gets suspended, it doesn't block
+///   anybody else.
+///
+/// COMMENTS ON INSERT-ONLY
+///
+/// This map provides wait-free linearizable reads and lock-free
+/// linearizable inserts.  Inserted values won't be moved, but no
+/// concurrency control is provided for safely updating them.  To remind
+/// you of that fact they are only provided in const form.  This is the
+/// only simple safe thing to do while preserving something like the normal
+/// std::map iteration form, which requires that iteration be exposed
+/// via std::pair (and prevents encapsulation of access to the value).
+///
+/// There are a couple of reasonable policies for doing in-place
+/// concurrency control on the values.  I am hoping that the policy can
+/// be injected via the value type or an extra template param, to keep
+/// the core AtomicUnorderedInsertMap insert-only:
+///
+///   CONST: this is the currently implemented strategy, which is simple,
+///   performant, and not that expressive.  You can always put in a value
+///   with a mutable field (see MutableAtom below), but that doesn't look
+///   as pretty as it should.
+///
+///   ATOMIC: for integers and integer-size trivially copyable structs
+///   (via an adapter like tao/queues/AtomicStruct) the value can be a
+///   std::atomic and read and written atomically.
+///
+///   SEQ-LOCK: attach a counter incremented before and after write.
+///   Writers serialize by using CAS to make an even->odd transition,
+///   then odd->even after the write.  Readers grab the value with memcpy,
+///   checking sequence value before and after.  Readers retry until they
+///   see an even sequence number that doesn't change.  This works for
+///   larger structs, but still requires memcpy to be equivalent to copy
+///   assignment, and it is no longer lock-free.  It scales very well,
+///   because the readers are still invisible (no cache line writes).
+///   A rough sketch of this policy follows at the end of this comment.
+///
+///   LOCK: folly's SharedMutex would be a good choice here.
+///
+/// MEMORY ALLOCATION
+///
+/// Underlying memory is allocated as a big anonymous mmap chunk, which
+/// might be cheaper than calloc() and is certainly not more expensive
+/// for large maps.  If the SkipKeyValueDeletion template param is true
+/// then deletion of the map consists of unmapping the backing memory,
+/// which is much faster than destructing all of the keys and values.
+/// Feel free to override if boost::has_trivial_destructor isn't
+/// recognizing the triviality of your destructors.
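+///
+/// As a concrete illustration of the SEQ-LOCK policy mentioned above --
+/// a rough sketch only, not something this header provides -- a value
+/// wrapper could look like the following, assuming T is trivially
+/// copyable and <atomic> and <cstring> are available:
+///
+///   template <typename T>
+///   struct SeqLocked {
+///     mutable std::atomic<uint32_t> seq{0};  // even = stable, odd = mid-write
+///     mutable T data;
+///
+///     void store(const T& v) const {
+///       uint32_t s;
+///       do {
+///         // wait for an even value, then claim it with an even->odd CAS
+///         s = seq.load(std::memory_order_relaxed) & ~1u;
+///       } while (!seq.compare_exchange_weak(s, s + 1,
+///                                           std::memory_order_acquire));
+///       memcpy(&data, &v, sizeof(T));
+///       seq.store(s + 2, std::memory_order_release);  // odd->even, publish
+///     }
+///
+///     T load() const {
+///       T copy;
+///       uint32_t before, after;
+///       do {
+///         before = seq.load(std::memory_order_acquire);
+///         memcpy(&copy, &data, sizeof(T));
+///         after = seq.load(std::memory_order_acquire);
+///       } while ((before & 1) != 0 || before != after);  // retry on torn read
+///       return copy;
+///     }
+///   };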
+template <typename Key,
+          typename Value,
+          typename Hash = std::hash<Key>,
+          typename KeyEqual = std::equal_to<Key>,
+          bool SkipKeyValueDeletion =
+              (boost::has_trivial_destructor<Key>::value &&
+               boost::has_trivial_destructor<Value>::value),
+          template<typename> class Atom = std::atomic,
+          typename Allocator = folly::detail::MMapAlloc>
+
+struct AtomicUnorderedInsertMap {
+
+  typedef Key key_type;
+  typedef Value mapped_type;
+  typedef std::pair<Key,Value> value_type;
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+  typedef Hash hasher;
+  typedef KeyEqual key_equal;
+  typedef const value_type& const_reference;
+
+  typedef struct ConstIterator {
+    ConstIterator(const AtomicUnorderedInsertMap& owner, uint32_t slot)
+      : owner_(owner)
+      , slot_(slot)
+    {}
+
+    ConstIterator(const ConstIterator&) = default;
+    ConstIterator& operator= (const ConstIterator&) = default;
+
+    const value_type& operator* () const {
+      return owner_.slots_[slot_].keyValue();
+    }
+
+    const value_type* operator-> () const {
+      return &owner_.slots_[slot_].keyValue();
+    }
+
+    // pre-increment
+    const ConstIterator& operator++ () {
+      while (slot_ > 0) {
+        --slot_;
+        if (owner_.slots_[slot_].state() == LINKED) {
+          break;
+        }
+      }
+      return *this;
+    }
+
+    // post-increment
+    ConstIterator operator++ (int dummy) {
+      auto prev = *this;
+      ++*this;
+      return prev;
+    }
+
+    bool operator== (const ConstIterator& rhs) const {
+      return slot_ == rhs.slot_;
+    }
+    bool operator!= (const ConstIterator& rhs) const {
+      return !(*this == rhs);
+    }
+
+   private:
+    const AtomicUnorderedInsertMap& owner_;
+    uint32_t slot_;
+  } const_iterator;
+
+  friend ConstIterator;
+
+  /// Constructs a map that will support the insertion of maxSize
+  /// key-value pairs without exceeding the max load factor.  Load
+  /// factors of greater than 1 are not supported, and once the actual load
+  /// factor of the map approaches 1 the insert performance will suffer.
+  /// The capacity is limited to 2^30 (about a billion), beyond which
+  /// we will throw invalid_argument.
+  explicit AtomicUnorderedInsertMap(
+      size_t maxSize,
+      float maxLoadFactor = 0.8f,
+      const Allocator& alloc = Allocator())
+    : allocator_(alloc)
+  {
+    size_t capacity = maxSize / std::min(1.0f, maxLoadFactor) + 128;
+    if (capacity > (1 << 30) && maxSize < (1 << 30)) {
+      // we'll do our best
+      capacity = (1 << 30);
+    }
+    if (capacity < maxSize || capacity > (1 << 30)) {
+      throw std::invalid_argument(
+          "AtomicUnorderedInsertMap capacity must fit in 30 bits");
+    }
+
+    numSlots_ = capacity;
+    slotMask_ = folly::nextPowTwo(capacity * 4) - 1;
+    mmapRequested_ = sizeof(Slot) * capacity;
+    slots_ = reinterpret_cast<Slot*>(allocator_.allocate(mmapRequested_));
+    zeroFillSlots();
+    // mark the zero-th slot as in-use but not valid, since that happens
+    // to be our nil value
+    slots_[0].stateUpdate(EMPTY, CONSTRUCTING);
+  }
+
+  ~AtomicUnorderedInsertMap() {
+    if (!SkipKeyValueDeletion) {
+      for (size_t i = 1; i < numSlots_; ++i) {
+        slots_[i].~Slot();
+      }
+    }
+    allocator_.deallocate(reinterpret_cast<char*>(slots_), mmapRequested_);
+  }
+
+  /// Searches for the key, returning (iter,false) if it is found.
+  /// If it is not found calls the functor Func with a void* argument
+  /// that is raw storage suitable for placement construction of a Value
+  /// (see raw_value_type), then returns (iter,true).  May call Func and
+  /// then return (iter,false) if there are other concurrent writes, in
+  /// which case the newly constructed value will be immediately destroyed.
+  ///
+  /// This function does not block other readers or writers.  If there
+  /// are other concurrent writes, many parallel calls to func may happen
+  /// and only the first one to complete will win.  The values constructed
+  /// by the other calls to func will be destroyed.
+  ///
+  /// Usage:
+  ///
+  ///   AtomicUnorderedInsertMap<std::string,std::string> memo;
+  ///
+  ///   auto value = memo.findOrConstruct(key, [=](void* raw) {
+  ///     new (raw) std::string(computation(key));
+  ///   })->first;
+  template <typename Func>
+  std::pair<const_iterator,bool> findOrConstruct(const Key& key, Func&& func) {
+    auto const slot = keyToSlotIdx(key);
+    auto prev = slots_[slot].headAndState_.load(std::memory_order_acquire);
+
+    auto existing = find(key, slot);
+    if (existing != 0) {
+      return std::make_pair(ConstIterator(*this, existing), false);
+    }
+
+    auto idx = allocateNear(slot);
+    new (&slots_[idx].keyValue().first) Key(key);
+    func(static_cast<void*>(&slots_[idx].keyValue().second));
+
+    while (true) {
+      slots_[idx].next_ = prev >> 2;
+
+      // we can merge the head update and the CONSTRUCTING -> LINKED update
+      // into a single CAS if slot == idx (which should happen often)
+      auto after = idx << 2;
+      if (slot == idx) {
+        after += LINKED;
+      } else {
+        after += (prev & 3);
+      }
+
+      if (slots_[slot].headAndState_.compare_exchange_strong(prev, after)) {
+        // success
+        if (idx != slot) {
+          slots_[idx].stateUpdate(CONSTRUCTING, LINKED);
+        }
+        return std::make_pair(ConstIterator(*this, idx), true);
+      }
+      // compare_exchange_strong updates its first arg on failure, so
+      // there is no need to reread prev
+
+      existing = find(key, slot);
+      if (existing != 0) {
+        // our allocated key and value are no longer needed
+        slots_[idx].keyValue().first.~Key();
+        slots_[idx].keyValue().second.~Value();
+        slots_[idx].stateUpdate(CONSTRUCTING, EMPTY);
+
+        return std::make_pair(ConstIterator(*this, existing), false);
+      }
+    }
+  }
+
+  /// This isn't really emplace, but it is what we need to test.
+  /// Eventually we can duplicate all of the std::pair constructor
+  /// forms, including a recursive tuple forwarding template (see
+  /// http://functionalcpp.wordpress.com/2013/08/28/tuple-forwarding/).
+  template <typename K, typename V>
+  std::pair<const_iterator,bool> emplace(const K& key, V&& value) {
+    return findOrConstruct(key, [&](void* raw) {
+      new (raw) Value(std::forward<V>(value));
+    });
+  }
+
+  const_iterator find(const Key& key) const {
+    return ConstIterator(*this, find(key, keyToSlotIdx(key)));
+  }
+
+  const_iterator cbegin() const {
+    uint32_t slot = numSlots_ - 1;
+    while (slot > 0 && slots_[slot].state() != LINKED) {
+      --slot;
+    }
+    return ConstIterator(*this, slot);
+  }
+
+  const_iterator cend() const {
+    return ConstIterator(*this, 0);
+  }
+
+ private:
+
+  enum {
+    kMaxAllocationTries = 1000, // after this we throw
+  };
+
+  enum BucketState : uint32_t {
+    EMPTY = 0,
+    CONSTRUCTING = 1,
+    LINKED = 2,
+  };
+
+  /// Lock-free insertion is easiest by prepending to collision chains.
+  /// A large chaining hash table takes two cache misses instead of
+  /// one, however.  Our solution is to colocate the bucket storage and
+  /// the head storage, so that even though we are traversing chains we
+  /// are likely to stay within the same cache line.  Just make sure to
+  /// traverse head before looking at any keys.  This strategy gives us
+  /// 32 bit pointers and fast iteration.
+  struct Slot {
+    /// The bottom two bits are the BucketState, the rest is the index
+    /// of the first bucket for the chain whose keys map to this slot.
+    /// When things are going well the head usually links to this slot,
+    /// but that doesn't always have to happen.
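+    ///
+    /// For example, with LINKED == 2 a headAndState_ value of
+    /// (17 << 2) | 2 == 70 says "this slot is LINKED and the chain for
+    /// this hash bucket starts at slots_[17]"; find() recovers the head
+    /// with (headAndState_ >> 2) and the state with (headAndState_ & 3).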
+    Atom<uint32_t> headAndState_;
+
+    /// The next bucket in the chain
+    uint32_t next_;
+
+    /// Key and Value
+    typename std::aligned_storage<sizeof(value_type),
+                                  alignof(value_type)>::type raw_;
+
+
+    ~Slot() {
+      auto s = state();
+      assert(s == EMPTY || s == LINKED);
+      if (s == LINKED) {
+        keyValue().first.~Key();
+        keyValue().second.~Value();
+      }
+    }
+
+    BucketState state() const {
+      return BucketState(headAndState_.load(std::memory_order_acquire) & 3);
+    }
+
+    void stateUpdate(BucketState before, BucketState after) {
+      assert(state() == before);
+      headAndState_ += (after - before);
+    }
+
+    value_type& keyValue() {
+      assert(state() != EMPTY);
+      return *static_cast<value_type*>(static_cast<void*>(&raw_));
+    }
+
+    const value_type& keyValue() const {
+      assert(state() != EMPTY);
+      return *static_cast<const value_type*>(static_cast<const void*>(&raw_));
+    }
+
+  };
+
+  // We manually manage the slot memory so we can bypass initialization
+  // (by getting a zero-filled mmap chunk) and optionally destruction of
+  // the slots
+
+  size_t mmapRequested_;
+  size_t numSlots_;
+
+  /// tricky, see keyToSlotIdx
+  size_t slotMask_;
+
+  Allocator allocator_;
+  Slot* slots_;
+
+  uint32_t keyToSlotIdx(const Key& key) const {
+    size_t h = hasher()(key);
+    h &= slotMask_;
+    while (h >= numSlots_) {
+      h -= numSlots_;
+    }
+    return h;
+  }
+
+  uint32_t find(const Key& key, uint32_t slot) const {
+    KeyEqual ke = {};
+    auto hs = slots_[slot].headAndState_.load(std::memory_order_acquire);
+    for (slot = hs >> 2; slot != 0; slot = slots_[slot].next_) {
+      if (ke(key, slots_[slot].keyValue().first)) {
+        return slot;
+      }
+    }
+    return 0;
+  }
+
+  /// Allocates a slot and returns its index.  Tries to put it near
+  /// slots_[start].
+  uint32_t allocateNear(uint32_t start) {
+    for (auto tries = 0; tries < kMaxAllocationTries; ++tries) {
+      auto slot = allocationAttempt(start, tries);
+      auto prev = slots_[slot].headAndState_.load(std::memory_order_acquire);
+      if ((prev & 3) == EMPTY &&
+          slots_[slot].headAndState_.compare_exchange_strong(
+              prev, prev + CONSTRUCTING - EMPTY)) {
+        return slot;
+      }
+    }
+    throw std::bad_alloc();
+  }
+
+  /// Returns the slot we should attempt to allocate after tries failed
+  /// tries, starting from the specified slot.  This is pulled out so we
+  /// can specialize it differently during deterministic testing
+  uint32_t allocationAttempt(uint32_t start, uint32_t tries) const {
+    if (LIKELY(tries < 8 && start + tries < numSlots_)) {
+      return start + tries;
+    } else {
+      uint32_t rv = folly::Random::rand32(numSlots_);
+      assert(rv < numSlots_);
+      return rv;
+    }
+  }
+
+  void zeroFillSlots() {
+    using folly::detail::GivesZeroFilledMemory;
+    if (!GivesZeroFilledMemory<Allocator>::value) {
+      memset(slots_, 0, mmapRequested_);
+    }
+  }
+};
+
+
+/// MutableAtom is a tiny wrapper that gives you the option of atomically
+/// updating values inserted into an AtomicUnorderedInsertMap<K,
+/// MutableAtom<V>>.  This relies on AtomicUnorderedInsertMap's guarantee
+/// that it doesn't move values.
+template <typename T,
+          template<typename> class Atom = std::atomic>
+struct MutableAtom {
+  mutable Atom<T> data;
+
+  explicit MutableAtom(const T& init) : data(init) {}
+};
+
+/// MutableData is a tiny wrapper that gives you the option of using an
+/// external concurrency control mechanism for updating values inserted
+/// into an AtomicUnorderedInsertMap.
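+///
+/// For example (a sketch only; Widget, kNumStripes, kCapacity and the
+/// striping scheme are illustrative, not part of this header):
+///
+///   folly::SharedMutex locks[kNumStripes];
+///   AtomicUnorderedInsertMap<int, MutableData<Widget>> m(kCapacity);
+///
+///   void update(int key, const Widget& w) {
+///     auto iter = m.find(key);
+///     std::unique_lock<folly::SharedMutex> guard(locks[key % kNumStripes]);
+///     iter->second.data = w;  // ok: data is mutable and never moves
+///   }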
+template <typename T>
+struct MutableData {
+  mutable T data;
+  explicit MutableData(const T& init) : data(init) {}
+};
+
+
+}
+#endif
diff --git a/folly/Makefile.am b/folly/Makefile.am
index 5b1775e4..a682e012 100644
--- a/folly/Makefile.am
+++ b/folly/Makefile.am
@@ -28,6 +28,7 @@ nobase_follyinclude_HEADERS = \
 	AtomicHashMap-inl.h \
 	AtomicLinkedList.h \
 	AtomicStruct.h \
+	AtomicUnorderedMap.h \
 	Baton.h \
 	Benchmark.h \
 	Bits.h \
@@ -39,6 +40,7 @@ nobase_follyinclude_HEADERS = \
 	CpuId.h \
 	CPortability.h \
 	detail/AtomicHashUtils.h \
+	detail/AtomicUnorderedMapUtils.h \
 	detail/BitIteratorDetail.h \
 	detail/BitsDetail.h \
 	detail/CacheLocality.h \
diff --git a/folly/detail/AtomicUnorderedMapUtils.h b/folly/detail/AtomicUnorderedMapUtils.h
new file mode 100644
index 00000000..f5737972
--- /dev/null
+++ b/folly/detail/AtomicUnorderedMapUtils.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <assert.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <system_error>
+#include <type_traits>
+#include <unistd.h>
+
+namespace folly { namespace detail {
+
+class MMapAlloc {
+ private:
+  size_t computeSize(size_t size) {
+    long pagesize = sysconf(_SC_PAGESIZE);
+    size_t mmapLength = ((size - 1) & ~(pagesize - 1)) + pagesize;
+    assert(size <= mmapLength && mmapLength < size + pagesize);
+    assert((mmapLength % pagesize) == 0);
+    return mmapLength;
+  }
+
+ public:
+  void* allocate(size_t size) {
+    auto len = computeSize(size);
+
+    // MAP_HUGETLB is a perf win, but requires cooperation from the
+    // deployment environment (and a change to computeSize()).
+    void* mem = static_cast<void*>(mmap(
+        nullptr,
+        len,
+        PROT_READ | PROT_WRITE,
+        MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE,
+        -1,
+        0));
+    if (mem == reinterpret_cast<void*>(-1)) {
+      throw std::system_error(errno, std::system_category());
+    }
+
+    return mem;
+  }
+
+  void deallocate(void* p, size_t size) {
+    auto len = computeSize(size);
+    munmap(p, len);
+  }
+};
+
+template <typename Allocator>
+struct GivesZeroFilledMemory : public std::false_type {};
+
+template <>
+struct GivesZeroFilledMemory<MMapAlloc> : public std::true_type {};
+
+}}
diff --git a/folly/test/AtomicUnorderedMapTest.cpp b/folly/test/AtomicUnorderedMapTest.cpp
new file mode 100644
index 00000000..7896906f
--- /dev/null
+++ b/folly/test/AtomicUnorderedMapTest.cpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/AtomicUnorderedMap.h>
+#include <folly/test/DeterministicSchedule.h>
+#include <thread>
+#include <semaphore.h>
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <folly/Benchmark.h>
+#include <unordered_map>
+
+using namespace folly;
+using namespace folly::test;
+
+template <typename T>
+struct non_atomic {
+  T value;
+
+  non_atomic() = default;
+  non_atomic(const non_atomic&) = delete;
+  constexpr /* implicit */ non_atomic(T desired): value(desired) {}
+
+  T operator+=(T arg) { value += arg; return load();}
+
+  T load(std::memory_order order = std::memory_order_seq_cst) const {
+    return value;
+  }
+
+  /* implicit */
+  operator T() const {return load();}
+
+  void store(T desired, std::memory_order order = std::memory_order_seq_cst) {
+    value = desired;
+  }
+
+  T exchange(T desired, std::memory_order order = std::memory_order_seq_cst) {
+    T old = load();
+    store(desired);
+    return old;
+  }
+
+  bool compare_exchange_weak(
+      T& expected, T desired,
+      std::memory_order success = std::memory_order_seq_cst,
+      std::memory_order failure = std::memory_order_seq_cst) {
+    if (value == expected) {
+      value = desired;
+      return true;
+    }
+
+    expected = value;
+    return false;
+  }
+
+  bool compare_exchange_strong(
+      T& expected, T desired,
+      std::memory_order success = std::memory_order_seq_cst,
+      std::memory_order failure = std::memory_order_seq_cst) {
+    if (value == expected) {
+      value = desired;
+      return true;
+    }
+
+    expected = value;
+    return false;
+  }
+
+  bool is_lock_free() const {return true;}
+};
+
+template <
+    typename Key, typename Value, template <typename> class Atom = non_atomic>
+using UnorderedInsertMap = AtomicUnorderedInsertMap<
+    Key,
+    Value,
+    std::hash<Key>,
+    std::equal_to<Key>,
+    (boost::has_trivial_destructor<Key>::value &&
+     boost::has_trivial_destructor<Value>::value),
+    Atom,
+    std::allocator<char>>;
+
+TEST(AtomicUnorderedInsertMap, basic) {
+  AtomicUnorderedInsertMap<std::string, std::string> m(100);
+
+  m.emplace("abc", "ABC");
+  EXPECT_TRUE(m.find("abc") != m.cend());
+  EXPECT_EQ(m.find("abc")->first, "abc");
+  EXPECT_EQ(m.find("abc")->second, "ABC");
+  EXPECT_TRUE(m.find("def") == m.cend());
+  auto iter = m.cbegin();
+  EXPECT_TRUE(iter != m.cend());
+  EXPECT_TRUE(iter == m.find("abc"));
+  auto a = iter;
+  EXPECT_TRUE(a == iter);
+  auto b = iter;
+  ++iter;
+  EXPECT_TRUE(iter == m.cend());
+  EXPECT_TRUE(a == b);
+  EXPECT_TRUE(a != iter);
+  a++;
+  EXPECT_TRUE(a == iter);
+  EXPECT_TRUE(a != b);
+}
+
+TEST(AtomicUnorderedInsertMap, value_mutation) {
+  AtomicUnorderedInsertMap<int, MutableAtom<int>> m(100);
+
+  for (int i = 0; i < 50; ++i) {
+    m.emplace(i, i);
+  }
+
+  m.find(1)->second.data++;
+}
+
+TEST(UnorderedInsertMap, value_mutation) {
+  UnorderedInsertMap<int, MutableData<int>> m(100);
+
+  for (int i = 0; i < 50; ++i) {
+    m.emplace(i, i);
+  }
+
+  m.find(1)->second.data++;
+  EXPECT_EQ(m.find(1)->second.data, 2);
+}
+
+BENCHMARK(lookup_int_int_hit, iters) {
+  std::unique_ptr<AtomicUnorderedInsertMap<int, size_t>> ptr = {};
+
+  size_t capacity = 100000;
+
+  BENCHMARK_SUSPEND {
+    ptr.reset(new AtomicUnorderedInsertMap<int, size_t>(capacity));
+    for (size_t i = 0; i < capacity; ++i) {
+      auto k = 3 * ((5641 * i) % capacity);
+      ptr->emplace(k, k + 1);
+      EXPECT_EQ(ptr->find(k)->second, k + 1);
+    }
+  }
+
+  for (size_t i = 0; i < iters; ++i) {
+    size_t k = 3 * (((i * 7919) ^ (i * 4001)) % capacity);
+    auto iter = ptr->find(k);
+    if (iter == ptr->cend() ||
+        iter->second != k + 1) {
+      auto jter = ptr->find(k);
+      EXPECT_TRUE(iter == jter);
+    }
+    EXPECT_EQ(iter->second, k + 1);
+  }
+
+  BENCHMARK_SUSPEND {
+    ptr.reset(nullptr);
+  }
+}
+
+struct PairHash {
+  size_t operator()(const std::pair<uint64_t, uint64_t>& pr) const {
+    return pr.first ^ pr.second;
+  }
+};
+
+void contendedRW(size_t itersPerThread,
+                 size_t capacity,
+                 size_t numThreads,
+                 size_t readsPerWrite) {
+  typedef std::pair<uint64_t, uint64_t> Key;
+  typedef AtomicUnorderedInsertMap<Key, MutableAtom<uint32_t>, PairHash> Map;
+
+  std::unique_ptr<Map> ptr = {};
+  std::atomic<bool> go;
+  std::vector<std::thread> threads;
+
+  BENCHMARK_SUSPEND {
+    ptr.reset(new Map(capacity));
+    while (threads.size() < numThreads) {
+      threads.emplace_back([&](){
+        while (!go) {
+          std::this_thread::yield();
+        }
+
+        size_t reads = 0;
+        size_t writes = 0;
+        while (reads + writes < itersPerThread) {
+          auto r = Random::rand32();
+          Key key(reads + writes, r);
+          if (reads < writes * readsPerWrite ||
+              writes >= capacity / numThreads) {
+            // read needed
+            ++reads;
+            auto iter = ptr->find(key);
+            EXPECT_TRUE(
+                iter == ptr->cend() ||
+                iter->second.data.load(std::memory_order_acquire) >= key.first);
+          } else {
+            ++writes;
+            try {
+              auto pr = ptr->emplace(key, key.first);
+              if (!pr.second) {
+                pr.first->second.data++;
+              }
+            } catch (std::bad_alloc& x) {
+              LOG(INFO) << "bad alloc";
+            }
+          }
+        }
+      });
+    }
+  }
+
+  go = true;
+
+  for (auto& thr : threads) {
+    thr.join();
+  }
+
+  BENCHMARK_SUSPEND {
+    ptr.reset(nullptr);
+  }
+}
+
+// sudo nice -n -20 ~/fbcode/_bin/common/concurrency/experimental/atomic_unordered_map --benchmark --bm_min_iters=1000000
+//
+// without MAP_HUGETLB (default)
+//
+// ============================================================================
+// common/concurrency/experimental/AtomicUnorderedMapTest.cpprelative  time/iter
+//   iters/s
+// ============================================================================
+// lookup_int_int_hit                                          20.05ns   49.89M
+// contendedRW(small_32thr_99pct)                              70.36ns   14.21M
+// contendedRW(large_32thr_99pct)                             164.23ns    6.09M
+// contendedRW(large_32thr_99_9pct)                           158.81ns    6.30M
+// ============================================================================
+//
+// with MAP_HUGETLB hacked in
+// ============================================================================
+// lookup_int_int_hit                                          19.67ns   50.84M
+// contendedRW(small_32thr_99pct)                              62.46ns   16.01M
+// contendedRW(large_32thr_99pct)                             119.41ns    8.37M
+// contendedRW(large_32thr_99_9pct)                           111.23ns    8.99M
+// ============================================================================
+BENCHMARK_NAMED_PARAM(contendedRW, small_32thr_99pct, 100000, 32, 99)
+BENCHMARK_NAMED_PARAM(contendedRW, large_32thr_99pct, 100000000, 32, 99)
+BENCHMARK_NAMED_PARAM(contendedRW, large_32thr_99_9pct, 100000000, 32, 999)
+
+BENCHMARK_DRAW_LINE();
+
+// sudo nice -n -20 ~/fbcode/_build/opt/site_integrity/quasar/experimental/atomic_unordered_map_test --benchmark --bm_min_iters=10000
+// Single threaded benchmarks to test how much better we are than
+// std::unordered_map and what is the cost of using atomic operations
+// in the uncontended use case
+// ============================================================================
+// std_map                                                      1.20ms   832.58
+// atomic_fast_map                                            511.35us    1.96K
+// fast_map                                                   196.28us    5.09K
+// ============================================================================
+
+BENCHMARK(std_map) {
+  std::unordered_map<long, long> m;
+  m.reserve(10000);
+  for (int i=0; i<10000; ++i) {
+    m.emplace(i,i);
+  }
+
+  for (int i=0; i<10000; ++i) {
+    auto a = m.find(i);
+    folly::doNotOptimizeAway(&*a);
+  }
+}
+
+BENCHMARK(atomic_fast_map) {
+  UnorderedInsertMap<long, long, std::atomic> m(10000);
+  for (int i=0; i<10000; ++i) {
+    m.emplace(i,i);
+  }
+
+  for (int i=0; i<10000; ++i) {
+    auto a = m.find(i);
+    folly::doNotOptimizeAway(&*a);
+  }
+}
+
+BENCHMARK(fast_map) {
+  UnorderedInsertMap<long, long> m(10000);
+  for (int i=0; i<10000; ++i) {
+    m.emplace(i,i);
+  }
+
+  for (int i=0; i<10000; ++i) {
+    auto a = m.find(i);
+    folly::doNotOptimizeAway(&*a);
+  }
+}
+
+
+int main(int argc, char ** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  int rv = RUN_ALL_TESTS();
+  folly::runBenchmarksOnFlag();
+  return rv;
+}
-- 
2.34.1
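
For a quick end-to-end illustration of the API added by this patch (a sketch,
not part of the patch itself; it assumes a Linux build where the default
folly::detail::MMapAlloc allocator is usable):

    #include <folly/AtomicUnorderedMap.h>

    #include <cstdio>
    #include <string>

    int main() {
      // capacity is fixed up front; the map never resizes
      folly::AtomicUnorderedInsertMap<int, std::string> memo(1000);

      // findOrConstruct placement-constructs the value only if the key is new
      auto result = memo.findOrConstruct(42, [](void* raw) {
        new (raw) std::string("forty-two");
      });
      std::printf("inserted=%d value=%s\n",
                  int(result.second), result.first->second.c_str());

      // emplace is a thin wrapper around findOrConstruct
      memo.emplace(7, std::string("seven"));

      // iteration is safe while other threads insert, and iterators
      // are never invalidated
      for (auto it = memo.cbegin(); it != memo.cend(); ++it) {
        std::printf("%d -> %s\n", it->first, it->second.c_str());
      }
      return 0;
    }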