--- /dev/null
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef FOLLY_ATOMICUNORDEREDMAP_H
+#define FOLLY_ATOMICUNORDEREDMAP_H
+
+#include <atomic>
+#include <functional>
+#include <stdexcept>
+#include <system_error>
+#include <type_traits>
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <folly/Likely.h>
+#include <folly/Bits.h>
+#include <folly/Conv.h>
+#include <folly/Random.h>
+#include <folly/detail/AtomicUnorderedMapUtils.h>
+#include <boost/type_traits/has_trivial_destructor.hpp>
+#include <limits>
+
+namespace folly {
+
+/// You're probably reading this because you are looking for an
+/// AtomicUnorderedMap<K,V> that is fully general, highly concurrent (for
+/// reads, writes, and iteration), and makes no performance compromises.
+/// We haven't figured that one out yet. What you will find here is a
+/// hash table implementation that sacrifices generality so that it can
+/// give you all of the other things.
+///
+/// LIMITATIONS:
+///
+/// * Insert only (*) - the only write operation supported directly by
+/// AtomicUnorderedInsertMap is findOrConstruct. There is a (*) because
+/// values aren't moved, so you can roll your own concurrency control for
+/// in-place updates of values (see MutableData and MutableAtom below),
+/// but the hash table itself doesn't help you.
+///
+/// * No resizing - you must specify the capacity up front, and once
+/// the hash map gets full you won't be able to insert. Insert
+/// performance will degrade once the load factor is high. Insert is
+/// O(1/(1-actual_load_factor)). Note that this is a pretty strong
+/// limitation, because you can't remove existing keys.
+///
+/// * 2^30 maximum capacity - you'll need to use something else if you
+/// have more than a billion entries. If this limit bothers you, it
+/// wouldn't be too hard to parameterize the internal indexes between
+/// uint32_t and uint64_t.
+///
+/// WHAT YOU GET IN EXCHANGE:
+///
+/// * Arbitrary key and value types - any K and V that can be used in a
+/// std::unordered_map can be used here. In fact, the key and value
+/// types don't even have to be copyable or moveable!
+///
+/// * Keys and values in the map won't be moved - it is safe to keep
+/// pointers or references to the keys and values in the map, because
+/// they are never moved or destroyed (until the map itself is destroyed).
+///
+/// * Iterators are never invalidated - writes don't invalidate iterators,
+/// so you can scan and insert in parallel.
+///
+/// * Fast wait-free reads - reads are usually only a single cache miss,
+/// even when the hash table is very large. Wait-freedom means that
+/// you won't see latency outliers even in the face of concurrent writes.
+///
+/// * Lock-free insert - writes proceed in parallel. If a thread in the
+/// middle of a write is unlucky and gets suspended, it doesn't block
+/// anybody else.
+///
+/// COMMENTS ON INSERT-ONLY
+///
+/// This map provides wait-free linearizable reads and lock-free
+/// linearizable inserts. Inserted values won't be moved, but no
+/// concurrency control is provided for safely updating them. To remind
+/// you of that fact they are only provided in const form. This is the
+/// only simple safe thing to do while preserving something like the normal
+/// std::map iteration form, which requires that iteration be exposed
+/// via std::pair (and prevents encapsulation of access to the value).
+///
+/// There are a couple of reasonable policies for doing in-place
+/// concurrency control on the values. I am hoping that the policy can
+/// be injected via the value type or an extra template param, to keep
+/// the core AtomicUnorderedInsertMap insert-only:
+///
+/// CONST: this is the currently implemented strategy, which is simple,
+/// performant, and not that expressive. You can always put in a value
+/// with a mutable field (see MutableAtom below), but that doesn't look
+/// as pretty as it should.
+///
+/// ATOMIC: for integers and integer-size trivially copyable structs
+/// (via an adapter like tao/queues/AtomicStruct) the value can be a
+/// std::atomic and read and written atomically.
+///
+/// SEQ-LOCK: attach a counter incremented before and after write.
+/// Writers serialize by using CAS to make an even->odd transition,
+/// then odd->even after the write. Readers grab the value with memcpy,
+/// checking sequence value before and after. Readers retry until they
+/// see an even sequence number that doesn't change. This works for
+/// larger structs, but still requires memcpy to be equivalent to copy
+/// assignment, and it is no longer lock-free. It scales very well,
+/// because the readers are still invisible (no cache line writes).
+///
+/// LOCK: folly's SharedMutex would be a good choice here.
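+///
+/// For instance, the ATOMIC policy can already be approximated with the
+/// MutableAtom wrapper defined at the bottom of this file (a sketch, not a
+/// separate policy implementation):
+///
+/// AtomicUnorderedInsertMap<int,MutableAtom<int>> counts(1000);
+/// counts.emplace(10, 1);
+/// counts.find(10)->second.data.fetch_add(1); // atomic in-place update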
+///
+/// MEMORY ALLOCATION
+///
+/// Underlying memory is allocated as a big anonymous mmap chunk, which
+/// might be cheaper than calloc() and is certainly not more expensive
+/// for large maps. If the SkipKeyValueDeletion template param is true
+/// then deletion of the map consists of unmapping the backing memory,
+/// which is much faster than destructing all of the keys and values.
+/// Feel free to override SkipKeyValueDeletion if boost::has_trivial_destructor
+/// isn't recognizing the triviality of your destructors.
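+///
+/// For example, to force the skip for hypothetical types whose destructors
+/// you know are effectively trivial (a sketch, with MyKey and MyValue
+/// standing in for your own types):
+///
+/// AtomicUnorderedInsertMap<MyKey,MyValue,std::hash<MyKey>,
+/// std::equal_to<MyKey>,
+/// /* SkipKeyValueDeletion = */ true> m(1000);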
+template <typename Key,
+ typename Value,
+ typename Hash = std::hash<Key>,
+ typename KeyEqual = std::equal_to<Key>,
+ bool SkipKeyValueDeletion =
+ (boost::has_trivial_destructor<Key>::value &&
+ boost::has_trivial_destructor<Value>::value),
+ template<typename> class Atom = std::atomic,
+ typename Allocator = folly::detail::MMapAlloc>
+
+struct AtomicUnorderedInsertMap {
+
+ typedef Key key_type;
+ typedef Value mapped_type;
+ typedef std::pair<Key,Value> value_type;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+ typedef Hash hasher;
+ typedef KeyEqual key_equal;
+ typedef const value_type& const_reference;
+
+ typedef struct ConstIterator {
+ ConstIterator(const AtomicUnorderedInsertMap& owner, uint32_t slot)
+ : owner_(owner)
+ , slot_(slot)
+ {}
+
+ ConstIterator(const ConstIterator&) = default;
+ ConstIterator& operator= (const ConstIterator&) = default;
+
+ const value_type& operator* () const {
+ return owner_.slots_[slot_].keyValue();
+ }
+
+ const value_type* operator-> () const {
+ return &owner_.slots_[slot_].keyValue();
+ }
+
+ // pre-increment
+ const ConstIterator& operator++ () {
+ while (slot_ > 0) {
+ --slot_;
+ if (owner_.slots_[slot_].state() == LINKED) {
+ break;
+ }
+ }
+ return *this;
+ }
+
+ // post-increment
+ ConstIterator operator++ (int dummy) {
+ auto prev = *this;
+ ++*this;
+ return prev;
+ }
+
+ bool operator== (const ConstIterator& rhs) const {
+ return slot_ == rhs.slot_;
+ }
+ bool operator!= (const ConstIterator& rhs) const {
+ return !(*this == rhs);
+ }
+
+ private:
+ const AtomicUnorderedInsertMap& owner_;
+ uint32_t slot_;
+ } const_iterator;
+
+ friend ConstIterator;
+
+ /// Constructs a map that will support the insertion of maxSize
+ /// key-value pairs without exceeding the max load factor. Load
+ /// factors of greater than 1 are not supported, and once the actual load
+ /// factor of the map approaches 1 the insert performance will suffer.
+ /// The capacity is limited to 2^30 (about a billion), beyond which
+ /// we will throw invalid_argument.
+ explicit AtomicUnorderedInsertMap(
+ size_t maxSize,
+ float maxLoadFactor = 0.8f,
+ const Allocator& alloc = Allocator())
+ : allocator_(alloc)
+ {
+ // divide by the load factor (clamped to at most 1) to leave headroom
+ size_t capacity = size_t(maxSize / std::min(1.0f, maxLoadFactor) + 128);
+ if (capacity > (1 << 30) && maxSize < (1 << 30)) {
+ // we'll do our best
+ capacity = (1 << 30);
+ }
+ if (capacity < maxSize || capacity > (1 << 30)) {
+ throw std::invalid_argument(
+ "AtomicUnorderedInsertMap capacity must fit in 30 bits");
+ }
+
+ numSlots_ = capacity;
+ slotMask_ = folly::nextPowTwo(capacity * 4) - 1;
+ mmapRequested_ = sizeof(Slot) * capacity;
+ slots_ = reinterpret_cast<Slot*>(allocator_.allocate(mmapRequested_));
+ zeroFillSlots();
+ // mark the zero-th slot as in-use but not valid, since that happens
+ // to be our nil value
+ slots_[0].stateUpdate(EMPTY, CONSTRUCTING);
+ }
+
+ ~AtomicUnorderedInsertMap() {
+ if (!SkipKeyValueDeletion) {
+ for (size_t i = 1; i < numSlots_; ++i) {
+ slots_[i].~Slot();
+ }
+ }
+ allocator_.deallocate(reinterpret_cast<char*>(slots_), mmapRequested_);
+ }
+
+ /// Searches for the key, returning (iter,false) if it is found.
+ /// If it is not found calls the functor Func with a void* argument
+ /// that is raw storage suitable for placement construction of a Value
+ /// (see raw_value_type), then returns (iter,true). May call Func and
+ /// then return (iter,false) if there are other concurrent writes, in
+ /// which case the newly constructed value will be immediately destroyed.
+ ///
+ /// This function does not block other readers or writers. If there
+ /// are other concurrent writes, many parallel calls to func may happen
+ /// and only the first one to complete will win. The values constructed
+ /// by the other calls to func will be destroyed.
+ ///
+ /// Usage:
+ ///
+ /// AtomicUnorderedInsertMap<std::string,std::string> memo;
+ ///
+ /// auto value = memo.findOrConstruct(key, [=](void* raw) {
+ /// new (raw) std::string(computation(key));
+ /// })->first;
+ template<typename Func>
+ std::pair<const_iterator,bool> findOrConstruct(const Key& key, Func&& func) {
+ auto const slot = keyToSlotIdx(key);
+ auto prev = slots_[slot].headAndState_.load(std::memory_order_acquire);
+
+ auto existing = find(key, slot);
+ if (existing != 0) {
+ return std::make_pair(ConstIterator(*this, existing), false);
+ }
+
+ auto idx = allocateNear(slot);
+ new (&slots_[idx].keyValue().first) Key(key);
+ func(static_cast<void*>(&slots_[idx].keyValue().second));
+
+ while (true) {
+ slots_[idx].next_ = prev >> 2;
+
+ // we can merge the head update and the CONSTRUCTING -> LINKED update
+ // into a single CAS if slot == idx (which should happen often)
+ auto after = idx << 2;
+ if (slot == idx) {
+ after += LINKED;
+ } else {
+ after += (prev & 3);
+ }
+
+ if (slots_[slot].headAndState_.compare_exchange_strong(prev, after)) {
+ // success
+ if (idx != slot) {
+ slots_[idx].stateUpdate(CONSTRUCTING, LINKED);
+ }
+ return std::make_pair(ConstIterator(*this, idx), true);
+ }
+ // compare_exchange_strong updates its first arg on failure, so
+ // there is no need to reread prev
+
+ existing = find(key, slot);
+ if (existing != 0) {
+ // our allocated key and value are no longer needed
+ slots_[idx].keyValue().first.~Key();
+ slots_[idx].keyValue().second.~Value();
+ slots_[idx].stateUpdate(CONSTRUCTING, EMPTY);
+
+ return std::make_pair(ConstIterator(*this, existing), false);
+ }
+ }
+ }
+
+ /// This isn't really emplace, but it is what we need to test.
+ /// Eventually we can duplicate all of the std::pair constructor
+ /// forms, including a recursive tuple forwarding template
+ /// (see http://functionalcpp.wordpress.com/2013/08/28/tuple-forwarding/).
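+ ///
+ /// A minimal usage sketch (each call forwards a single argument to the
+ /// Value constructor):
+ ///
+ /// AtomicUnorderedInsertMap<int,std::string> m(10);
+ /// auto result = m.emplace(1, std::string("one"));
+ /// // result.second is true iff this call constructed the value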
+ template<class K, class V>
+ std::pair<const_iterator,bool> emplace(const K& key, V&& value) {
+ return findOrConstruct(key, [&](void* raw) {
+ new (raw) Value(std::forward<V>(value));
+ });
+ }
+
+ const_iterator find(const Key& key) const {
+ return ConstIterator(*this, find(key, keyToSlotIdx(key)));
+ }
+
+ const_iterator cbegin() const {
+ uint32_t slot = numSlots_ - 1;
+ while (slot > 0 && slots_[slot].state() != LINKED) {
+ --slot;
+ }
+ return ConstIterator(*this, slot);
+ }
+
+ const_iterator cend() const {
+ return ConstIterator(*this, 0);
+ }
+
+ private:
+
+ enum {
+ kMaxAllocationTries = 1000, // after this we throw
+ };
+
+ enum BucketState : uint32_t {
+ EMPTY = 0,
+ CONSTRUCTING = 1,
+ LINKED = 2,
+ };
+
+ /// Lock-free insertion is easiest by prepending to collision chains.
+ /// A large chaining hash table takes two cache misses instead of
+ /// one, however. Our solution is to colocate the bucket storage and
+ /// the head storage, so that even though we are traversing chains we
+ /// are likely to stay within the same cache line. Just make sure to
+ /// traverse head before looking at any keys. This strategy gives us
+ /// 32 bit pointers and fast iteration.
+ struct Slot {
+ /// The bottom two bits are the BucketState, the rest is the index
+ /// of the first bucket for the chain whose keys map to this slot.
+ /// When things are going well the head usually links to this slot,
+ /// but that doesn't always have to happen.
+ Atom<uint32_t> headAndState_;
+
+ /// The next bucket in the chain
+ uint32_t next_;
+
+ /// Key and Value
+ typename std::aligned_storage<sizeof(value_type),
+ alignof(value_type)>::type raw_;
+
+
+ ~Slot() {
+ auto s = state();
+ assert(s == EMPTY || s == LINKED);
+ if (s == LINKED) {
+ keyValue().first.~Key();
+ keyValue().second.~Value();
+ }
+ }
+
+ BucketState state() const {
+ return BucketState(headAndState_.load(std::memory_order_acquire) & 3);
+ }
+
+ void stateUpdate(BucketState before, BucketState after) {
+ assert(state() == before);
+ headAndState_ += (after - before);
+ }
+
+ value_type& keyValue() {
+ assert(state() != EMPTY);
+ return *static_cast<value_type*>(static_cast<void*>(&raw_));
+ }
+
+ const value_type& keyValue() const {
+ assert(state() != EMPTY);
+ return *static_cast<const value_type*>(static_cast<const void*>(&raw_));
+ }
+
+ };
+
+ // We manually manage the slot memory so we can bypass initialization
+ // (by getting a zero-filled mmap chunk) and optionally destruction of
+ // the slots
+
+ size_t mmapRequested_;
+ size_t numSlots_;
+
+ /// tricky, see keyToSlotIdx
+ size_t slotMask_;
+
+ Allocator allocator_;
+ Slot* slots_;
+
+ uint32_t keyToSlotIdx(const Key& key) const {
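+ // slotMask_ is nextPowTwo(numSlots_ * 4) - 1, so masking the hash yields
+ // a value less than roughly 8 * numSlots_; the subtraction loop below
+ // folds that into [0, numSlots_) in at most a few iterations, avoiding a
+ // hardware division at a slight cost in uniformity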
+ size_t h = hasher()(key);
+ h &= slotMask_;
+ while (h >= numSlots_) {
+ h -= numSlots_;
+ }
+ return h;
+ }
+
+ uint32_t find(const Key& key, uint32_t slot) const {
+ KeyEqual ke = {};
+ auto hs = slots_[slot].headAndState_.load(std::memory_order_acquire);
+ for (slot = hs >> 2; slot != 0; slot = slots_[slot].next_) {
+ if (ke(key, slots_[slot].keyValue().first)) {
+ return slot;
+ }
+ }
+ return 0;
+ }
+
+ /// Allocates a slot and returns its index. Tries to put it near
+ /// slots_[start].
+ uint32_t allocateNear(uint32_t start) {
+ for (auto tries = 0; tries < kMaxAllocationTries; ++tries) {
+ auto slot = allocationAttempt(start, tries);
+ auto prev = slots_[slot].headAndState_.load(std::memory_order_acquire);
+ if ((prev & 3) == EMPTY &&
+ slots_[slot].headAndState_.compare_exchange_strong(
+ prev, prev + CONSTRUCTING - EMPTY)) {
+ return slot;
+ }
+ }
+ throw std::bad_alloc();
+ }
+
+ /// Returns the slot we should attempt to allocate after `tries` failed
+ /// attempts, starting from the specified slot. This is pulled out so we
+ /// can specialize it differently during deterministic testing.
+ uint32_t allocationAttempt(uint32_t start, uint32_t tries) const {
+ if (LIKELY(tries < 8 && start + tries < numSlots_)) {
+ return start + tries;
+ } else {
+ uint32_t rv = folly::Random::rand32(numSlots_);
+ assert(rv < numSlots_);
+ return rv;
+ }
+ }
+
+ void zeroFillSlots() {
+ using folly::detail::GivesZeroFilledMemory;
+ if (!GivesZeroFilledMemory<Allocator>::value) {
+ memset(slots_, 0, mmapRequested_);
+ }
+ }
+};
+
+
+/// MutableAtom is a tiny wrapper that gives you the option of atomically
+/// updating values inserted into an AtomicUnorderedInsertMap<K,
+/// MutableAtom<V>>. This relies on AtomicUnorderedInsertMap's guarantee
+/// that it doesn't move values.
+template <typename T,
+ template<typename> class Atom = std::atomic>
+struct MutableAtom {
+ mutable Atom<T> data;
+
+ explicit MutableAtom(const T& init) : data(init) {}
+};
+
+/// MutableData is a tiny wrapper that gives you the option of using an
+/// external concurrency control mechanism to update values inserted
+/// into an AtomicUnorderedInsertMap.
+template <typename T>
+struct MutableData {
+ mutable T data;
+ explicit MutableData(const T& init) : data(init) {}
+};
+
+
+} // namespace folly
+#endif
--- /dev/null
+/*
+ * Copyright 2015 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/AtomicUnorderedMap.h>
+#include <folly/test/DeterministicSchedule.h>
+#include <thread>
+#include <semaphore.h>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <folly/Benchmark.h>
+#include <unordered_map>
+
+using namespace folly;
+using namespace folly::test;
+
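+// non_atomic mimics just enough of the std::atomic<T> interface to be
+// plugged in as the Atom template parameter of AtomicUnorderedInsertMap,
+// so the single-threaded benchmarks below can measure the cost of the
+// real atomic operations.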
+template<class T>
+struct non_atomic {
+ T value;
+
+ non_atomic() = default;
+ non_atomic(const non_atomic&) = delete;
+ constexpr /* implicit */ non_atomic(T desired): value(desired) {}
+
+ T operator+=(T arg) { value += arg; return load();}
+
+ T load(std::memory_order order = std::memory_order_seq_cst) const {
+ return value;
+ }
+
+ /* implicit */
+ operator T() const {return load();}
+
+ void store(T desired, std::memory_order order = std::memory_order_seq_cst) {
+ value = desired;
+ }
+
+ T exchange(T desired, std::memory_order order = std::memory_order_seq_cst) {
+ T old = load();
+ store(desired);
+ return old;
+ }
+
+ bool compare_exchange_weak(
+ T& expected, T desired,
+ std::memory_order success = std::memory_order_seq_cst,
+ std::memory_order failure = std::memory_order_seq_cst) {
+ if (value == expected) {
+ value = desired;
+ return true;
+ }
+
+ expected = value;
+ return false;
+ }
+
+ bool compare_exchange_strong(
+ T& expected, T desired,
+ std::memory_order success = std::memory_order_seq_cst,
+ std::memory_order failure = std::memory_order_seq_cst) {
+ if (value == expected) {
+ value = desired;
+ return true;
+ }
+
+ expected = value;
+ return false;
+ }
+
+ bool is_lock_free() const {return true;}
+};
+
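+// Alias that exposes the Atom parameter, defaulting to the non_atomic shim
+// above; fast_map and atomic_fast_map below use it to compare the two.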
+template<
+ typename Key, typename Value, template<typename> class Atom = non_atomic>
+using UnorderedInsertMap = AtomicUnorderedInsertMap<
+ Key,
+ Value,
+ std::hash<Key>,
+ std::equal_to<Key>,
+ (boost::has_trivial_destructor<Key>::value &&
+ boost::has_trivial_destructor<Value>::value),
+ Atom,
+ std::allocator<char>>;
+
+TEST(AtomicUnorderedInsertMap, basic) {
+ AtomicUnorderedInsertMap<std::string,std::string> m(100);
+
+ m.emplace("abc", "ABC");
+ EXPECT_TRUE(m.find("abc") != m.cend());
+ EXPECT_EQ(m.find("abc")->first, "abc");
+ EXPECT_EQ(m.find("abc")->second, "ABC");
+ EXPECT_TRUE(m.find("def") == m.cend());
+ auto iter = m.cbegin();
+ EXPECT_TRUE(iter != m.cend());
+ EXPECT_TRUE(iter == m.find("abc"));
+ auto a = iter;
+ EXPECT_TRUE(a == iter);
+ auto b = iter;
+ ++iter;
+ EXPECT_TRUE(iter == m.cend());
+ EXPECT_TRUE(a == b);
+ EXPECT_TRUE(a != iter);
+ a++;
+ EXPECT_TRUE(a == iter);
+ EXPECT_TRUE(a != b);
+}
+
+TEST(AtomicUnorderedInsertMap, value_mutation) {
+ AtomicUnorderedInsertMap<int, MutableAtom<int>> m(100);
+
+ for (int i = 0; i < 50; ++i) {
+ m.emplace(i, i);
+ }
+
+ m.find(1)->second.data++;
+}
+
+TEST(UnorderedInsertMap, value_mutation) {
+ UnorderedInsertMap<int, MutableData<int>> m(100);
+
+ for (int i = 0; i < 50; ++i) {
+ m.emplace(i, i);
+ }
+
+ m.find(1)->second.data++;
+ EXPECT_EQ(m.find(1)->second.data, 2);
+}
+
+BENCHMARK(lookup_int_int_hit, iters) {
+ std::unique_ptr<AtomicUnorderedInsertMap<int,size_t>> ptr = {};
+
+ size_t capacity = 100000;
+
+ BENCHMARK_SUSPEND {
+ ptr.reset(new AtomicUnorderedInsertMap<int,size_t>(capacity));
+ for (size_t i = 0; i < capacity; ++i) {
+ auto k = 3 * ((5641 * i) % capacity);
+ ptr->emplace(k, k + 1);
+ EXPECT_EQ(ptr->find(k)->second, k + 1);
+ }
+ }
+
+ for (size_t i = 0; i < iters; ++i) {
+ size_t k = 3 * (((i * 7919) ^ (i * 4001)) % capacity);
+ auto iter = ptr->find(k);
+ if (iter == ptr->cend() ||
+ iter->second != k + 1) {
+ auto jter = ptr->find(k);
+ EXPECT_TRUE(iter == jter);
+ }
+ EXPECT_EQ(iter->second, k + 1);
+ }
+
+ BENCHMARK_SUSPEND {
+ ptr.reset(nullptr);
+ }
+}
+
+struct PairHash {
+ size_t operator()(const std::pair<uint64_t,uint64_t>& pr) const {
+ return pr.first ^ pr.second;
+ }
+};
+
+void contendedRW(size_t itersPerThread,
+ size_t capacity,
+ size_t numThreads,
+ size_t readsPerWrite) {
+ typedef std::pair<uint64_t,uint64_t> Key;
+ typedef AtomicUnorderedInsertMap<Key,MutableAtom<uint32_t>,PairHash> Map;
+
+ std::unique_ptr<Map> ptr = {};
+ std::atomic<bool> go(false);
+ std::vector<std::thread> threads;
+
+ BENCHMARK_SUSPEND {
+ ptr.reset(new Map(capacity));
+ while (threads.size() < numThreads) {
+ threads.emplace_back([&](){
+ while (!go) {
+ std::this_thread::yield();
+ }
+
+ size_t reads = 0;
+ size_t writes = 0;
+ while (reads + writes < itersPerThread) {
+ auto r = Random::rand32();
+ Key key(reads + writes, r);
+ if (reads < writes * readsPerWrite ||
+ writes >= capacity / numThreads) {
+ // read needed
+ ++reads;
+ auto iter = ptr->find(key);
+ EXPECT_TRUE(
+ iter == ptr->cend() ||
+ iter->second.data.load(std::memory_order_acquire) >= key.first);
+ } else {
+ ++writes;
+ try {
+ auto pr = ptr->emplace(key, key.first);
+ if (!pr.second) {
+ pr.first->second.data++;
+ }
+ } catch (const std::bad_alloc&) {
+ LOG(INFO) << "bad alloc";
+ }
+ }
+ }
+ });
+ }
+ }
+
+ go = true;
+
+ for (auto& thr : threads) {
+ thr.join();
+ }
+
+ BENCHMARK_SUSPEND {
+ ptr.reset(nullptr);
+ }
+}
+
+// sudo nice -n -20 ~/fbcode/_bin/common/concurrency/experimental/atomic_unordered_map --benchmark --bm_min_iters=1000000
+//
+// without MAP_HUGETLB (default)
+//
+// ============================================================================
+// common/concurrency/experimental/AtomicUnorderedMapTest.cpp relative time/iter iters/s
+// ============================================================================
+// lookup_int_int_hit 20.05ns 49.89M
+// contendedRW(small_32thr_99pct) 70.36ns 14.21M
+// contendedRW(large_32thr_99pct) 164.23ns 6.09M
+// contendedRW(large_32thr_99_9pct) 158.81ns 6.30M
+// ============================================================================
+//
+// with MAP_HUGETLB hacked in
+// ============================================================================
+// lookup_int_int_hit 19.67ns 50.84M
+// contendedRW(small_32thr_99pct) 62.46ns 16.01M
+// contendedRW(large_32thr_99pct) 119.41ns 8.37M
+// contendedRW(large_32thr_99_9pct) 111.23ns 8.99M
+// ============================================================================
+BENCHMARK_NAMED_PARAM(contendedRW, small_32thr_99pct, 100000, 32, 99)
+BENCHMARK_NAMED_PARAM(contendedRW, large_32thr_99pct, 100000000, 32, 99)
+BENCHMARK_NAMED_PARAM(contendedRW, large_32thr_99_9pct, 100000000, 32, 999)
+
+BENCHMARK_DRAW_LINE();
+
+// sudo nice -n -20 ~/fbcode/_build/opt/site_integrity/quasar/experimental/atomic_unordered_map_test --benchmark --bm_min_iters=10000
+// Single threaded benchmarks to test how much better we are than
+// std::unordered_map and what is the cost of using atomic operations
+// in the uncontended use case
+// ============================================================================
+// std_map 1.20ms 832.58
+// atomic_fast_map 511.35us 1.96K
+// fast_map 196.28us 5.09K
+// ============================================================================
+
+BENCHMARK(std_map) {
+ std::unordered_map<long, long> m;
+ m.reserve(10000);
+ for (int i=0; i<10000; ++i) {
+ m.emplace(i,i);
+ }
+
+ for (int i=0; i<10000; ++i) {
+ auto a = m.find(i);
+ folly::doNotOptimizeAway(&*a);
+ }
+}
+
+BENCHMARK(atomic_fast_map) {
+ UnorderedInsertMap<long, long, std::atomic> m(10000);
+ for (int i=0; i<10000; ++i) {
+ m.emplace(i,i);
+ }
+
+ for (int i=0; i<10000; ++i) {
+ auto a = m.find(i);
+ folly::doNotOptimizeAway(&*a);
+ }
+}
+
+BENCHMARK(fast_map) {
+ UnorderedInsertMap<long, long> m(10000);
+ for (int i=0; i<10000; ++i) {
+ m.emplace(i,i);
+ }
+
+ for (int i=0; i<10000; ++i) {
+ auto a = m.find(i);
+ folly::doNotOptimizeAway(&*a);
+ }
+}
+
+
+int main(int argc, char ** argv) {
+ testing::InitGoogleTest(&argc, argv);
+ google::ParseCommandLineFlags(&argc, &argv, true);
+ int rv = RUN_ALL_TESTS();
+ folly::runBenchmarksOnFlag();
+ return rv;
+}