Flat Combining
authorMaged Michael <magedmichael@fb.com>
Thu, 23 Mar 2017 14:13:23 +0000 (07:13 -0700)
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>
Thu, 23 Mar 2017 14:15:20 +0000 (07:15 -0700)
Summary:
Flat combining template that takes the following template parameters:
 T         Concurrent data structure using FC interface
 Mutex Mutex type (default std::mutex)
 Atom  Atomic template (default std::atomic)
 Req    Optional request structure to hold custom info (default dummy type bool)

Flat combining (FC) was introduced in the SPAA 2010 paper Flat Combining and the Synchronization-Parallelism Tradeoff, by Danny Hendler, Itai Incze, Nir Shavit, and Moran Tzafrir.
http://mcg.cs.tau.ac.il/projects/projects/flat-combining

Reviewed By: djwatson

Differential Revision: D4602402

fbshipit-source-id: 38327f752a3e92bb01e5496c321d8c87c818087a

folly/experimental/flat_combining/FlatCombining.h [new file with mode: 0644]
folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpp [new file with mode: 0644]
folly/experimental/flat_combining/test/FlatCombiningExamples.h [new file with mode: 0644]
folly/experimental/flat_combining/test/FlatCombiningTest.cpp [new file with mode: 0644]
folly/experimental/flat_combining/test/FlatCombiningTestHelpers.h [new file with mode: 0644]

diff --git a/folly/experimental/flat_combining/FlatCombining.h b/folly/experimental/flat_combining/FlatCombining.h
new file mode 100644 (file)
index 0000000..9c03350
--- /dev/null
@@ -0,0 +1,658 @@
+/*
+ * Copyright 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/Baton.h>
+#include <folly/Function.h>
+#include <folly/IndexedMemPool.h>
+#include <folly/Portability.h>
+#include <folly/detail/CacheLocality.h>
+
+#include <atomic>
+#include <cassert>
+#include <mutex>
+
+namespace folly {
+
+/// Flat combining (FC) was introduced in the SPAA 2010 paper Flat
+/// Combining and the Synchronization-Parallelism Tradeoff, by Danny
+/// Hendler, Itai Incze, Nir Shavit, and Moran Tzafrir.
+/// http://mcg.cs.tau.ac.il/projects/projects/flat-combining
+///
+/// FC is an alternative to coarse-grained locking for making
+/// sequential data structures thread-safe while minimizing the
+/// synchroniation overheads and cache coherence traffic associated
+/// with locking.
+///
+/// Under FC, when a thread finds the lock contended, it can
+/// request (using a request record) that the lock holder execute its
+/// operation on the shared data structure. There can be a designated
+/// combiner thread or any thread can act as the combiner when it
+/// holds the lock.
+///
+/// Potential advantages of FC include:
+/// - Reduced cache coherence traffic
+/// - Reduced synchronization overheads, as the overheads of releasing
+///   and acquiring the lock are eliminated from the critical path of
+///   operating on the data structure.
+/// - Opportunities for smart combining, where executing multiple
+///   operations together may take less time than executng the
+///   operations separately, e.g., K delete_min operations on a
+///   priority queue may be combined to take O(K + log N) time instead
+///   of O(K * log N).
+///
+/// This implementation of flat combining supports:
+
+/// - A simple interface that requires minimal extra code by the
+///   user. To use this interface efficiently the user-provided
+///   functions must be copyable to folly::Functio without dynamic
+///   allocation. If this is impossible or inconvenient, the user is
+///   encouraged to use the custom interface described below.
+/// - A custom interface that supports custom combinining and custom
+///   request structure, either for the sake of smart combining or for
+///   efficiently supporting operations that are not be copyable to
+///   folly::Function without synamic allocation.
+/// - Both synchronous and asynchronous operations.
+/// - Request records with and without thread-caching.
+/// - Combining with and without a dedicated combiner thread.
+///
+/// This implementation differs from the algorithm in the SPAA 2010 paper:
+/// - It does not require thread caching of request records
+/// - It supports a dedicated combiner
+/// - It supports asynchronous operations
+///
+/// The generic FC class template supports generic data structures and
+/// utilities with arbitrary operations. The template supports static
+/// polymorphism for the combining function to enable custom smart
+/// combining.
+///
+/// A simple example of using the FC template:
+///   class ConcurrentFoo : public FlatCombining<ConcurrentFoo> {
+///     Foo foo_; // sequential data structure
+///    public:
+///     T bar(V v) { // thread-safe execution of foo_.bar(v)
+///       T result;
+///       // Note: fn must be copyable to folly::Function without dynamic
+///       // allocation. Otherwise, it is recommended to use the custom
+///       // interface and manage the function arguments and results
+///       // explicitly in a custom request structure.
+///       auto fn = [&] { result = foo_.bar(v); };
+///       this->requestFC(fn);
+///       return result;
+///     }
+///   };
+///
+/// See test/FlatCombiningExamples.h for more examples. See the
+/// comments for requestFC() below for a list of simple and custom
+/// variants of that function.
+
+template <
+    typename T, // concurrent data structure using FC interface
+    typename Mutex = std::mutex,
+    template <typename> class Atom = std::atomic,
+    typename Req = /* default dummy type */ bool>
+class FlatCombining {
+  using SavedFn = folly::Function<void()>;
+
+ public:
+  /// Combining request record.
+  class Rec {
+    FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
+    folly::Baton<Atom, true, false> valid_;
+    folly::Baton<Atom, true, false> done_;
+    folly::Baton<Atom, true, false> disconnected_;
+    size_t index_;
+    size_t next_;
+    uint64_t last_;
+    Req req_;
+    SavedFn fn_;
+
+   public:
+    Rec() {
+      setDone();
+      setDisconnected();
+    }
+
+    void setValid() {
+      valid_.post();
+    }
+
+    void clearValid() {
+      valid_.reset();
+    }
+
+    bool isValid() const {
+      return valid_.try_wait();
+    }
+
+    void setDone() {
+      done_.post();
+    }
+
+    void clearDone() {
+      done_.reset();
+    }
+
+    bool isDone() const {
+      return done_.try_wait();
+    }
+
+    void awaitDone() {
+      done_.wait();
+    }
+
+    void setDisconnected() {
+      disconnected_.post();
+    }
+
+    void clearDisconnected() {
+      disconnected_.reset();
+    }
+
+    bool isDisconnected() const {
+      return disconnected_.try_wait();
+    }
+
+    void setIndex(const size_t index) {
+      index_ = index;
+    }
+
+    size_t getIndex() const {
+      return index_;
+    }
+
+    void setNext(const size_t next) {
+      next_ = next;
+    }
+
+    size_t getNext() const {
+      return next_;
+    }
+
+    void setLast(const uint64_t pass) {
+      last_ = pass;
+    }
+
+    uint64_t getLast() const {
+      return last_;
+    }
+
+    Req& getReq() {
+      return req_;
+    }
+
+    template <typename Func>
+    void setFn(Func&& fn) {
+      fn_ = std::forward<Func>(fn);
+      assert(fn_);
+      // If the following assertion is triggered, the user should
+      // either change the provided function, i.e., fn, to fit in
+      // folly::Function without allocation or use the custom
+      // interface to request combining for fn and manage its
+      // arguments (and results, if any) explicitly in a custom
+      // request structure.
+      assert(!fn_.hasAllocatedMemory());
+    }
+
+    void clearFn() {
+      fn_ = {};
+      assert(!fn_);
+    }
+
+    SavedFn& getFn() {
+      return fn_;
+    }
+
+    void complete() {
+      clearValid();
+      assert(!isDone());
+      setDone();
+    }
+  };
+
+  using Pool = folly::IndexedMemPool<Rec, 32, 4, Atom, false, false>;
+
+ public:
+  /// The constructor takes three optional arguments:
+  /// - Optional dedicated combiner thread (default true)
+  /// - Number of records (if 0, then kDefaultNumRecs)
+  /// - A hint for the max. number of combined operations per
+  ///   combining session that is checked at the beginning of each pass
+  ///   on the request records (if 0, then kDefaultMaxops)
+  explicit FlatCombining(
+      const bool dedicated = true,
+      uint32_t numRecs = 0, // number of combining records
+      const uint32_t maxOps = 0 // hint of max ops per combining session
+      )
+      : numRecs_(numRecs == 0 ? kDefaultNumRecs : numRecs),
+        maxOps_(maxOps == 0 ? kDefaultMaxOps : maxOps),
+        recs_(NULL_INDEX),
+        dedicated_(dedicated),
+        recsPool_(numRecs_) {
+    if (dedicated_) {
+      // dedicated combiner thread
+      combiner_ = std::thread([this] { dedicatedCombining(); });
+    }
+  }
+
+  /// Destructor: If there is a dedicated combiner, the destructor
+  /// flags it to shutdown. Otherwise, the destructor waits for all
+  /// pending asynchronous requests to be completed.
+  ~FlatCombining() {
+    if (dedicated_) {
+      shutdown();
+      combiner_.join();
+    } else {
+      drainAll();
+    }
+  }
+
+  // Wait for all pending operations to complete. Useful primarily
+  // when there are asynchronous operations without a dedicated
+  // combiner.
+  void drainAll() {
+    for (size_t i = getRecsHead(); i != NULL_INDEX; i = nextIndex(i)) {
+      Rec& rec = recsPool_[i];
+      awaitDone(rec);
+    }
+  }
+
+  // Give the caller exclusive access.
+  void acquireExclusive() {
+    m_.lock();
+  }
+
+  // Try to give the caller exclusive access. Returns true iff successful.
+  bool tryExclusive() {
+    return m_.try_lock();
+  }
+
+  // Release exclusive access. The caller must have exclusive access.
+  void releaseExclusive() {
+    m_.unlock();
+  }
+
+  // Execute an operation without combining
+  template <typename OpFunc>
+  void requestNoFC(OpFunc& opFn) {
+    std::lock_guard<Mutex> guard(m_);
+    opFn();
+  }
+
+  // This function first tries to execute the operation without
+  // combining. If unuccessful, it allocates a combining record if
+  // needed. If there are no available records, it waits for exclusive
+  // access and executes the operation. If a record is available and
+  // ready for use, it fills the record and indicates that the request
+  // is valid for combining. If the request is synchronous (by default
+  // or necessity), it waits for the operation to be completed by a
+  // combiner and optionally extracts the result, if any.
+  //
+  // This function can be called in several forms:
+  //   Simple forms that do not require the user to define a Req structure
+  //   or to override any request processing member functions:
+  //     requestFC(opFn)
+  //     requestFC(opFn, rec) // provides its own pre-allocated record
+  //     requestFC(opFn, rec, syncop) // asynchronous if syncop == false
+  //   Custom forms that require the user to define a Req structure and to
+  //   override some request processing member functions:
+  //     requestFC(opFn, fillFn)
+  //     requestFC(opFn, fillFn, rec)
+  //     requestFC(opFn, fillFn, rec, syncop)
+  //     requestFC(opFn, fillFn, resFn)
+  //     requestFC(opFn, fillFn, resFn, rec)
+  template <typename OpFunc>
+  void requestFC(OpFunc&& opFn, Rec* rec = nullptr, bool syncop = true) {
+    auto dummy = [](Req&) {};
+    requestOp(
+        std::forward<OpFunc>(opFn),
+        dummy /* fillFn */,
+        dummy /* resFn */,
+        rec,
+        syncop,
+        false /* simple */);
+  }
+  template <typename OpFunc, typename FillFunc>
+  void requestFC(
+      OpFunc&& opFn,
+      const FillFunc& fillFn,
+      Rec* rec = nullptr,
+      bool syncop = true) {
+    auto dummy = [](Req&) {};
+    requestOp(
+        std::forward<OpFunc>(opFn),
+        fillFn,
+        dummy /* resFn */,
+        rec,
+        syncop,
+        true /* custom */);
+  }
+  template <typename OpFunc, typename FillFunc, typename ResFn>
+  void requestFC(
+      OpFunc&& opFn,
+      const FillFunc& fillFn,
+      const ResFn& resFn,
+      Rec* rec = nullptr) {
+    // must wait for result to execute resFn -- so it must be synchronous
+    requestOp(
+        std::forward<OpFunc>(opFn),
+        fillFn,
+        resFn,
+        rec,
+        true /* sync */,
+        true /* custom*/);
+  }
+
+  // Allocate a record.
+  Rec* allocRec() {
+    auto idx = recsPool_.allocIndex();
+    if (idx == NULL_INDEX) {
+      outOfSpaceCount_.fetch_add(1);
+      return nullptr;
+    }
+    Rec& rec = recsPool_[idx];
+    rec.setIndex(idx);
+    return &rec;
+  }
+
+  // Free a record
+  void freeRec(Rec* rec) {
+    if (rec == nullptr) {
+      return;
+    }
+    auto idx = rec->getIndex();
+    recsPool_.recycleIndex(idx);
+  }
+
+  // Returns a count of the number of combined operations so far.
+  uint64_t getCombinedOpCount() {
+    std::lock_guard<Mutex> guard(m_);
+    return combined_;
+  }
+
+  // Returns a count of the number of combining passes so far.
+  uint64_t getCombiningPasses() {
+    std::lock_guard<Mutex> guard(m_);
+    return passes_;
+  }
+
+  uint64_t getOutOfSpaceCount() {
+    return outOfSpaceCount_.load();
+  }
+
+ protected:
+  const size_t NULL_INDEX = 0;
+  const uint32_t kDefaultMaxOps = 100;
+  const uint64_t kDefaultNumRecs = 64;
+  const uint64_t kIdleThreshold = 10;
+
+  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
+  Mutex m_;
+
+  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
+  folly::Baton<Atom, false, true> pending_;
+  Atom<bool> shutdown_{false};
+
+  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
+  uint32_t numRecs_;
+  uint32_t maxOps_;
+  Atom<size_t> recs_;
+  bool dedicated_;
+  std::thread combiner_;
+  Pool recsPool_;
+
+  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
+  uint64_t combined_ = 0;
+  uint64_t passes_ = 0;
+  uint64_t sessions_ = 0;
+  Atom<uint64_t> outOfSpaceCount_{0};
+
+  template <typename OpFunc, typename FillFunc, typename ResFn>
+  void requestOp(
+      OpFunc&& opFn,
+      const FillFunc& fillFn,
+      const ResFn& resFn,
+      Rec* rec,
+      bool syncop,
+      const bool custom) {
+    std::unique_lock<Mutex> l(this->m_, std::defer_lock);
+    if (l.try_lock()) {
+      // No contention
+      tryCombining();
+      opFn();
+      return;
+    }
+
+    // Try FC
+    bool tc = (rec != nullptr);
+    if (!tc) {
+      // if an async op doesn't have a thread-cached record then turn
+      // it into a synchronous op.
+      syncop = true;
+      rec = allocRec();
+    }
+    if (rec == nullptr) {
+      // Can't use FC - Must acquire lock
+      l.lock();
+      opFn();
+      return;
+    }
+
+    // Use FC
+    // Wait if record is in use
+    awaitDone(*rec);
+    rec->clearDone();
+    // Fill record
+    if (custom) {
+      // Fill the request (custom)
+      Req& req = rec->getReq();
+      fillFn(req);
+      rec->clearFn();
+    } else {
+      rec->setFn(std::forward<OpFunc>(opFn));
+    }
+    // Indicate that record is valid
+    assert(!rec->isValid());
+    rec->setValid();
+    // end of combining critical path
+    setPending();
+    // store-load order setValid before isDisconnected
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+    if (rec->isDisconnected()) {
+      rec->clearDisconnected();
+      pushRec(rec->getIndex());
+      setPending();
+    }
+    // If synchronous wait for the request to be completed
+    if (syncop) {
+      awaitDone(*rec);
+      if (custom) {
+        Req& req = rec->getReq();
+        resFn(req); // Extract the result (custom)
+      }
+      if (!tc) {
+        freeRec(rec); // Free the temporary record.
+      }
+    }
+  }
+
+  void pushRec(size_t idx) {
+    Rec& rec = recsPool_[idx];
+    while (true) {
+      auto head = recs_.load(std::memory_order_acquire);
+      rec.setNext(head); // there shouldn't be a data race here
+      if (recs_.compare_exchange_weak(head, idx)) {
+        return;
+      }
+    }
+  }
+
+  size_t getRecsHead() {
+    return recs_.load(std::memory_order_acquire);
+  }
+
+  size_t nextIndex(size_t idx) {
+    return recsPool_[idx].getNext();
+  }
+
+  void clearPending() {
+    pending_.reset();
+  }
+
+  void setPending() {
+    pending_.post();
+  }
+
+  bool isPending() const {
+    return pending_.try_wait();
+  }
+
+  void awaitPending() {
+    pending_.wait();
+  }
+
+  uint64_t combiningSession() {
+    uint64_t combined = 0;
+    do {
+      uint64_t count = static_cast<T*>(this)->combiningPass();
+      if (count == 0) {
+        break;
+      }
+      combined += count;
+      ++this->passes_;
+    } while (combined < this->maxOps_);
+    return combined;
+  }
+
+  void tryCombining() {
+    if (!dedicated_) {
+      while (isPending()) {
+        clearPending();
+        combined_ += combiningSession();
+      }
+    }
+  }
+
+  void dedicatedCombining() {
+    while (true) {
+      awaitPending();
+      clearPending();
+      if (shutdown_.load()) {
+        break;
+      }
+      while (true) {
+        uint64_t count;
+        ++sessions_;
+        {
+          std::lock_guard<Mutex> guard(m_);
+          count = combiningSession();
+          combined_ += count;
+        }
+        if (count < maxOps_) {
+          break;
+        }
+      }
+    }
+  }
+
+  void awaitDone(Rec& rec) {
+    if (dedicated_) {
+      rec.awaitDone();
+    } else {
+      awaitDoneTryLock(rec);
+    }
+  }
+
+  /// Waits for the request to be done and occasionally tries to
+  /// acquire the lock and to do combining. Used only in the absence
+  /// of a dedicated combiner.
+  void awaitDoneTryLock(Rec& rec) {
+    assert(!dedicated_);
+    int count = 0;
+    while (!rec.isDone()) {
+      if (count == 0) {
+        std::unique_lock<Mutex> l(m_, std::defer_lock);
+        if (l.try_lock()) {
+          setPending();
+          tryCombining();
+        }
+      } else {
+        folly::asm_volatile_pause();
+        if (++count == 1000) {
+          count = 0;
+        }
+      }
+    }
+  }
+
+  void shutdown() {
+    shutdown_.store(true);
+    setPending();
+  }
+
+  /// The following member functions may be overridden for customization
+
+  void combinedOp(Req&) {
+    throw std::runtime_error(
+        "FlatCombining::combinedOp(Req&) must be overridden in the derived"
+        " class if called.");
+  }
+
+  void processReq(Rec& rec) {
+    SavedFn& opFn = rec.getFn();
+    if (opFn) {
+      // simple interface
+      opFn();
+    } else {
+      // custom interface
+      Req& req = rec.getReq();
+      static_cast<T*>(this)->combinedOp(req); // defined in derived class
+    }
+    rec.setLast(passes_);
+    rec.complete();
+  }
+
+  uint64_t combiningPass() {
+    uint64_t count = 0;
+    auto idx = getRecsHead();
+    Rec* prev = nullptr;
+    while (idx != NULL_INDEX) {
+      Rec& rec = recsPool_[idx];
+      auto next = rec.getNext();
+      bool valid = rec.isValid();
+      if (!valid && (passes_ - rec.getLast() > kIdleThreshold) &&
+          (prev != nullptr)) {
+        // Disconnect
+        prev->setNext(next);
+        rec.setDisconnected();
+        // store-load order setDisconnected before isValid
+        std::atomic_thread_fence(std::memory_order_seq_cst);
+        valid = rec.isValid();
+      } else {
+        prev = &rec;
+      }
+      if (valid) {
+        processReq(rec);
+        ++count;
+      }
+      idx = next;
+    }
+    return count;
+  }
+};
+
+} // namespace folly {
diff --git a/folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpp b/folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpp
new file mode 100644 (file)
index 0000000..457a7a6
--- /dev/null
@@ -0,0 +1,1787 @@
+/*
+ * Copyright 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/Benchmark.h>
+#include <folly/experimental/flat_combining/test/FlatCombiningTestHelpers.h>
+
+#include <folly/portability/GTest.h>
+#include <glog/logging.h>
+
+using namespace folly::test;
+
+// use option --benchmark to run folly::Benchmark
+// use option --direct to run direct benchmark measurements
+DEFINE_bool(direct, false, "run direct measurement");
+DEFINE_int32(reps, 10, "number of reps");
+DEFINE_int32(ops, 100000, "number of operations per rep");
+DEFINE_int32(lines, 5, "number of cache lines accessed per operation");
+DEFINE_int32(numRecs, 8, "number of records");
+DEFINE_int32(work, 1000, "amount of unrelated work per operation");
+
+static std::vector<int> nthr = {1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64};
+static int nthreads;
+static bool fc;
+static bool simple;
+static bool dedicated;
+static bool tc;
+static bool syncops;
+
+// baseline - no combining
+BENCHMARK(no_combining_base, iters) {
+  fc = false;
+  dedicated = false;
+  tc = false;
+  syncops = false;
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_RELATIVE(no_combining_dup, iters) {
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+BENCHMARK_DRAW_LINE()
+
+// dedicated combiner
+
+BENCHMARK_DRAW_LINE()
+
+BENCHMARK_RELATIVE(combining_dedicated_notc_sync, iters) {
+  fc = true;
+  dedicated = true;
+  tc = false;
+  syncops = true;
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_RELATIVE(combining_dedicated_notc_sync_dup, iters) {
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_DRAW_LINE()
+
+BENCHMARK_RELATIVE(combining_dedicated_notc_async, iters) {
+  syncops = false;
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_RELATIVE(combining_dedicated_notc_async_dup, iters) {
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_DRAW_LINE()
+
+BENCHMARK_RELATIVE(combining_dedicated_tc_sync, iters) {
+  tc = true;
+  syncops = true;
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_RELATIVE(combining_dedicated_tc_sync_dup, iters) {
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_DRAW_LINE()
+
+BENCHMARK_RELATIVE(combining_dedicated_tc_async, iters) {
+  tc = true;
+  syncops = false;
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_RELATIVE(combining_dedicated_tc_async_dup, iters) {
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_DRAW_LINE()
+
+// no dedicated combiner
+
+BENCHMARK_DRAW_LINE()
+
+BENCHMARK_RELATIVE(combining_no_dedicated_notc_sync, iters) {
+  dedicated = false;
+  tc = false;
+  syncops = true;
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_RELATIVE(combining_no_dedicated_notc_sync_dup, iters) {
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_DRAW_LINE()
+
+BENCHMARK_RELATIVE(combining_no_dedicated_notc_async, iters) {
+  syncops = false;
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_RELATIVE(combining_no_dedicated_notc_async_dup, iters) {
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_DRAW_LINE()
+
+BENCHMARK_RELATIVE(combining_no_dedicated_tc_sync, iters) {
+  tc = true;
+  syncops = true;
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_RELATIVE(combining_no_dedicated_tc_sync_dup, iters) {
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_DRAW_LINE()
+
+BENCHMARK_RELATIVE(combining_no_dedicated_tc_async, iters) {
+  tc = true;
+  syncops = false;
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_RELATIVE(combining_no_dedicated_tc_async_dup, iters) {
+  run_test(
+      nthreads,
+      FLAGS_lines,
+      FLAGS_numRecs,
+      FLAGS_work,
+      iters,
+      fc,
+      simple,
+      dedicated,
+      tc,
+      syncops);
+}
+
+BENCHMARK_DRAW_LINE()
+
+void benchmarkSetup() {
+  int numCores = std::thread::hardware_concurrency();
+  std::cout << "\nRunning benchmarks on machine with " << numCores
+            << " logical cores" << std::endl;
+}
+
+TEST(FlatCombining, folly_benchmark) {
+  if (FLAGS_benchmark) {
+    benchmarkSetup();
+    for (bool b : {true, false}) {
+      simple = b;
+      std::string str = simple ? "simple" : "custom";
+      std::cout << "\n------------------------------------ " << str
+                << " interface" << std::endl;
+      for (int i : nthr) {
+        std::cout << "\n---------------------------------- Number of threads = "
+                  << i << std::endl;
+        nthreads = i;
+        folly::runBenchmarks();
+      }
+    }
+  }
+}
+
+// Direct measurement - not using folly::Benchmark
+
+static uint64_t test(
+    std::string name,
+    bool fc,
+    bool dedicated,
+    bool tc,
+    bool syncops,
+    uint64_t base) {
+  uint64_t min = UINTMAX_MAX;
+  uint64_t max = 0;
+  uint64_t sum = 0;
+
+  for (int i = 0; i < FLAGS_reps; ++i) {
+    uint64_t dur = run_test(
+        nthreads,
+        FLAGS_lines,
+        FLAGS_numRecs,
+        FLAGS_work,
+        FLAGS_ops,
+        fc,
+        simple,
+        dedicated,
+        tc,
+        syncops);
+    sum += dur;
+    min = std::min(min, dur);
+    max = std::max(max, dur);
+  }
+  uint64_t avg = sum / FLAGS_reps;
+
+  uint64_t res = min;
+  std::cout << name;
+  std::cout << "   " << std::setw(4) << max / FLAGS_ops << " ns";
+  std::cout << "   " << std::setw(4) << avg / FLAGS_ops << " ns";
+  std::cout << "   " << std::setw(4) << res / FLAGS_ops << " ns";
+  if (base) {
+    std::cout << " " << std::setw(3) << 100 * base / res << "%";
+  }
+  std::cout << std::endl;
+  return res;
+}
+
+TEST(FlatCombining, direct_measurement) {
+  if (!FLAGS_direct) {
+    return;
+  }
+  benchmarkSetup();
+  simple = false;
+  std::string str = simple ? "simple" : "custom";
+  std::cout << "\n------------------------------------ " << str << " interface"
+            << std::endl;
+  for (int i : nthr) {
+    nthreads = i;
+    std::cout << "\n------------------------------------ Number of threads = "
+              << i << "\n"
+              << std::endl;
+    std::cout << "Test_name, Max time, Avg time, Min time, % base min / min\n"
+              << std::endl;
+
+    uint64_t base =
+    test("no_combining - base         ", false, false, false, false, 0);
+    test("no_combining - dup          ", false, false, false, false, base);
+    std::cout << "---------------------------------------" << std::endl;
+
+    std::cout << "---- dedicated-------------------------" << std::endl;
+    test("combining_notc_sync         ", true, true, false, true, base);
+    test("combining_notc_sync - dup   ", true, true, false, true, base);
+    std::cout << "---------------------------------------" << std::endl;
+    test("combining_notc_async        ", true, true, false, false, base);
+    test("combining_notc_async - dup  ", true, true, false, false, base);
+    std::cout << "---------------------------------------" << std::endl;
+    test("combining_tc_sync           ", true, true, true, true, base);
+    test("combining_tc_sync - dup     ", true, true, true, true, base);
+    std::cout << "---------------------------------------" << std::endl;
+    test("combining_tc_async          ", true, true, true, false, base);
+    test("combining_tc_async - dup    ", true, true, true, false, base);
+    std::cout << "---------------------------------------" << std::endl;
+
+    std::cout << "---- no dedicated----------------------" << std::endl;
+    test("combining_notc_sync         ", true, false, false, true, base);
+    test("combining_notc_sync - dup   ", true, false, false, true, base);
+    std::cout << "---------------------------------------" << std::endl;
+    test("combining_notc_async        ", true, false, false, false, base);
+    test("combining_notc_async - dup  ", true, false, false, false, base);
+    std::cout << "---------------------------------------" << std::endl;
+    test("combining_tc_sync           ", true, false, true, true, base);
+    test("combining_tc_sync - dup     ", true, false, true, true, base);
+    std::cout << "---------------------------------------" << std::endl;
+    test("combining_tc_async          ", true, false, true, false, base);
+    test("combining_tc_async - dup    ", true, false, true, false, base);
+    std::cout << "---------------------------------------" << std::endl;
+  }
+}
+
+/*
+See benchmark results in https://phabricator.intern.facebook.com/P57204895
+
+The results are from a run using the command
+$ numactl -N 1 flat_combining_benchmark --benchmark --bm_min_iters=100000 --direct
+
+Using the default parameters of the benchmark: In each iteration, the
+operation on the shared data structure updates 5 cache lines and
+performs unrelated work (~300ns) after each operation. The benchmark
+doesn't do any smart combining (i.e., saving or dropping some work
+based on understanding the details of the combined operations).
+
+Direct measurements are used to evaluate the high variance in some cases.
+Duplicate runs are included in order to assess the relevance of outliers.
+
+----
+[==========] Running 2 tests from 1 test case.
+[----------] Global test environment set-up.
+[----------] 2 tests from FlatCombining
+[ RUN      ] FlatCombining.folly_benchmark
+
+Running benchmarks on machine with 32 logical cores
+
+------------------------------------ simple interface
+
+---------------------------------- Number of threads = 1
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          330.43ns    3.03M
+no_combining_dup                                 100.09%   330.13ns    3.03M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                     93.17%   354.66ns    2.82M
+combining_dedicated_notc_sync_dup                 93.57%   353.15ns    2.83M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                    99.35%   332.60ns    3.01M
+combining_dedicated_notc_async_dup                99.07%   333.54ns    3.00M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                       93.05%   355.13ns    2.82M
+combining_dedicated_tc_sync_dup                   92.87%   355.81ns    2.81M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                      99.17%   333.21ns    3.00M
+combining_dedicated_tc_async_dup                  99.28%   332.84ns    3.00M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                  93.51%   353.38ns    2.83M
+combining_no_dedicated_notc_sync_dup              93.27%   354.26ns    2.82M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                 99.40%   332.44ns    3.01M
+combining_no_dedicated_notc_async_dup             99.13%   333.34ns    3.00M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                    93.38%   353.86ns    2.83M
+combining_no_dedicated_tc_sync_dup                93.52%   353.31ns    2.83M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                   99.29%   332.78ns    3.00M
+combining_no_dedicated_tc_async_dup               99.19%   333.11ns    3.00M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 2
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          213.60ns    4.68M
+no_combining_dup                                 100.84%   211.82ns    4.72M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                     89.84%   237.76ns    4.21M
+combining_dedicated_notc_sync_dup                 89.85%   237.73ns    4.21M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                    93.80%   227.72ns    4.39M
+combining_dedicated_notc_async_dup                87.85%   243.15ns    4.11M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                       86.81%   246.06ns    4.06M
+combining_dedicated_tc_sync_dup                   87.15%   245.09ns    4.08M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                      92.14%   231.82ns    4.31M
+combining_dedicated_tc_async_dup                  92.04%   232.08ns    4.31M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                  95.20%   224.36ns    4.46M
+combining_no_dedicated_notc_sync_dup              95.40%   223.91ns    4.47M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                 95.41%   223.89ns    4.47M
+combining_no_dedicated_notc_async_dup             95.86%   222.82ns    4.49M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                    94.43%   226.21ns    4.42M
+combining_no_dedicated_tc_sync_dup                94.28%   226.56ns    4.41M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                   96.62%   221.07ns    4.52M
+combining_no_dedicated_tc_async_dup               97.24%   219.66ns    4.55M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 3
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          188.20ns    5.31M
+no_combining_dup                                  94.07%   200.07ns    5.00M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                     95.39%   197.30ns    5.07M
+combining_dedicated_notc_sync_dup                 94.50%   199.16ns    5.02M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                    75.29%   249.96ns    4.00M
+combining_dedicated_notc_async_dup                72.97%   257.91ns    3.88M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                       91.26%   206.22ns    4.85M
+combining_dedicated_tc_sync_dup                   90.68%   207.54ns    4.82M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                      89.64%   209.95ns    4.76M
+combining_dedicated_tc_async_dup                  88.21%   213.36ns    4.69M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                  96.19%   195.66ns    5.11M
+combining_no_dedicated_notc_sync_dup              93.27%   201.78ns    4.96M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                 81.12%   231.99ns    4.31M
+combining_no_dedicated_notc_async_dup             82.48%   228.19ns    4.38M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                    79.48%   236.78ns    4.22M
+combining_no_dedicated_tc_sync_dup                79.73%   236.04ns    4.24M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  100.70%   186.90ns    5.35M
+combining_no_dedicated_tc_async_dup               99.43%   189.27ns    5.28M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 4
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          242.84ns    4.12M
+no_combining_dup                                 100.78%   240.96ns    4.15M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    100.91%   240.65ns    4.16M
+combining_dedicated_notc_sync_dup                 99.76%   243.42ns    4.11M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   102.06%   237.95ns    4.20M
+combining_dedicated_notc_async_dup               101.63%   238.94ns    4.19M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      109.79%   221.18ns    4.52M
+combining_dedicated_tc_sync_dup                  108.94%   222.92ns    4.49M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     133.01%   182.58ns    5.48M
+combining_dedicated_tc_async_dup                 134.91%   180.00ns    5.56M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 108.77%   223.25ns    4.48M
+combining_no_dedicated_notc_sync_dup             107.64%   225.61ns    4.43M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                115.14%   210.91ns    4.74M
+combining_no_dedicated_notc_async_dup            115.06%   211.05ns    4.74M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   116.36%   208.70ns    4.79M
+combining_no_dedicated_tc_sync_dup               115.70%   209.89ns    4.76M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  159.69%   152.07ns    6.58M
+combining_no_dedicated_tc_async_dup              158.27%   153.43ns    6.52M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 6
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          281.36ns    3.55M
+no_combining_dup                                  98.56%   285.46ns    3.50M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    132.39%   212.51ns    4.71M
+combining_dedicated_notc_sync_dup                133.10%   211.38ns    4.73M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   141.35%   199.05ns    5.02M
+combining_dedicated_notc_async_dup               143.18%   196.51ns    5.09M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      138.94%   202.50ns    4.94M
+combining_dedicated_tc_sync_dup                  138.64%   202.93ns    4.93M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     199.76%   140.85ns    7.10M
+combining_dedicated_tc_async_dup                 200.28%   140.48ns    7.12M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 155.48%   180.96ns    5.53M
+combining_no_dedicated_notc_sync_dup             150.82%   186.55ns    5.36M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                162.23%   173.43ns    5.77M
+combining_no_dedicated_notc_async_dup            161.33%   174.39ns    5.73M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   167.90%   167.57ns    5.97M
+combining_no_dedicated_tc_sync_dup               164.84%   170.69ns    5.86M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  242.51%   116.02ns    8.62M
+combining_no_dedicated_tc_async_dup              245.67%   114.53ns    8.73M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 8
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          315.57ns    3.17M
+no_combining_dup                                  98.83%   319.32ns    3.13M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    170.48%   185.11ns    5.40M
+combining_dedicated_notc_sync_dup                174.57%   180.77ns    5.53M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   178.57%   176.72ns    5.66M
+combining_dedicated_notc_async_dup               181.30%   174.06ns    5.75M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      195.40%   161.50ns    6.19M
+combining_dedicated_tc_sync_dup                  197.18%   160.05ns    6.25M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     322.03%    97.99ns   10.20M
+combining_dedicated_tc_async_dup                 324.51%    97.24ns   10.28M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 205.61%   153.48ns    6.52M
+combining_no_dedicated_notc_sync_dup             204.94%   153.98ns    6.49M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                217.81%   144.88ns    6.90M
+combining_no_dedicated_notc_async_dup            218.58%   144.37ns    6.93M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   223.96%   140.91ns    7.10M
+combining_no_dedicated_tc_sync_dup               224.55%   140.53ns    7.12M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  364.58%    86.56ns   11.55M
+combining_no_dedicated_tc_async_dup              363.33%    86.86ns   11.51M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 12
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          353.59ns    2.83M
+no_combining_dup                                  99.91%   353.91ns    2.83M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    276.36%   127.95ns    7.82M
+combining_dedicated_notc_sync_dup                278.88%   126.79ns    7.89M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   249.52%   141.71ns    7.06M
+combining_dedicated_notc_async_dup               247.26%   143.00ns    6.99M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      318.57%   110.99ns    9.01M
+combining_dedicated_tc_sync_dup                  326.27%   108.37ns    9.23M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     428.50%    82.52ns   12.12M
+combining_dedicated_tc_async_dup                 429.19%    82.39ns   12.14M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 276.54%   127.86ns    7.82M
+combining_no_dedicated_notc_sync_dup             275.59%   128.31ns    7.79M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                298.92%   118.29ns    8.45M
+combining_no_dedicated_notc_async_dup            298.93%   118.28ns    8.45M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   300.56%   117.64ns    8.50M
+combining_no_dedicated_tc_sync_dup               296.95%   119.07ns    8.40M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  431.06%    82.03ns   12.19M
+combining_no_dedicated_tc_async_dup              430.40%    82.15ns   12.17M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 16
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          358.57ns    2.79M
+no_combining_dup                                  99.97%   358.70ns    2.79M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    319.73%   112.15ns    8.92M
+combining_dedicated_notc_sync_dup                327.86%   109.37ns    9.14M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   296.17%   121.07ns    8.26M
+combining_dedicated_notc_async_dup               306.86%   116.85ns    8.56M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      337.53%   106.24ns    9.41M
+combining_dedicated_tc_sync_dup                  347.98%   103.04ns    9.70M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     423.80%    84.61ns   11.82M
+combining_dedicated_tc_async_dup                 421.07%    85.16ns   11.74M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 321.94%   111.38ns    8.98M
+combining_no_dedicated_notc_sync_dup             318.54%   112.57ns    8.88M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                364.71%    98.32ns   10.17M
+combining_no_dedicated_notc_async_dup            364.22%    98.45ns   10.16M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   322.91%   111.04ns    9.01M
+combining_no_dedicated_tc_sync_dup               322.42%   111.21ns    8.99M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  466.30%    76.90ns   13.00M
+combining_no_dedicated_tc_async_dup              462.76%    77.49ns   12.91M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 24
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          348.54ns    2.87M
+no_combining_dup                                  99.96%   348.69ns    2.87M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    260.21%   133.95ns    7.47M
+combining_dedicated_notc_sync_dup                257.84%   135.18ns    7.40M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   242.25%   143.88ns    6.95M
+combining_dedicated_notc_async_dup               235.88%   147.76ns    6.77M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      262.45%   132.80ns    7.53M
+combining_dedicated_tc_sync_dup                  251.14%   138.78ns    7.21M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     256.89%   135.68ns    7.37M
+combining_dedicated_tc_async_dup                 304.76%   114.37ns    8.74M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 270.20%   129.00ns    7.75M
+combining_no_dedicated_notc_sync_dup             271.69%   128.29ns    7.80M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                298.35%   116.82ns    8.56M
+combining_no_dedicated_notc_async_dup            289.04%   120.59ns    8.29M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   286.59%   121.62ns    8.22M
+combining_no_dedicated_tc_sync_dup               292.21%   119.28ns    8.38M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  471.86%    73.87ns   13.54M
+combining_no_dedicated_tc_async_dup              458.16%    76.08ns   13.14M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 32
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          337.61ns    2.96M
+no_combining_dup                                  99.41%   339.60ns    2.94M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    204.50%   165.09ns    6.06M
+combining_dedicated_notc_sync_dup                233.28%   144.72ns    6.91M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   187.20%   180.35ns    5.54M
+combining_dedicated_notc_async_dup               192.76%   175.15ns    5.71M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      220.56%   153.07ns    6.53M
+combining_dedicated_tc_sync_dup                  207.62%   162.61ns    6.15M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     317.11%   106.46ns    9.39M
+combining_dedicated_tc_async_dup                 318.92%   105.86ns    9.45M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 259.29%   130.21ns    7.68M
+combining_no_dedicated_notc_sync_dup             248.33%   135.95ns    7.36M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                290.40%   116.26ns    8.60M
+combining_no_dedicated_notc_async_dup            299.92%   112.57ns    8.88M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   281.91%   119.76ns    8.35M
+combining_no_dedicated_tc_sync_dup               284.19%   118.80ns    8.42M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  435.16%    77.58ns   12.89M
+combining_no_dedicated_tc_async_dup              389.67%    86.64ns   11.54M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 48
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          334.48ns    2.99M
+no_combining_dup                                 100.00%   334.46ns    2.99M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    257.01%   130.14ns    7.68M
+combining_dedicated_notc_sync_dup                254.13%   131.62ns    7.60M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   189.56%   176.45ns    5.67M
+combining_dedicated_notc_async_dup               247.68%   135.05ns    7.40M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      259.47%   128.91ns    7.76M
+combining_dedicated_tc_sync_dup                  281.34%   118.89ns    8.41M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     301.96%   110.77ns    9.03M
+combining_dedicated_tc_async_dup                 347.65%    96.21ns   10.39M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 268.45%   124.60ns    8.03M
+combining_no_dedicated_notc_sync_dup             272.54%   122.73ns    8.15M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                306.04%   109.29ns    9.15M
+combining_no_dedicated_notc_async_dup            294.38%   113.62ns    8.80M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   280.89%   119.08ns    8.40M
+combining_no_dedicated_tc_sync_dup               276.01%   121.18ns    8.25M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  466.45%    71.71ns   13.95M
+combining_no_dedicated_tc_async_dup              465.45%    71.86ns   13.92M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 64
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          335.68ns    2.98M
+no_combining_dup                                 101.03%   332.25ns    3.01M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    272.91%   123.00ns    8.13M
+combining_dedicated_notc_sync_dup                270.56%   124.07ns    8.06M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   200.44%   167.47ns    5.97M
+combining_dedicated_notc_async_dup               208.36%   161.10ns    6.21M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      258.40%   129.91ns    7.70M
+combining_dedicated_tc_sync_dup                  249.16%   134.72ns    7.42M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     378.86%    88.60ns   11.29M
+combining_dedicated_tc_async_dup                 299.32%   112.15ns    8.92M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 272.18%   123.33ns    8.11M
+combining_no_dedicated_notc_sync_dup             275.26%   121.95ns    8.20M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                296.23%   113.32ns    8.82M
+combining_no_dedicated_notc_async_dup            311.17%   107.88ns    9.27M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   283.30%   118.49ns    8.44M
+combining_no_dedicated_tc_sync_dup               263.86%   127.22ns    7.86M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  426.62%    78.68ns   12.71M
+combining_no_dedicated_tc_async_dup              445.17%    75.40ns   13.26M
+----------------------------------------------------------------------------
+============================================================================
+
+------------------------------------ custom interface
+
+---------------------------------- Number of threads = 1
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          329.49ns    3.03M
+no_combining_dup                                  99.91%   329.79ns    3.03M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                     98.69%   333.88ns    3.00M
+combining_dedicated_notc_sync_dup                 98.70%   333.83ns    3.00M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                    98.22%   335.47ns    2.98M
+combining_dedicated_notc_async_dup                98.16%   335.66ns    2.98M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                       98.70%   333.85ns    3.00M
+combining_dedicated_tc_sync_dup                   98.78%   333.58ns    3.00M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                      98.14%   335.73ns    2.98M
+combining_dedicated_tc_async_dup                  97.92%   336.49ns    2.97M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                  98.94%   333.00ns    3.00M
+combining_no_dedicated_notc_sync_dup              98.86%   333.29ns    3.00M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                 98.36%   334.99ns    2.99M
+combining_no_dedicated_notc_async_dup             98.61%   334.15ns    2.99M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                    99.07%   332.58ns    3.01M
+combining_no_dedicated_tc_sync_dup                99.12%   332.41ns    3.01M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                   97.08%   339.38ns    2.95M
+combining_no_dedicated_tc_async_dup               97.54%   337.81ns    2.96M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 2
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          216.71ns    4.61M
+no_combining_dup                                 100.34%   215.97ns    4.63M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                     95.42%   227.11ns    4.40M
+combining_dedicated_notc_sync_dup                 94.16%   230.15ns    4.34M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                    91.84%   235.97ns    4.24M
+combining_dedicated_notc_async_dup                91.41%   237.08ns    4.22M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                       96.79%   223.90ns    4.47M
+combining_dedicated_tc_sync_dup                   96.54%   224.47ns    4.45M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                      90.90%   238.41ns    4.19M
+combining_dedicated_tc_async_dup                  95.45%   227.03ns    4.40M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 101.13%   214.28ns    4.67M
+combining_no_dedicated_notc_sync_dup             100.11%   216.48ns    4.62M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                 96.40%   224.80ns    4.45M
+combining_no_dedicated_notc_async_dup             96.36%   224.90ns    4.45M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   100.86%   214.85ns    4.65M
+combining_no_dedicated_tc_sync_dup               101.91%   212.65ns    4.70M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                   95.66%   226.54ns    4.41M
+combining_no_dedicated_tc_async_dup               95.88%   226.03ns    4.42M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 3
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          189.61ns    5.27M
+no_combining_dup                                 100.22%   189.20ns    5.29M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    103.18%   183.76ns    5.44M
+combining_dedicated_notc_sync_dup                103.66%   182.92ns    5.47M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                    77.14%   245.81ns    4.07M
+combining_dedicated_notc_async_dup                90.25%   210.10ns    4.76M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                       89.88%   210.95ns    4.74M
+combining_dedicated_tc_sync_dup                   87.83%   215.90ns    4.63M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                      89.33%   212.26ns    4.71M
+combining_dedicated_tc_async_dup                  85.19%   222.56ns    4.49M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                  98.43%   192.64ns    5.19M
+combining_no_dedicated_notc_sync_dup             101.15%   187.46ns    5.33M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                 83.77%   226.36ns    4.42M
+combining_no_dedicated_notc_async_dup             84.69%   223.89ns    4.47M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                    85.47%   221.85ns    4.51M
+combining_no_dedicated_tc_sync_dup                86.32%   219.65ns    4.55M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  105.62%   179.52ns    5.57M
+combining_no_dedicated_tc_async_dup              105.26%   180.14ns    5.55M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 4
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          237.50ns    4.21M
+no_combining_dup                                  99.80%   237.97ns    4.20M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    112.56%   210.99ns    4.74M
+combining_dedicated_notc_sync_dup                104.08%   228.20ns    4.38M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   101.44%   234.12ns    4.27M
+combining_dedicated_notc_async_dup               100.73%   235.77ns    4.24M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      111.70%   212.62ns    4.70M
+combining_dedicated_tc_sync_dup                  113.00%   210.18ns    4.76M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     131.11%   181.15ns    5.52M
+combining_dedicated_tc_async_dup                 132.65%   179.04ns    5.59M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 115.76%   205.17ns    4.87M
+combining_no_dedicated_notc_sync_dup             114.70%   207.06ns    4.83M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                111.63%   212.76ns    4.70M
+combining_no_dedicated_notc_async_dup            111.91%   212.22ns    4.71M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   120.07%   197.80ns    5.06M
+combining_no_dedicated_tc_sync_dup               118.25%   200.85ns    4.98M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  153.73%   154.49ns    6.47M
+combining_no_dedicated_tc_async_dup              153.08%   155.15ns    6.45M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 6
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          281.56ns    3.55M
+no_combining_dup                                  99.97%   281.65ns    3.55M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    144.76%   194.50ns    5.14M
+combining_dedicated_notc_sync_dup                149.96%   187.76ns    5.33M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   147.72%   190.61ns    5.25M
+combining_dedicated_notc_async_dup               140.86%   199.89ns    5.00M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      154.17%   182.63ns    5.48M
+combining_dedicated_tc_sync_dup                  156.60%   179.80ns    5.56M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     202.42%   139.10ns    7.19M
+combining_dedicated_tc_async_dup                 203.44%   138.40ns    7.23M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 168.33%   167.27ns    5.98M
+combining_no_dedicated_notc_sync_dup             166.02%   169.59ns    5.90M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                166.44%   169.16ns    5.91M
+combining_no_dedicated_notc_async_dup            160.14%   175.82ns    5.69M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   181.79%   154.88ns    6.46M
+combining_no_dedicated_tc_sync_dup               180.25%   156.20ns    6.40M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  240.56%   117.04ns    8.54M
+combining_no_dedicated_tc_async_dup              240.74%   116.96ns    8.55M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 8
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          312.99ns    3.19M
+no_combining_dup                                  98.93%   316.37ns    3.16M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    182.71%   171.30ns    5.84M
+combining_dedicated_notc_sync_dup                183.23%   170.82ns    5.85M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   183.16%   170.88ns    5.85M
+combining_dedicated_notc_async_dup               181.29%   172.64ns    5.79M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      191.49%   163.45ns    6.12M
+combining_dedicated_tc_sync_dup                  191.04%   163.84ns    6.10M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     302.89%   103.34ns    9.68M
+combining_dedicated_tc_async_dup                 304.07%   102.94ns    9.71M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 220.41%   142.00ns    7.04M
+combining_no_dedicated_notc_sync_dup             219.90%   142.34ns    7.03M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                218.66%   143.14ns    6.99M
+combining_no_dedicated_notc_async_dup            218.74%   143.09ns    6.99M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   241.82%   129.43ns    7.73M
+combining_no_dedicated_tc_sync_dup               241.72%   129.48ns    7.72M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  352.39%    88.82ns   11.26M
+combining_no_dedicated_tc_async_dup              350.17%    89.38ns   11.19M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 12
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          350.05ns    2.86M
+no_combining_dup                                  99.06%   353.37ns    2.83M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    266.87%   131.17ns    7.62M
+combining_dedicated_notc_sync_dup                245.79%   142.42ns    7.02M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   238.57%   146.73ns    6.82M
+combining_dedicated_notc_async_dup               240.02%   145.84ns    6.86M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      316.70%   110.53ns    9.05M
+combining_dedicated_tc_sync_dup                  321.05%   109.03ns    9.17M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     403.10%    86.84ns   11.52M
+combining_dedicated_tc_async_dup                 409.94%    85.39ns   11.71M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 300.23%   116.59ns    8.58M
+combining_no_dedicated_notc_sync_dup             299.07%   117.04ns    8.54M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                297.79%   117.55ns    8.51M
+combining_no_dedicated_notc_async_dup            296.66%   118.00ns    8.47M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   328.07%   106.70ns    9.37M
+combining_no_dedicated_tc_sync_dup               331.52%   105.59ns    9.47M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  424.57%    82.45ns   12.13M
+combining_no_dedicated_tc_async_dup              409.47%    85.49ns   11.70M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 16
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          360.47ns    2.77M
+no_combining_dup                                 100.11%   360.07ns    2.78M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    320.54%   112.46ns    8.89M
+combining_dedicated_notc_sync_dup                313.31%   115.05ns    8.69M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   296.83%   121.44ns    8.23M
+combining_dedicated_notc_async_dup               289.91%   124.34ns    8.04M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      364.27%    98.96ns   10.11M
+combining_dedicated_tc_sync_dup                  361.10%    99.82ns   10.02M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     424.43%    84.93ns   11.77M
+combining_dedicated_tc_async_dup                 418.07%    86.22ns   11.60M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 373.13%    96.60ns   10.35M
+combining_no_dedicated_notc_sync_dup             364.35%    98.93ns   10.11M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                361.40%    99.74ns   10.03M
+combining_no_dedicated_notc_async_dup            366.49%    98.36ns   10.17M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   382.22%    94.31ns   10.60M
+combining_no_dedicated_tc_sync_dup               380.64%    94.70ns   10.56M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  461.14%    78.17ns   12.79M
+combining_no_dedicated_tc_async_dup              481.50%    74.86ns   13.36M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 24
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          348.97ns    2.87M
+no_combining_dup                                 100.12%   348.54ns    2.87M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    234.17%   149.02ns    6.71M
+combining_dedicated_notc_sync_dup                205.54%   169.78ns    5.89M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   248.28%   140.55ns    7.11M
+combining_dedicated_notc_async_dup               239.71%   145.58ns    6.87M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      272.87%   127.89ns    7.82M
+combining_dedicated_tc_sync_dup                  235.76%   148.02ns    6.76M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     295.71%   118.01ns    8.47M
+combining_dedicated_tc_async_dup                 265.87%   131.25ns    7.62M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 298.96%   116.73ns    8.57M
+combining_no_dedicated_notc_sync_dup             297.67%   117.23ns    8.53M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                298.44%   116.93ns    8.55M
+combining_no_dedicated_notc_async_dup            292.80%   119.18ns    8.39M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   316.44%   110.28ns    9.07M
+combining_no_dedicated_tc_sync_dup               317.52%   109.90ns    9.10M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  432.64%    80.66ns   12.40M
+combining_no_dedicated_tc_async_dup              441.55%    79.03ns   12.65M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 32
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          338.90ns    2.95M
+no_combining_dup                                 100.01%   338.87ns    2.95M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    204.34%   165.85ns    6.03M
+combining_dedicated_notc_sync_dup                202.84%   167.07ns    5.99M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   192.27%   176.26ns    5.67M
+combining_dedicated_notc_async_dup               188.61%   179.68ns    5.57M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      247.57%   136.89ns    7.31M
+combining_dedicated_tc_sync_dup                  285.53%   118.69ns    8.43M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     277.97%   121.92ns    8.20M
+combining_dedicated_tc_async_dup                 231.11%   146.64ns    6.82M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 299.20%   113.27ns    8.83M
+combining_no_dedicated_notc_sync_dup             289.53%   117.05ns    8.54M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                282.29%   120.05ns    8.33M
+combining_no_dedicated_notc_async_dup            305.09%   111.08ns    9.00M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   312.52%   108.44ns    9.22M
+combining_no_dedicated_tc_sync_dup               324.88%   104.31ns    9.59M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  420.99%    80.50ns   12.42M
+combining_no_dedicated_tc_async_dup              406.58%    83.35ns   12.00M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 48
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          334.84ns    2.99M
+no_combining_dup                                  99.57%   336.29ns    2.97M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    212.82%   157.34ns    6.36M
+combining_dedicated_notc_sync_dup                198.39%   168.78ns    5.93M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   166.74%   200.82ns    4.98M
+combining_dedicated_notc_async_dup               197.07%   169.91ns    5.89M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      246.35%   135.92ns    7.36M
+combining_dedicated_tc_sync_dup                  209.52%   159.81ns    6.26M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     293.94%   113.91ns    8.78M
+combining_dedicated_tc_async_dup                 280.74%   119.27ns    8.38M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 301.60%   111.02ns    9.01M
+combining_no_dedicated_notc_sync_dup             296.10%   113.09ns    8.84M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                308.91%   108.40ns    9.23M
+combining_no_dedicated_notc_async_dup            298.48%   112.18ns    8.91M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   331.11%   101.13ns    9.89M
+combining_no_dedicated_tc_sync_dup               329.37%   101.66ns    9.84M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  451.58%    74.15ns   13.49M
+combining_no_dedicated_tc_async_dup              431.37%    77.62ns   12.88M
+----------------------------------------------------------------------------
+============================================================================
+
+---------------------------------- Number of threads = 64
+============================================================================
+folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative  time/iter  iters/s
+============================================================================
+no_combining_base                                          336.22ns    2.97M
+no_combining_dup                                 100.69%   333.92ns    2.99M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_dedicated_notc_sync                    230.57%   145.82ns    6.86M
+combining_dedicated_notc_sync_dup                221.08%   152.08ns    6.58M
+----------------------------------------------------------------------------
+combining_dedicated_notc_async                   232.38%   144.69ns    6.91M
+combining_dedicated_notc_async_dup               192.77%   174.41ns    5.73M
+----------------------------------------------------------------------------
+combining_dedicated_tc_sync                      284.07%   118.36ns    8.45M
+combining_dedicated_tc_sync_dup                  298.03%   112.81ns    8.86M
+----------------------------------------------------------------------------
+combining_dedicated_tc_async                     361.07%    93.12ns   10.74M
+combining_dedicated_tc_async_dup                 324.11%   103.74ns    9.64M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_sync                 284.58%   118.15ns    8.46M
+combining_no_dedicated_notc_sync_dup             301.73%   111.43ns    8.97M
+----------------------------------------------------------------------------
+combining_no_dedicated_notc_async                294.87%   114.02ns    8.77M
+combining_no_dedicated_notc_async_dup            287.51%   116.94ns    8.55M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_sync                   317.96%   105.74ns    9.46M
+combining_no_dedicated_tc_sync_dup               332.45%   101.13ns    9.89M
+----------------------------------------------------------------------------
+combining_no_dedicated_tc_async                  441.96%    76.07ns   13.15M
+combining_no_dedicated_tc_async_dup              393.82%    85.37ns   11.71M
+----------------------------------------------------------------------------
+============================================================================
+[       OK ] FlatCombining.folly_benchmark (455269 ms)
+[ RUN      ] FlatCombining.direct_measurement
+
+Running benchmarks on machine with 32 logical cores
+
+------------------------------------ custom interface
+
+------------------------------------ Number of threads = 1
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             334 ns    331 ns    329 ns
+no_combining - dup              335 ns    332 ns    331 ns  99%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             340 ns    335 ns    332 ns  99%
+combining_notc_sync - dup       337 ns    335 ns    333 ns  98%
+---------------------------------------
+combining_notc_async            360 ns    343 ns    338 ns  97%
+combining_notc_async - dup      339 ns    337 ns    336 ns  98%
+---------------------------------------
+combining_tc_sync               337 ns    335 ns    333 ns  98%
+combining_tc_sync - dup         346 ns    336 ns    332 ns  99%
+---------------------------------------
+combining_tc_async              338 ns    336 ns    335 ns  98%
+combining_tc_async - dup        338 ns    336 ns    335 ns  98%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             338 ns    335 ns    333 ns  98%
+combining_notc_sync - dup       337 ns    334 ns    333 ns  98%
+---------------------------------------
+combining_notc_async            339 ns    336 ns    335 ns  98%
+combining_notc_async - dup      347 ns    340 ns    336 ns  98%
+---------------------------------------
+combining_tc_sync               337 ns    335 ns    333 ns  98%
+combining_tc_sync - dup         436 ns    386 ns    333 ns  98%
+---------------------------------------
+combining_tc_async              340 ns    337 ns    335 ns  98%
+combining_tc_async - dup        338 ns    336 ns    335 ns  98%
+---------------------------------------
+
+------------------------------------ Number of threads = 2
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             315 ns    226 ns    211 ns
+no_combining - dup              217 ns    216 ns    213 ns  98%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             251 ns    237 ns    229 ns  92%
+combining_notc_sync - dup       250 ns    241 ns    226 ns  93%
+---------------------------------------
+combining_notc_async            278 ns    268 ns    252 ns  83%
+combining_notc_async - dup      297 ns    263 ns    245 ns  86%
+---------------------------------------
+combining_tc_sync               254 ns    246 ns    234 ns  90%
+combining_tc_sync - dup         335 ns    252 ns    230 ns  91%
+---------------------------------------
+combining_tc_async              305 ns    282 ns    245 ns  86%
+combining_tc_async - dup        284 ns    256 ns    239 ns  88%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             230 ns    222 ns    217 ns  97%
+combining_notc_sync - dup       231 ns    225 ns    218 ns  96%
+---------------------------------------
+combining_notc_async            244 ns    238 ns    233 ns  90%
+combining_notc_async - dup      241 ns    236 ns    231 ns  91%
+---------------------------------------
+combining_tc_sync               283 ns    239 ns    221 ns  95%
+combining_tc_sync - dup         299 ns    247 ns    225 ns  93%
+---------------------------------------
+combining_tc_async              290 ns    270 ns    244 ns  86%
+combining_tc_async - dup        290 ns    251 ns    238 ns  88%
+---------------------------------------
+
+------------------------------------ Number of threads = 3
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             211 ns    197 ns    190 ns
+no_combining - dup              209 ns    201 ns    195 ns  97%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             258 ns    197 ns    168 ns 112%
+combining_notc_sync - dup       274 ns    200 ns    162 ns 117%
+---------------------------------------
+combining_notc_async            307 ns    281 ns    260 ns  73%
+combining_notc_async - dup      284 ns    258 ns    216 ns  88%
+---------------------------------------
+combining_tc_sync               228 ns    215 ns    192 ns  98%
+combining_tc_sync - dup         216 ns    203 ns    178 ns 107%
+---------------------------------------
+combining_tc_async              246 ns    233 ns    220 ns  86%
+combining_tc_async - dup        236 ns    221 ns    208 ns  91%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             204 ns    198 ns    184 ns 103%
+combining_notc_sync - dup       203 ns    198 ns    193 ns  98%
+---------------------------------------
+combining_notc_async            238 ns    225 ns    218 ns  87%
+combining_notc_async - dup      231 ns    227 ns    223 ns  85%
+---------------------------------------
+combining_tc_sync               220 ns    216 ns    211 ns  90%
+combining_tc_sync - dup         227 ns    223 ns    219 ns  87%
+---------------------------------------
+combining_tc_async              182 ns    181 ns    179 ns 106%
+combining_tc_async - dup        186 ns    181 ns    180 ns 105%
+---------------------------------------
+
+------------------------------------ Number of threads = 4
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             258 ns    245 ns    238 ns
+no_combining - dup              262 ns    249 ns    245 ns  97%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             264 ns    250 ns    220 ns 107%
+combining_notc_sync - dup       260 ns    254 ns    231 ns 102%
+---------------------------------------
+combining_notc_async            266 ns    255 ns    233 ns 102%
+combining_notc_async - dup      268 ns    260 ns    252 ns  94%
+---------------------------------------
+combining_tc_sync               250 ns    240 ns    215 ns 110%
+combining_tc_sync - dup         252 ns    242 ns    217 ns 109%
+---------------------------------------
+combining_tc_async              199 ns    190 ns    183 ns 129%
+combining_tc_async - dup        199 ns    189 ns    178 ns 133%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             223 ns    211 ns    203 ns 116%
+combining_notc_sync - dup       218 ns    211 ns    202 ns 117%
+---------------------------------------
+combining_notc_async            222 ns    213 ns    207 ns 114%
+combining_notc_async - dup      236 ns    222 ns    215 ns 110%
+---------------------------------------
+combining_tc_sync               202 ns    199 ns    197 ns 120%
+combining_tc_sync - dup         207 ns    199 ns    194 ns 122%
+---------------------------------------
+combining_tc_async              162 ns    157 ns    152 ns 155%
+combining_tc_async - dup        188 ns    161 ns    154 ns 154%
+---------------------------------------
+
+------------------------------------ Number of threads = 6
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             298 ns    292 ns    281 ns
+no_combining - dup              296 ns    289 ns    270 ns 104%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             221 ns    211 ns    196 ns 143%
+combining_notc_sync - dup       247 ns    211 ns    192 ns 146%
+---------------------------------------
+combining_notc_async            216 ns    205 ns    194 ns 144%
+combining_notc_async - dup      215 ns    206 ns    197 ns 142%
+---------------------------------------
+combining_tc_sync               225 ns    204 ns    185 ns 151%
+combining_tc_sync - dup         229 ns    210 ns    186 ns 151%
+---------------------------------------
+combining_tc_async              165 ns    152 ns    144 ns 194%
+combining_tc_async - dup        166 ns    150 ns    143 ns 195%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             184 ns    182 ns    180 ns 155%
+combining_notc_sync - dup       176 ns    174 ns    172 ns 163%
+---------------------------------------
+combining_notc_async            179 ns    177 ns    174 ns 161%
+combining_notc_async - dup      186 ns    181 ns    177 ns 158%
+---------------------------------------
+combining_tc_sync               164 ns    163 ns    160 ns 174%
+combining_tc_sync - dup         171 ns    168 ns    161 ns 173%
+---------------------------------------
+combining_tc_async              142 ns    139 ns    138 ns 202%
+combining_tc_async - dup        141 ns    136 ns    119 ns 235%
+---------------------------------------
+
+------------------------------------ Number of threads = 8
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             333 ns    328 ns    315 ns
+no_combining - dup              336 ns    330 ns    327 ns  96%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             203 ns    179 ns    172 ns 183%
+combining_notc_sync - dup       190 ns    177 ns    171 ns 183%
+---------------------------------------
+combining_notc_async            204 ns    183 ns    170 ns 185%
+combining_notc_async - dup      201 ns    187 ns    176 ns 179%
+---------------------------------------
+combining_tc_sync               177 ns    170 ns    165 ns 190%
+combining_tc_sync - dup         178 ns    167 ns    164 ns 192%
+---------------------------------------
+combining_tc_async              134 ns    115 ns    105 ns 300%
+combining_tc_async - dup        132 ns    115 ns    103 ns 304%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             154 ns    145 ns    143 ns 220%
+combining_notc_sync - dup       153 ns    144 ns    142 ns 222%
+---------------------------------------
+combining_notc_async            145 ns    144 ns    143 ns 219%
+combining_notc_async - dup      157 ns    148 ns    144 ns 218%
+---------------------------------------
+combining_tc_sync               142 ns    134 ns    130 ns 241%
+combining_tc_sync - dup         144 ns    136 ns    130 ns 241%
+---------------------------------------
+combining_tc_async              118 ns     99 ns     91 ns 344%
+combining_tc_async - dup        118 ns     95 ns     91 ns 344%
+---------------------------------------
+
+------------------------------------ Number of threads = 12
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             361 ns    357 ns    353 ns
+no_combining - dup              361 ns    357 ns    355 ns  99%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             190 ns    157 ns    138 ns 255%
+combining_notc_sync - dup       162 ns    149 ns    138 ns 255%
+---------------------------------------
+combining_notc_async            163 ns    153 ns    145 ns 242%
+combining_notc_async - dup      194 ns    158 ns    152 ns 231%
+---------------------------------------
+combining_tc_sync               181 ns    128 ns    111 ns 316%
+combining_tc_sync - dup         183 ns    148 ns    121 ns 289%
+---------------------------------------
+combining_tc_async               92 ns     89 ns     87 ns 402%
+combining_tc_async - dup        152 ns    105 ns     87 ns 405%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             120 ns    119 ns    118 ns 298%
+combining_notc_sync - dup       120 ns    119 ns    118 ns 298%
+---------------------------------------
+combining_notc_async            122 ns    120 ns    120 ns 294%
+combining_notc_async - dup      121 ns    120 ns    118 ns 297%
+---------------------------------------
+combining_tc_sync               110 ns    108 ns    106 ns 331%
+combining_tc_sync - dup         110 ns    109 ns    107 ns 327%
+---------------------------------------
+combining_tc_async               88 ns     87 ns     85 ns 411%
+combining_tc_async - dup         90 ns     88 ns     85 ns 411%
+---------------------------------------
+
+------------------------------------ Number of threads = 16
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             363 ns    361 ns    360 ns
+no_combining - dup              362 ns    361 ns    358 ns 100%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             177 ns    136 ns    111 ns 323%
+combining_notc_sync - dup       185 ns    148 ns    112 ns 320%
+---------------------------------------
+combining_notc_async            191 ns    151 ns    122 ns 294%
+combining_notc_async - dup      179 ns    157 ns    118 ns 305%
+---------------------------------------
+combining_tc_sync               154 ns    125 ns    100 ns 360%
+combining_tc_sync - dup         166 ns    130 ns     98 ns 367%
+---------------------------------------
+combining_tc_async              143 ns    107 ns     86 ns 418%
+combining_tc_async - dup        132 ns    112 ns     88 ns 407%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             121 ns    103 ns     98 ns 367%
+combining_notc_sync - dup       117 ns    104 ns     99 ns 362%
+---------------------------------------
+combining_notc_async            116 ns    105 ns     99 ns 363%
+combining_notc_async - dup      112 ns    104 ns    100 ns 359%
+---------------------------------------
+combining_tc_sync               111 ns    101 ns     94 ns 381%
+combining_tc_sync - dup         113 ns     98 ns     93 ns 387%
+---------------------------------------
+combining_tc_async               97 ns     85 ns     74 ns 484%
+combining_tc_async - dup         98 ns     86 ns     78 ns 457%
+---------------------------------------
+
+------------------------------------ Number of threads = 24
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             352 ns    351 ns    349 ns
+no_combining - dup              352 ns    351 ns    348 ns 100%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             214 ns    173 ns    149 ns 234%
+combining_notc_sync - dup       212 ns    166 ns    137 ns 254%
+---------------------------------------
+combining_notc_async            232 ns    198 ns    161 ns 216%
+combining_notc_async - dup      225 ns    191 ns    149 ns 234%
+---------------------------------------
+combining_tc_sync               192 ns    152 ns    129 ns 270%
+combining_tc_sync - dup         176 ns    156 ns    121 ns 286%
+---------------------------------------
+combining_tc_async              202 ns    147 ns    118 ns 296%
+combining_tc_async - dup        200 ns    158 ns    120 ns 291%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             161 ns    125 ns    115 ns 303%
+combining_notc_sync - dup       144 ns    127 ns    116 ns 299%
+---------------------------------------
+combining_notc_async            135 ns    122 ns    116 ns 298%
+combining_notc_async - dup      341 ns    148 ns    117 ns 298%
+---------------------------------------
+combining_tc_sync               130 ns    118 ns    109 ns 319%
+combining_tc_sync - dup         116 ns    110 ns    105 ns 332%
+---------------------------------------
+combining_tc_async               97 ns     86 ns     79 ns 442%
+combining_tc_async - dup         95 ns     86 ns     79 ns 440%
+---------------------------------------
+
+------------------------------------ Number of threads = 32
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             337 ns    336 ns    333 ns
+no_combining - dup              338 ns    336 ns    333 ns  99%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             193 ns    177 ns    162 ns 204%
+combining_notc_sync - dup       211 ns    181 ns    156 ns 213%
+---------------------------------------
+combining_notc_async            245 ns    200 ns    162 ns 205%
+combining_notc_async - dup      216 ns    197 ns    149 ns 223%
+---------------------------------------
+combining_tc_sync               195 ns    167 ns    121 ns 274%
+combining_tc_sync - dup         179 ns    164 ns    143 ns 231%
+---------------------------------------
+combining_tc_async              187 ns    152 ns    108 ns 307%
+combining_tc_async - dup        182 ns    151 ns    125 ns 266%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             189 ns    127 ns    114 ns 290%
+combining_notc_sync - dup       126 ns    118 ns    110 ns 302%
+---------------------------------------
+combining_notc_async            233 ns    129 ns    112 ns 297%
+combining_notc_async - dup      170 ns    126 ns    113 ns 293%
+---------------------------------------
+combining_tc_sync               948 ns    212 ns    107 ns 309%
+combining_tc_sync - dup         137 ns    112 ns    104 ns 318%
+---------------------------------------
+combining_tc_async               90 ns     86 ns     79 ns 421%
+combining_tc_async - dup         94 ns     87 ns     80 ns 414%
+---------------------------------------
+
+------------------------------------ Number of threads = 48
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             340 ns    336 ns    334 ns
+no_combining - dup              336 ns    335 ns    334 ns 100%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             214 ns    176 ns    137 ns 243%
+combining_notc_sync - dup       210 ns    173 ns    128 ns 260%
+---------------------------------------
+combining_notc_async            217 ns    186 ns    162 ns 205%
+combining_notc_async - dup      215 ns    186 ns    149 ns 224%
+---------------------------------------
+combining_tc_sync               206 ns    171 ns    145 ns 230%
+combining_tc_sync - dup         179 ns    149 ns    126 ns 265%
+---------------------------------------
+combining_tc_async              175 ns    138 ns    108 ns 309%
+combining_tc_async - dup        169 ns    134 ns    110 ns 301%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync            1798 ns    293 ns    118 ns 282%
+combining_notc_sync - dup       171 ns    122 ns    105 ns 318%
+---------------------------------------
+combining_notc_async            227 ns    132 ns    110 ns 302%
+combining_notc_async - dup      226 ns    137 ns    111 ns 301%
+---------------------------------------
+combining_tc_sync               111 ns    106 ns    102 ns 327%
+combining_tc_sync - dup         127 ns    110 ns    104 ns 321%
+---------------------------------------
+combining_tc_async              297 ns    117 ns     77 ns 433%
+combining_tc_async - dup        742 ns    149 ns     77 ns 432%
+---------------------------------------
+
+------------------------------------ Number of threads = 64
+
+Test_name, Max time, Avg time, Min time, % base min / min
+
+no_combining - base             338 ns    333 ns    331 ns
+no_combining - dup              335 ns    333 ns    331 ns  99%
+---------------------------------------
+---- dedicated-------------------------
+combining_notc_sync             198 ns    163 ns    148 ns 223%
+combining_notc_sync - dup       172 ns    154 ns    124 ns 266%
+---------------------------------------
+combining_notc_async            211 ns    177 ns    158 ns 209%
+combining_notc_async - dup      182 ns    166 ns    152 ns 216%
+---------------------------------------
+combining_tc_sync               195 ns    133 ns    112 ns 294%
+combining_tc_sync - dup         158 ns    135 ns    108 ns 305%
+---------------------------------------
+combining_tc_async              145 ns    119 ns     95 ns 347%
+combining_tc_async - dup        159 ns    130 ns     95 ns 346%
+---------------------------------------
+---- no dedicated----------------------
+combining_notc_sync             188 ns    123 ns    107 ns 308%
+combining_notc_sync - dup       546 ns    159 ns    107 ns 307%
+---------------------------------------
+combining_notc_async            558 ns    160 ns    108 ns 304%
+combining_notc_async - dup      192 ns    127 ns    107 ns 308%
+---------------------------------------
+combining_tc_sync               325 ns    130 ns    101 ns 325%
+combining_tc_sync - dup        1766 ns    273 ns    101 ns 325%
+---------------------------------------
+combining_tc_async              417 ns    118 ns     74 ns 446%
+combining_tc_async - dup        838 ns    212 ns     72 ns 455%
+---------------------------------------
+[       OK ] FlatCombining.direct_measurement (178622 ms)
+[----------] 2 tests from FlatCombining (633891 ms total)
+
+[----------] Global test environment tear-down
+[==========] 2 tests from 1 test case ran. (633891 ms total)
+[  PASSED  ] 2 tests.
+
+---
+
+$ lscpu
+
+Architecture:          x86_64
+CPU op-mode(s):        32-bit, 64-bit
+Byte Order:            Little Endian
+CPU(s):                32
+On-line CPU(s) list:   0-31
+Thread(s) per core:    2
+Core(s) per socket:    8
+Socket(s):             2
+NUMA node(s):          2
+Vendor ID:             GenuineIntel
+CPU family:            6
+Model:                 45
+Model name:            Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz
+Stepping:              6
+CPU MHz:               2200.000
+CPU max MHz:           2200.0000
+CPU min MHz:           1200.0000
+BogoMIPS:              4399.87
+Virtualization:        VT-x
+L1d cache:             32K
+L1i cache:             32K
+L2 cache:              256K
+L3 cache:              20480K
+NUMA node0 CPU(s):     0-7,16-23
+NUMA node1 CPU(s):     8-15,24-31
+
+Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep
+mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
+tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts
+rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq
+dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca
+sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx lahf_lm
+epb tpr_shadow vnmi flexpriority ept vpid xsaveopt dtherm arat pln pts
+
+---
+
+ */
diff --git a/folly/experimental/flat_combining/test/FlatCombiningExamples.h b/folly/experimental/flat_combining/test/FlatCombiningExamples.h
new file mode 100644 (file)
index 0000000..17c25ca
--- /dev/null
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+
+#include <folly/Baton.h>
+#include <folly/experimental/flat_combining/FlatCombining.h>
+
+namespace folly {
+
+struct Line {
+  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
+  uint64_t val_;
+};
+
+class Data { // Sequential data structure
+ public:
+  explicit Data(size_t size) : size_(size) {
+    x_ = std::make_unique<Line[]>(size_);
+  }
+
+  uint64_t getVal() {
+    uint64_t val = x_[0].val_;
+    for (size_t i = 1; i < size_; ++i) {
+      assert(x_[i].val_ == val);
+    }
+    return val;
+  }
+
+  // add
+
+  void add(uint64_t val) {
+    uint64_t oldval = x_[0].val_;
+    for (size_t i = 0; i < size_; ++i) {
+      assert(x_[i].val_ == oldval);
+      x_[i].val_ = oldval + val;
+    }
+  }
+
+  uint64_t fetchAdd(uint64_t val) {
+    uint64_t res = x_[0].val_;
+    for (size_t i = 0; i < size_; ++i) {
+      assert(x_[i].val_ == res);
+      x_[i].val_ += val;
+    }
+    return res;
+  }
+
+ private:
+  size_t size_;
+  std::unique_ptr<Line[]> x_;
+};
+
+// Example of FC concurrent data structure using simple interface
+
+template <
+    typename Mutex = std::mutex,
+    template <typename> class Atom = std::atomic>
+class FcSimpleExample
+    : public FlatCombining<FcSimpleExample<Mutex, Atom>, Mutex, Atom> {
+  using FC = FlatCombining<FcSimpleExample<Mutex, Atom>, Mutex, Atom>;
+  using Rec = typename FC::Rec;
+
+ public:
+  explicit FcSimpleExample(
+      size_t size,
+      bool dedicated = true,
+      uint32_t numRecs = 0,
+      uint32_t maxOps = 0)
+      : FC(dedicated, numRecs, maxOps), data_(size) {}
+
+  uint64_t getVal() {
+    return data_.getVal();
+  }
+
+  // add
+
+  void addNoFC(uint64_t val) {
+    this->requestNoFC([&] { data_.add(val); });
+  }
+
+  void add(uint64_t val, Rec* rec = nullptr) {
+    auto opFn = [&, val] { // asynchronous -- capture val by value
+      data_.add(val);
+    };
+    this->requestFC(opFn, rec, false);
+  }
+
+  // fetchAdd
+
+  uint64_t fetchAddNoFC(uint64_t val) {
+    uint64_t res;
+    auto opFn = [&] { res = data_.fetchAdd(val); };
+    this->requestNoFC(opFn);
+    return res;
+  }
+
+  uint64_t fetchAdd(uint64_t val, Rec* rec = nullptr) {
+    uint64_t res;
+    auto opFn = [&] { res = data_.fetchAdd(val); };
+    this->requestFC(opFn, rec);
+    return res;
+  }
+
+ private:
+  Data data_;
+};
+
+// Example of FC data structure using custom request processing
+
+class Req {
+ public:
+  enum class Type { ADD, FETCHADD };
+
+  void setType(Type type) {
+    type_ = type;
+  }
+
+  Type getType() {
+    return type_;
+  }
+
+  void setVal(uint64_t val) {
+    val_ = val;
+  }
+
+  uint64_t getVal() {
+    return val_;
+  }
+
+  void setRes(uint64_t res) {
+    res_ = res;
+  }
+
+  uint64_t getRes() {
+    return res_;
+  }
+
+ private:
+  Type type_;
+  uint64_t val_;
+  uint64_t res_;
+};
+
+template <
+    typename Req,
+    typename Mutex = std::mutex,
+    template <typename> class Atom = std::atomic>
+class FcCustomExample : public FlatCombining<
+                            FcCustomExample<Req, Mutex, Atom>,
+                            Mutex,
+                            Atom,
+                            Req> {
+  using FC = FlatCombining<FcCustomExample<Req, Mutex, Atom>, Mutex, Atom, Req>;
+  using Rec = typename FC::Rec;
+
+ public:
+  explicit FcCustomExample(
+      int size,
+      bool dedicated = true,
+      uint32_t numRecs = 0,
+      uint32_t maxOps = 0)
+      : FC(dedicated, numRecs, maxOps), data_(size) {}
+
+  uint64_t getVal() {
+    return data_.getVal();
+  }
+
+  // add
+
+  void addNoFC(uint64_t val) {
+    this->requestNoFC([&] { data_.add(val); });
+  }
+
+  void add(uint64_t val, Rec* rec = nullptr) {
+    auto opFn = [&, val] { data_.add(val); };
+    auto fillFn = [&](Req& req) {
+      req.setType(Req::Type::ADD);
+      req.setVal(val);
+    };
+    this->requestFC(opFn, fillFn, rec, false); // asynchronous
+  }
+
+  // fetchAdd
+
+  uint64_t fetchAddNoFC(uint64_t val) {
+    uint64_t res;
+    auto opFn = [&] { res = data_.fetchAdd(val); };
+    this->requestNoFC(opFn);
+    return res;
+  }
+
+  uint64_t fetchAdd(uint64_t val, Rec* rec = nullptr) {
+    uint64_t res;
+    auto opFn = [&] { res = data_.fetchAdd(val); };
+    auto fillFn = [&](Req& req) {
+      req.setType(Req::Type::FETCHADD);
+      req.setVal(val);
+    };
+    auto resFn = [&](Req& req) { res = req.getRes(); };
+    this->requestFC(opFn, fillFn, resFn, rec);
+    return res;
+  }
+
+  // custom combined op processing - overrides FlatCombining::combinedOp(Req&)
+  void combinedOp(Req& req) {
+    switch (req.getType()) {
+      case Req::Type::ADD: {
+        data_.add(req.getVal());
+      } break;
+      case Req::Type::FETCHADD: {
+        req.setRes(data_.fetchAdd(req.getVal()));
+      } break;
+      default: { assert(false); }
+    }
+  }
+
+ private:
+  Data data_;
+};
+
+} // namespace folly {
diff --git a/folly/experimental/flat_combining/test/FlatCombiningTest.cpp b/folly/experimental/flat_combining/test/FlatCombiningTest.cpp
new file mode 100644 (file)
index 0000000..ff745b2
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/experimental/flat_combining/test/FlatCombiningTestHelpers.h>
+
+#include <folly/portability/GTest.h>
+#include <glog/logging.h>
+
+using namespace folly::test;
+
+constexpr int LINES = 5;
+constexpr int NUM_RECS = 20;
+constexpr int WORK = 0;
+constexpr int ITERS = 100;
+
+static std::vector<int> nthr = {1, 10, 20};
+
+struct Params {
+  bool combining, simple, dedicated, tc, syncop;
+};
+
+class FlatCombiningTest : public ::testing::TestWithParam<Params> {};
+
+TEST_P(FlatCombiningTest, combining) {
+  Params p = GetParam();
+  for (auto n : nthr) {
+    run_test(
+        n,
+        LINES,
+        NUM_RECS,
+        WORK,
+        ITERS,
+        p.combining,
+        p.simple,
+        p.dedicated,
+        p.tc,
+        p.syncop,
+        true,
+        true);
+  }
+}
+
+TEST_P(FlatCombiningTest, more_threads_than_records) {
+  int n = 20;
+  int num_recs = 1;
+
+  Params p = GetParam();
+  run_test(
+      n,
+      LINES,
+      num_recs,
+      WORK,
+      ITERS,
+      p.combining,
+      p.simple,
+      p.dedicated,
+      p.tc,
+      p.syncop,
+      true,
+      true);
+}
+
+constexpr Params params[] = {
+    {false, false, false, false, false}, // no combining
+    // simple combining
+    //  dedicated
+    {true, true, true, false, true}, // no-tc sync
+    {true, true, true, false, false}, // no-tc async
+    {true, true, true, true, true}, // tc sync
+    {true, true, true, true, false}, // tc async
+    //   no dedicated
+    {true, true, false, false, true}, // no-tc sync
+    {true, true, false, false, false}, // no-tc async
+    {true, true, false, true, true}, // tc sync
+    {true, true, false, true, false}, // tc async
+    // custom combining
+    //  dedicated
+    {true, false, true, false, true}, // no-tc sync
+    {true, false, true, false, false}, // no-tc async
+    {true, false, true, true, true}, // tc sync
+    {true, false, true, true, false}, // tc async
+    //   no dedicated
+    {true, false, false, false, true}, // no-tc sync
+    {true, false, false, false, false}, // no-tc async
+    {true, false, false, true, true}, // tc sync
+    {true, false, false, true, false}, // tc async
+};
+
+INSTANTIATE_TEST_CASE_P(Foo, FlatCombiningTest, ::testing::ValuesIn(params));
diff --git a/folly/experimental/flat_combining/test/FlatCombiningTestHelpers.h b/folly/experimental/flat_combining/test/FlatCombiningTestHelpers.h
new file mode 100644 (file)
index 0000000..f9b4980
--- /dev/null
@@ -0,0 +1,210 @@
+/*
+ * Copyright 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/experimental/flat_combining/test/FlatCombiningExamples.h>
+
+#include <folly/Benchmark.h>
+#include <glog/logging.h>
+
+#include <atomic>
+#include <chrono>
+#include <thread>
+
+namespace folly {
+namespace test {
+
+void doWork(int work) {
+  uint64_t a = 0;
+  for (int i = work; i > 0; --i) {
+    a += i;
+  }
+  folly::doNotOptimizeAway(a);
+}
+
+template <
+    typename Example,
+    typename Req = bool,
+    typename Mutex = std::mutex,
+    template <typename> class Atom = std::atomic>
+uint64_t fc_test(
+    int nthreads,
+    int lines,
+    int numRecs,
+    int work,
+    int ops,
+    bool combining,
+    bool dedicated,
+    bool tc,
+    bool syncops,
+    bool excl = false,
+    bool allocAll = false) {
+  using FC = FlatCombining<Example, Mutex, Atom, Req>;
+  using Rec = typename FC::Rec;
+
+  folly::BenchmarkSuspender susp;
+
+  std::atomic<bool> start{false};
+  std::atomic<int> started{0};
+  Example ex(lines, dedicated, numRecs);
+  std::atomic<uint64_t> total{0};
+  bool mutex = false;
+
+  if (allocAll) {
+    std::vector<Rec*> v(numRecs);
+    for (int i = 0; i < numRecs; ++i) {
+      v[i] = ex.allocRec();
+    }
+    for (int i = numRecs; i > 0; --i) {
+      ex.freeRec(v[i - 1]);
+    }
+  }
+
+  std::vector<std::thread> threads(nthreads);
+  for (int tid = 0; tid < nthreads; ++tid) {
+    threads[tid] = std::thread([&, tid] {
+      started.fetch_add(1);
+      Rec* myrec = (combining && tc) ? ex.allocRec() : nullptr;
+      uint64_t sum = 0;
+      while (!start.load())
+        ;
+
+      if (!combining) {
+        // no combining
+        for (int i = tid; i < ops; i += nthreads) {
+          sum += ex.fetchAddNoFC(1);
+          doWork(work); // unrelated work
+        }
+      } else if (syncops) {
+        // sync combining
+        for (int i = tid; i < ops; i += nthreads) {
+          sum += ex.fetchAdd(1, myrec);
+          doWork(work); // unrelated work
+        }
+      } else {
+        // async combining
+        for (int i = tid; i < ops; i += nthreads) {
+          ex.add(1, myrec);
+          doWork(work); // unrelated work
+        }
+      }
+
+      if (excl) {
+        // test of unstructured exclusive access
+        ex.acquireExclusive();
+        {
+          CHECK(!mutex);
+          mutex = true;
+          VLOG(2) << tid << " " << ex.getVal() << " ...........";
+          using namespace std::chrono_literals;
+          /* sleep override */ // for coverage
+          std::this_thread::sleep_for(10ms);
+          VLOG(2) << tid << " " << ex.getVal() << " ===========";
+          CHECK(mutex);
+          mutex = false;
+        }
+        ex.releaseExclusive();
+      }
+
+      total.fetch_add(sum);
+      if (combining && tc) {
+        ex.freeRec(myrec);
+      }
+    });
+  }
+
+  while (started.load() < nthreads)
+    ;
+  auto tbegin = std::chrono::steady_clock::now();
+
+  // begin time measurement
+  susp.dismiss();
+  start.store(true);
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  if (!syncops) {
+    // complete any pending asynch ops
+    ex.drainAll();
+  }
+
+  // end time measurement
+  uint64_t duration = 0;
+  BENCHMARK_SUSPEND {
+    auto tend = std::chrono::steady_clock::now();
+    CHECK_EQ(ops, ex.getVal());
+    if (syncops) {
+      uint64_t n = (uint64_t)ops;
+      uint64_t expected = n * (n - 1) / 2;
+      CHECK_EQ(expected, total);
+    }
+    duration =
+        std::chrono::duration_cast<std::chrono::nanoseconds>(tend - tbegin)
+            .count();
+  }
+  return duration;
+}
+
+uint64_t run_test(
+    int nthreads,
+    int lines,
+    int numRecs,
+    int work,
+    int ops,
+    bool combining,
+    bool simple,
+    bool dedicated,
+    bool tc,
+    bool syncops,
+    bool excl = false,
+    bool allocAll = false) {
+  using M = std::mutex;
+  if (simple) {
+    using Example = FcSimpleExample<M>;
+    return fc_test<Example, bool, M>(
+        nthreads,
+        lines,
+        numRecs,
+        work,
+        ops,
+        combining,
+        dedicated,
+        tc,
+        syncops,
+        excl,
+        allocAll);
+  } else {
+    using Example = FcCustomExample<Req, M>;
+    return fc_test<Example, Req, M>(
+        nthreads,
+        lines,
+        numRecs,
+        work,
+        ops,
+        combining,
+        dedicated,
+        tc,
+        syncops,
+        excl,
+        allocAll);
+  }
+}
+
+} // namespace test {
+} // namespace folly {