From: Maged Michael Date: Thu, 23 Mar 2017 14:13:23 +0000 (-0700) Subject: Flat Combining X-Git-Tag: v2017.03.27.00~12 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=85361b0c20be740ccc91ed496f0fb73a7b8f0503;p=folly.git Flat Combining Summary: Flat combining template that takes the following template parameters: T Concurrent data structure using FC interface Mutex Mutex type (default std::mutex) Atom Atomic template (default std::atomic) Req Optional request structure to hold custom info (default dummy type bool) Flat combining (FC) was introduced in the SPAA 2010 paper Flat Combining and the Synchronization-Parallelism Tradeoff, by Danny Hendler, Itai Incze, Nir Shavit, and Moran Tzafrir. http://mcg.cs.tau.ac.il/projects/projects/flat-combining Reviewed By: djwatson Differential Revision: D4602402 fbshipit-source-id: 38327f752a3e92bb01e5496c321d8c87c818087a --- diff --git a/folly/experimental/flat_combining/FlatCombining.h b/folly/experimental/flat_combining/FlatCombining.h new file mode 100644 index 00000000..9c033503 --- /dev/null +++ b/folly/experimental/flat_combining/FlatCombining.h @@ -0,0 +1,658 @@ +/* + * Copyright 2017 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace folly { + +/// Flat combining (FC) was introduced in the SPAA 2010 paper Flat +/// Combining and the Synchronization-Parallelism Tradeoff, by Danny +/// Hendler, Itai Incze, Nir Shavit, and Moran Tzafrir. +/// http://mcg.cs.tau.ac.il/projects/projects/flat-combining +/// +/// FC is an alternative to coarse-grained locking for making +/// sequential data structures thread-safe while minimizing the +/// synchroniation overheads and cache coherence traffic associated +/// with locking. +/// +/// Under FC, when a thread finds the lock contended, it can +/// request (using a request record) that the lock holder execute its +/// operation on the shared data structure. There can be a designated +/// combiner thread or any thread can act as the combiner when it +/// holds the lock. +/// +/// Potential advantages of FC include: +/// - Reduced cache coherence traffic +/// - Reduced synchronization overheads, as the overheads of releasing +/// and acquiring the lock are eliminated from the critical path of +/// operating on the data structure. +/// - Opportunities for smart combining, where executing multiple +/// operations together may take less time than executng the +/// operations separately, e.g., K delete_min operations on a +/// priority queue may be combined to take O(K + log N) time instead +/// of O(K * log N). +/// +/// This implementation of flat combining supports: + +/// - A simple interface that requires minimal extra code by the +/// user. To use this interface efficiently the user-provided +/// functions must be copyable to folly::Functio without dynamic +/// allocation. If this is impossible or inconvenient, the user is +/// encouraged to use the custom interface described below. +/// - A custom interface that supports custom combinining and custom +/// request structure, either for the sake of smart combining or for +/// efficiently supporting operations that are not be copyable to +/// folly::Function without synamic allocation. +/// - Both synchronous and asynchronous operations. +/// - Request records with and without thread-caching. +/// - Combining with and without a dedicated combiner thread. +/// +/// This implementation differs from the algorithm in the SPAA 2010 paper: +/// - It does not require thread caching of request records +/// - It supports a dedicated combiner +/// - It supports asynchronous operations +/// +/// The generic FC class template supports generic data structures and +/// utilities with arbitrary operations. The template supports static +/// polymorphism for the combining function to enable custom smart +/// combining. +/// +/// A simple example of using the FC template: +/// class ConcurrentFoo : public FlatCombining { +/// Foo foo_; // sequential data structure +/// public: +/// T bar(V v) { // thread-safe execution of foo_.bar(v) +/// T result; +/// // Note: fn must be copyable to folly::Function without dynamic +/// // allocation. Otherwise, it is recommended to use the custom +/// // interface and manage the function arguments and results +/// // explicitly in a custom request structure. +/// auto fn = [&] { result = foo_.bar(v); }; +/// this->requestFC(fn); +/// return result; +/// } +/// }; +/// +/// See test/FlatCombiningExamples.h for more examples. See the +/// comments for requestFC() below for a list of simple and custom +/// variants of that function. + +template < + typename T, // concurrent data structure using FC interface + typename Mutex = std::mutex, + template class Atom = std::atomic, + typename Req = /* default dummy type */ bool> +class FlatCombining { + using SavedFn = folly::Function; + + public: + /// Combining request record. + class Rec { + FOLLY_ALIGN_TO_AVOID_FALSE_SHARING + folly::Baton valid_; + folly::Baton done_; + folly::Baton disconnected_; + size_t index_; + size_t next_; + uint64_t last_; + Req req_; + SavedFn fn_; + + public: + Rec() { + setDone(); + setDisconnected(); + } + + void setValid() { + valid_.post(); + } + + void clearValid() { + valid_.reset(); + } + + bool isValid() const { + return valid_.try_wait(); + } + + void setDone() { + done_.post(); + } + + void clearDone() { + done_.reset(); + } + + bool isDone() const { + return done_.try_wait(); + } + + void awaitDone() { + done_.wait(); + } + + void setDisconnected() { + disconnected_.post(); + } + + void clearDisconnected() { + disconnected_.reset(); + } + + bool isDisconnected() const { + return disconnected_.try_wait(); + } + + void setIndex(const size_t index) { + index_ = index; + } + + size_t getIndex() const { + return index_; + } + + void setNext(const size_t next) { + next_ = next; + } + + size_t getNext() const { + return next_; + } + + void setLast(const uint64_t pass) { + last_ = pass; + } + + uint64_t getLast() const { + return last_; + } + + Req& getReq() { + return req_; + } + + template + void setFn(Func&& fn) { + fn_ = std::forward(fn); + assert(fn_); + // If the following assertion is triggered, the user should + // either change the provided function, i.e., fn, to fit in + // folly::Function without allocation or use the custom + // interface to request combining for fn and manage its + // arguments (and results, if any) explicitly in a custom + // request structure. + assert(!fn_.hasAllocatedMemory()); + } + + void clearFn() { + fn_ = {}; + assert(!fn_); + } + + SavedFn& getFn() { + return fn_; + } + + void complete() { + clearValid(); + assert(!isDone()); + setDone(); + } + }; + + using Pool = folly::IndexedMemPool; + + public: + /// The constructor takes three optional arguments: + /// - Optional dedicated combiner thread (default true) + /// - Number of records (if 0, then kDefaultNumRecs) + /// - A hint for the max. number of combined operations per + /// combining session that is checked at the beginning of each pass + /// on the request records (if 0, then kDefaultMaxops) + explicit FlatCombining( + const bool dedicated = true, + uint32_t numRecs = 0, // number of combining records + const uint32_t maxOps = 0 // hint of max ops per combining session + ) + : numRecs_(numRecs == 0 ? kDefaultNumRecs : numRecs), + maxOps_(maxOps == 0 ? kDefaultMaxOps : maxOps), + recs_(NULL_INDEX), + dedicated_(dedicated), + recsPool_(numRecs_) { + if (dedicated_) { + // dedicated combiner thread + combiner_ = std::thread([this] { dedicatedCombining(); }); + } + } + + /// Destructor: If there is a dedicated combiner, the destructor + /// flags it to shutdown. Otherwise, the destructor waits for all + /// pending asynchronous requests to be completed. + ~FlatCombining() { + if (dedicated_) { + shutdown(); + combiner_.join(); + } else { + drainAll(); + } + } + + // Wait for all pending operations to complete. Useful primarily + // when there are asynchronous operations without a dedicated + // combiner. + void drainAll() { + for (size_t i = getRecsHead(); i != NULL_INDEX; i = nextIndex(i)) { + Rec& rec = recsPool_[i]; + awaitDone(rec); + } + } + + // Give the caller exclusive access. + void acquireExclusive() { + m_.lock(); + } + + // Try to give the caller exclusive access. Returns true iff successful. + bool tryExclusive() { + return m_.try_lock(); + } + + // Release exclusive access. The caller must have exclusive access. + void releaseExclusive() { + m_.unlock(); + } + + // Execute an operation without combining + template + void requestNoFC(OpFunc& opFn) { + std::lock_guard guard(m_); + opFn(); + } + + // This function first tries to execute the operation without + // combining. If unuccessful, it allocates a combining record if + // needed. If there are no available records, it waits for exclusive + // access and executes the operation. If a record is available and + // ready for use, it fills the record and indicates that the request + // is valid for combining. If the request is synchronous (by default + // or necessity), it waits for the operation to be completed by a + // combiner and optionally extracts the result, if any. + // + // This function can be called in several forms: + // Simple forms that do not require the user to define a Req structure + // or to override any request processing member functions: + // requestFC(opFn) + // requestFC(opFn, rec) // provides its own pre-allocated record + // requestFC(opFn, rec, syncop) // asynchronous if syncop == false + // Custom forms that require the user to define a Req structure and to + // override some request processing member functions: + // requestFC(opFn, fillFn) + // requestFC(opFn, fillFn, rec) + // requestFC(opFn, fillFn, rec, syncop) + // requestFC(opFn, fillFn, resFn) + // requestFC(opFn, fillFn, resFn, rec) + template + void requestFC(OpFunc&& opFn, Rec* rec = nullptr, bool syncop = true) { + auto dummy = [](Req&) {}; + requestOp( + std::forward(opFn), + dummy /* fillFn */, + dummy /* resFn */, + rec, + syncop, + false /* simple */); + } + template + void requestFC( + OpFunc&& opFn, + const FillFunc& fillFn, + Rec* rec = nullptr, + bool syncop = true) { + auto dummy = [](Req&) {}; + requestOp( + std::forward(opFn), + fillFn, + dummy /* resFn */, + rec, + syncop, + true /* custom */); + } + template + void requestFC( + OpFunc&& opFn, + const FillFunc& fillFn, + const ResFn& resFn, + Rec* rec = nullptr) { + // must wait for result to execute resFn -- so it must be synchronous + requestOp( + std::forward(opFn), + fillFn, + resFn, + rec, + true /* sync */, + true /* custom*/); + } + + // Allocate a record. + Rec* allocRec() { + auto idx = recsPool_.allocIndex(); + if (idx == NULL_INDEX) { + outOfSpaceCount_.fetch_add(1); + return nullptr; + } + Rec& rec = recsPool_[idx]; + rec.setIndex(idx); + return &rec; + } + + // Free a record + void freeRec(Rec* rec) { + if (rec == nullptr) { + return; + } + auto idx = rec->getIndex(); + recsPool_.recycleIndex(idx); + } + + // Returns a count of the number of combined operations so far. + uint64_t getCombinedOpCount() { + std::lock_guard guard(m_); + return combined_; + } + + // Returns a count of the number of combining passes so far. + uint64_t getCombiningPasses() { + std::lock_guard guard(m_); + return passes_; + } + + uint64_t getOutOfSpaceCount() { + return outOfSpaceCount_.load(); + } + + protected: + const size_t NULL_INDEX = 0; + const uint32_t kDefaultMaxOps = 100; + const uint64_t kDefaultNumRecs = 64; + const uint64_t kIdleThreshold = 10; + + FOLLY_ALIGN_TO_AVOID_FALSE_SHARING + Mutex m_; + + FOLLY_ALIGN_TO_AVOID_FALSE_SHARING + folly::Baton pending_; + Atom shutdown_{false}; + + FOLLY_ALIGN_TO_AVOID_FALSE_SHARING + uint32_t numRecs_; + uint32_t maxOps_; + Atom recs_; + bool dedicated_; + std::thread combiner_; + Pool recsPool_; + + FOLLY_ALIGN_TO_AVOID_FALSE_SHARING + uint64_t combined_ = 0; + uint64_t passes_ = 0; + uint64_t sessions_ = 0; + Atom outOfSpaceCount_{0}; + + template + void requestOp( + OpFunc&& opFn, + const FillFunc& fillFn, + const ResFn& resFn, + Rec* rec, + bool syncop, + const bool custom) { + std::unique_lock l(this->m_, std::defer_lock); + if (l.try_lock()) { + // No contention + tryCombining(); + opFn(); + return; + } + + // Try FC + bool tc = (rec != nullptr); + if (!tc) { + // if an async op doesn't have a thread-cached record then turn + // it into a synchronous op. + syncop = true; + rec = allocRec(); + } + if (rec == nullptr) { + // Can't use FC - Must acquire lock + l.lock(); + opFn(); + return; + } + + // Use FC + // Wait if record is in use + awaitDone(*rec); + rec->clearDone(); + // Fill record + if (custom) { + // Fill the request (custom) + Req& req = rec->getReq(); + fillFn(req); + rec->clearFn(); + } else { + rec->setFn(std::forward(opFn)); + } + // Indicate that record is valid + assert(!rec->isValid()); + rec->setValid(); + // end of combining critical path + setPending(); + // store-load order setValid before isDisconnected + std::atomic_thread_fence(std::memory_order_seq_cst); + if (rec->isDisconnected()) { + rec->clearDisconnected(); + pushRec(rec->getIndex()); + setPending(); + } + // If synchronous wait for the request to be completed + if (syncop) { + awaitDone(*rec); + if (custom) { + Req& req = rec->getReq(); + resFn(req); // Extract the result (custom) + } + if (!tc) { + freeRec(rec); // Free the temporary record. + } + } + } + + void pushRec(size_t idx) { + Rec& rec = recsPool_[idx]; + while (true) { + auto head = recs_.load(std::memory_order_acquire); + rec.setNext(head); // there shouldn't be a data race here + if (recs_.compare_exchange_weak(head, idx)) { + return; + } + } + } + + size_t getRecsHead() { + return recs_.load(std::memory_order_acquire); + } + + size_t nextIndex(size_t idx) { + return recsPool_[idx].getNext(); + } + + void clearPending() { + pending_.reset(); + } + + void setPending() { + pending_.post(); + } + + bool isPending() const { + return pending_.try_wait(); + } + + void awaitPending() { + pending_.wait(); + } + + uint64_t combiningSession() { + uint64_t combined = 0; + do { + uint64_t count = static_cast(this)->combiningPass(); + if (count == 0) { + break; + } + combined += count; + ++this->passes_; + } while (combined < this->maxOps_); + return combined; + } + + void tryCombining() { + if (!dedicated_) { + while (isPending()) { + clearPending(); + combined_ += combiningSession(); + } + } + } + + void dedicatedCombining() { + while (true) { + awaitPending(); + clearPending(); + if (shutdown_.load()) { + break; + } + while (true) { + uint64_t count; + ++sessions_; + { + std::lock_guard guard(m_); + count = combiningSession(); + combined_ += count; + } + if (count < maxOps_) { + break; + } + } + } + } + + void awaitDone(Rec& rec) { + if (dedicated_) { + rec.awaitDone(); + } else { + awaitDoneTryLock(rec); + } + } + + /// Waits for the request to be done and occasionally tries to + /// acquire the lock and to do combining. Used only in the absence + /// of a dedicated combiner. + void awaitDoneTryLock(Rec& rec) { + assert(!dedicated_); + int count = 0; + while (!rec.isDone()) { + if (count == 0) { + std::unique_lock l(m_, std::defer_lock); + if (l.try_lock()) { + setPending(); + tryCombining(); + } + } else { + folly::asm_volatile_pause(); + if (++count == 1000) { + count = 0; + } + } + } + } + + void shutdown() { + shutdown_.store(true); + setPending(); + } + + /// The following member functions may be overridden for customization + + void combinedOp(Req&) { + throw std::runtime_error( + "FlatCombining::combinedOp(Req&) must be overridden in the derived" + " class if called."); + } + + void processReq(Rec& rec) { + SavedFn& opFn = rec.getFn(); + if (opFn) { + // simple interface + opFn(); + } else { + // custom interface + Req& req = rec.getReq(); + static_cast(this)->combinedOp(req); // defined in derived class + } + rec.setLast(passes_); + rec.complete(); + } + + uint64_t combiningPass() { + uint64_t count = 0; + auto idx = getRecsHead(); + Rec* prev = nullptr; + while (idx != NULL_INDEX) { + Rec& rec = recsPool_[idx]; + auto next = rec.getNext(); + bool valid = rec.isValid(); + if (!valid && (passes_ - rec.getLast() > kIdleThreshold) && + (prev != nullptr)) { + // Disconnect + prev->setNext(next); + rec.setDisconnected(); + // store-load order setDisconnected before isValid + std::atomic_thread_fence(std::memory_order_seq_cst); + valid = rec.isValid(); + } else { + prev = &rec; + } + if (valid) { + processReq(rec); + ++count; + } + idx = next; + } + return count; + } +}; + +} // namespace folly { diff --git a/folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpp b/folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpp new file mode 100644 index 00000000..457a7a66 --- /dev/null +++ b/folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpp @@ -0,0 +1,1787 @@ +/* + * Copyright 2017 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +using namespace folly::test; + +// use option --benchmark to run folly::Benchmark +// use option --direct to run direct benchmark measurements +DEFINE_bool(direct, false, "run direct measurement"); +DEFINE_int32(reps, 10, "number of reps"); +DEFINE_int32(ops, 100000, "number of operations per rep"); +DEFINE_int32(lines, 5, "number of cache lines accessed per operation"); +DEFINE_int32(numRecs, 8, "number of records"); +DEFINE_int32(work, 1000, "amount of unrelated work per operation"); + +static std::vector nthr = {1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64}; +static int nthreads; +static bool fc; +static bool simple; +static bool dedicated; +static bool tc; +static bool syncops; + +// baseline - no combining +BENCHMARK(no_combining_base, iters) { + fc = false; + dedicated = false; + tc = false; + syncops = false; + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_RELATIVE(no_combining_dup, iters) { + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} +BENCHMARK_DRAW_LINE() + +// dedicated combiner + +BENCHMARK_DRAW_LINE() + +BENCHMARK_RELATIVE(combining_dedicated_notc_sync, iters) { + fc = true; + dedicated = true; + tc = false; + syncops = true; + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_RELATIVE(combining_dedicated_notc_sync_dup, iters) { + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_DRAW_LINE() + +BENCHMARK_RELATIVE(combining_dedicated_notc_async, iters) { + syncops = false; + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_RELATIVE(combining_dedicated_notc_async_dup, iters) { + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_DRAW_LINE() + +BENCHMARK_RELATIVE(combining_dedicated_tc_sync, iters) { + tc = true; + syncops = true; + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_RELATIVE(combining_dedicated_tc_sync_dup, iters) { + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_DRAW_LINE() + +BENCHMARK_RELATIVE(combining_dedicated_tc_async, iters) { + tc = true; + syncops = false; + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_RELATIVE(combining_dedicated_tc_async_dup, iters) { + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_DRAW_LINE() + +// no dedicated combiner + +BENCHMARK_DRAW_LINE() + +BENCHMARK_RELATIVE(combining_no_dedicated_notc_sync, iters) { + dedicated = false; + tc = false; + syncops = true; + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_RELATIVE(combining_no_dedicated_notc_sync_dup, iters) { + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_DRAW_LINE() + +BENCHMARK_RELATIVE(combining_no_dedicated_notc_async, iters) { + syncops = false; + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_RELATIVE(combining_no_dedicated_notc_async_dup, iters) { + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_DRAW_LINE() + +BENCHMARK_RELATIVE(combining_no_dedicated_tc_sync, iters) { + tc = true; + syncops = true; + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_RELATIVE(combining_no_dedicated_tc_sync_dup, iters) { + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_DRAW_LINE() + +BENCHMARK_RELATIVE(combining_no_dedicated_tc_async, iters) { + tc = true; + syncops = false; + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_RELATIVE(combining_no_dedicated_tc_async_dup, iters) { + run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + iters, + fc, + simple, + dedicated, + tc, + syncops); +} + +BENCHMARK_DRAW_LINE() + +void benchmarkSetup() { + int numCores = std::thread::hardware_concurrency(); + std::cout << "\nRunning benchmarks on machine with " << numCores + << " logical cores" << std::endl; +} + +TEST(FlatCombining, folly_benchmark) { + if (FLAGS_benchmark) { + benchmarkSetup(); + for (bool b : {true, false}) { + simple = b; + std::string str = simple ? "simple" : "custom"; + std::cout << "\n------------------------------------ " << str + << " interface" << std::endl; + for (int i : nthr) { + std::cout << "\n---------------------------------- Number of threads = " + << i << std::endl; + nthreads = i; + folly::runBenchmarks(); + } + } + } +} + +// Direct measurement - not using folly::Benchmark + +static uint64_t test( + std::string name, + bool fc, + bool dedicated, + bool tc, + bool syncops, + uint64_t base) { + uint64_t min = UINTMAX_MAX; + uint64_t max = 0; + uint64_t sum = 0; + + for (int i = 0; i < FLAGS_reps; ++i) { + uint64_t dur = run_test( + nthreads, + FLAGS_lines, + FLAGS_numRecs, + FLAGS_work, + FLAGS_ops, + fc, + simple, + dedicated, + tc, + syncops); + sum += dur; + min = std::min(min, dur); + max = std::max(max, dur); + } + uint64_t avg = sum / FLAGS_reps; + + uint64_t res = min; + std::cout << name; + std::cout << " " << std::setw(4) << max / FLAGS_ops << " ns"; + std::cout << " " << std::setw(4) << avg / FLAGS_ops << " ns"; + std::cout << " " << std::setw(4) << res / FLAGS_ops << " ns"; + if (base) { + std::cout << " " << std::setw(3) << 100 * base / res << "%"; + } + std::cout << std::endl; + return res; +} + +TEST(FlatCombining, direct_measurement) { + if (!FLAGS_direct) { + return; + } + benchmarkSetup(); + simple = false; + std::string str = simple ? "simple" : "custom"; + std::cout << "\n------------------------------------ " << str << " interface" + << std::endl; + for (int i : nthr) { + nthreads = i; + std::cout << "\n------------------------------------ Number of threads = " + << i << "\n" + << std::endl; + std::cout << "Test_name, Max time, Avg time, Min time, % base min / min\n" + << std::endl; + + uint64_t base = + test("no_combining - base ", false, false, false, false, 0); + test("no_combining - dup ", false, false, false, false, base); + std::cout << "---------------------------------------" << std::endl; + + std::cout << "---- dedicated-------------------------" << std::endl; + test("combining_notc_sync ", true, true, false, true, base); + test("combining_notc_sync - dup ", true, true, false, true, base); + std::cout << "---------------------------------------" << std::endl; + test("combining_notc_async ", true, true, false, false, base); + test("combining_notc_async - dup ", true, true, false, false, base); + std::cout << "---------------------------------------" << std::endl; + test("combining_tc_sync ", true, true, true, true, base); + test("combining_tc_sync - dup ", true, true, true, true, base); + std::cout << "---------------------------------------" << std::endl; + test("combining_tc_async ", true, true, true, false, base); + test("combining_tc_async - dup ", true, true, true, false, base); + std::cout << "---------------------------------------" << std::endl; + + std::cout << "---- no dedicated----------------------" << std::endl; + test("combining_notc_sync ", true, false, false, true, base); + test("combining_notc_sync - dup ", true, false, false, true, base); + std::cout << "---------------------------------------" << std::endl; + test("combining_notc_async ", true, false, false, false, base); + test("combining_notc_async - dup ", true, false, false, false, base); + std::cout << "---------------------------------------" << std::endl; + test("combining_tc_sync ", true, false, true, true, base); + test("combining_tc_sync - dup ", true, false, true, true, base); + std::cout << "---------------------------------------" << std::endl; + test("combining_tc_async ", true, false, true, false, base); + test("combining_tc_async - dup ", true, false, true, false, base); + std::cout << "---------------------------------------" << std::endl; + } +} + +/* +See benchmark results in https://phabricator.intern.facebook.com/P57204895 + +The results are from a run using the command +$ numactl -N 1 flat_combining_benchmark --benchmark --bm_min_iters=100000 --direct + +Using the default parameters of the benchmark: In each iteration, the +operation on the shared data structure updates 5 cache lines and +performs unrelated work (~300ns) after each operation. The benchmark +doesn't do any smart combining (i.e., saving or dropping some work +based on understanding the details of the combined operations). + +Direct measurements are used to evaluate the high variance in some cases. +Duplicate runs are included in order to assess the relevance of outliers. + +---- +[==========] Running 2 tests from 1 test case. +[----------] Global test environment set-up. +[----------] 2 tests from FlatCombining +[ RUN ] FlatCombining.folly_benchmark + +Running benchmarks on machine with 32 logical cores + +------------------------------------ simple interface + +---------------------------------- Number of threads = 1 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 330.43ns 3.03M +no_combining_dup 100.09% 330.13ns 3.03M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 93.17% 354.66ns 2.82M +combining_dedicated_notc_sync_dup 93.57% 353.15ns 2.83M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 99.35% 332.60ns 3.01M +combining_dedicated_notc_async_dup 99.07% 333.54ns 3.00M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 93.05% 355.13ns 2.82M +combining_dedicated_tc_sync_dup 92.87% 355.81ns 2.81M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 99.17% 333.21ns 3.00M +combining_dedicated_tc_async_dup 99.28% 332.84ns 3.00M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 93.51% 353.38ns 2.83M +combining_no_dedicated_notc_sync_dup 93.27% 354.26ns 2.82M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 99.40% 332.44ns 3.01M +combining_no_dedicated_notc_async_dup 99.13% 333.34ns 3.00M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 93.38% 353.86ns 2.83M +combining_no_dedicated_tc_sync_dup 93.52% 353.31ns 2.83M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 99.29% 332.78ns 3.00M +combining_no_dedicated_tc_async_dup 99.19% 333.11ns 3.00M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 2 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 213.60ns 4.68M +no_combining_dup 100.84% 211.82ns 4.72M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 89.84% 237.76ns 4.21M +combining_dedicated_notc_sync_dup 89.85% 237.73ns 4.21M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 93.80% 227.72ns 4.39M +combining_dedicated_notc_async_dup 87.85% 243.15ns 4.11M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 86.81% 246.06ns 4.06M +combining_dedicated_tc_sync_dup 87.15% 245.09ns 4.08M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 92.14% 231.82ns 4.31M +combining_dedicated_tc_async_dup 92.04% 232.08ns 4.31M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 95.20% 224.36ns 4.46M +combining_no_dedicated_notc_sync_dup 95.40% 223.91ns 4.47M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 95.41% 223.89ns 4.47M +combining_no_dedicated_notc_async_dup 95.86% 222.82ns 4.49M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 94.43% 226.21ns 4.42M +combining_no_dedicated_tc_sync_dup 94.28% 226.56ns 4.41M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 96.62% 221.07ns 4.52M +combining_no_dedicated_tc_async_dup 97.24% 219.66ns 4.55M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 3 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 188.20ns 5.31M +no_combining_dup 94.07% 200.07ns 5.00M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 95.39% 197.30ns 5.07M +combining_dedicated_notc_sync_dup 94.50% 199.16ns 5.02M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 75.29% 249.96ns 4.00M +combining_dedicated_notc_async_dup 72.97% 257.91ns 3.88M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 91.26% 206.22ns 4.85M +combining_dedicated_tc_sync_dup 90.68% 207.54ns 4.82M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 89.64% 209.95ns 4.76M +combining_dedicated_tc_async_dup 88.21% 213.36ns 4.69M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 96.19% 195.66ns 5.11M +combining_no_dedicated_notc_sync_dup 93.27% 201.78ns 4.96M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 81.12% 231.99ns 4.31M +combining_no_dedicated_notc_async_dup 82.48% 228.19ns 4.38M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 79.48% 236.78ns 4.22M +combining_no_dedicated_tc_sync_dup 79.73% 236.04ns 4.24M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 100.70% 186.90ns 5.35M +combining_no_dedicated_tc_async_dup 99.43% 189.27ns 5.28M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 4 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 242.84ns 4.12M +no_combining_dup 100.78% 240.96ns 4.15M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 100.91% 240.65ns 4.16M +combining_dedicated_notc_sync_dup 99.76% 243.42ns 4.11M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 102.06% 237.95ns 4.20M +combining_dedicated_notc_async_dup 101.63% 238.94ns 4.19M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 109.79% 221.18ns 4.52M +combining_dedicated_tc_sync_dup 108.94% 222.92ns 4.49M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 133.01% 182.58ns 5.48M +combining_dedicated_tc_async_dup 134.91% 180.00ns 5.56M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 108.77% 223.25ns 4.48M +combining_no_dedicated_notc_sync_dup 107.64% 225.61ns 4.43M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 115.14% 210.91ns 4.74M +combining_no_dedicated_notc_async_dup 115.06% 211.05ns 4.74M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 116.36% 208.70ns 4.79M +combining_no_dedicated_tc_sync_dup 115.70% 209.89ns 4.76M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 159.69% 152.07ns 6.58M +combining_no_dedicated_tc_async_dup 158.27% 153.43ns 6.52M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 6 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 281.36ns 3.55M +no_combining_dup 98.56% 285.46ns 3.50M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 132.39% 212.51ns 4.71M +combining_dedicated_notc_sync_dup 133.10% 211.38ns 4.73M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 141.35% 199.05ns 5.02M +combining_dedicated_notc_async_dup 143.18% 196.51ns 5.09M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 138.94% 202.50ns 4.94M +combining_dedicated_tc_sync_dup 138.64% 202.93ns 4.93M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 199.76% 140.85ns 7.10M +combining_dedicated_tc_async_dup 200.28% 140.48ns 7.12M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 155.48% 180.96ns 5.53M +combining_no_dedicated_notc_sync_dup 150.82% 186.55ns 5.36M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 162.23% 173.43ns 5.77M +combining_no_dedicated_notc_async_dup 161.33% 174.39ns 5.73M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 167.90% 167.57ns 5.97M +combining_no_dedicated_tc_sync_dup 164.84% 170.69ns 5.86M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 242.51% 116.02ns 8.62M +combining_no_dedicated_tc_async_dup 245.67% 114.53ns 8.73M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 8 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 315.57ns 3.17M +no_combining_dup 98.83% 319.32ns 3.13M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 170.48% 185.11ns 5.40M +combining_dedicated_notc_sync_dup 174.57% 180.77ns 5.53M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 178.57% 176.72ns 5.66M +combining_dedicated_notc_async_dup 181.30% 174.06ns 5.75M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 195.40% 161.50ns 6.19M +combining_dedicated_tc_sync_dup 197.18% 160.05ns 6.25M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 322.03% 97.99ns 10.20M +combining_dedicated_tc_async_dup 324.51% 97.24ns 10.28M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 205.61% 153.48ns 6.52M +combining_no_dedicated_notc_sync_dup 204.94% 153.98ns 6.49M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 217.81% 144.88ns 6.90M +combining_no_dedicated_notc_async_dup 218.58% 144.37ns 6.93M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 223.96% 140.91ns 7.10M +combining_no_dedicated_tc_sync_dup 224.55% 140.53ns 7.12M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 364.58% 86.56ns 11.55M +combining_no_dedicated_tc_async_dup 363.33% 86.86ns 11.51M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 12 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 353.59ns 2.83M +no_combining_dup 99.91% 353.91ns 2.83M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 276.36% 127.95ns 7.82M +combining_dedicated_notc_sync_dup 278.88% 126.79ns 7.89M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 249.52% 141.71ns 7.06M +combining_dedicated_notc_async_dup 247.26% 143.00ns 6.99M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 318.57% 110.99ns 9.01M +combining_dedicated_tc_sync_dup 326.27% 108.37ns 9.23M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 428.50% 82.52ns 12.12M +combining_dedicated_tc_async_dup 429.19% 82.39ns 12.14M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 276.54% 127.86ns 7.82M +combining_no_dedicated_notc_sync_dup 275.59% 128.31ns 7.79M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 298.92% 118.29ns 8.45M +combining_no_dedicated_notc_async_dup 298.93% 118.28ns 8.45M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 300.56% 117.64ns 8.50M +combining_no_dedicated_tc_sync_dup 296.95% 119.07ns 8.40M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 431.06% 82.03ns 12.19M +combining_no_dedicated_tc_async_dup 430.40% 82.15ns 12.17M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 16 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 358.57ns 2.79M +no_combining_dup 99.97% 358.70ns 2.79M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 319.73% 112.15ns 8.92M +combining_dedicated_notc_sync_dup 327.86% 109.37ns 9.14M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 296.17% 121.07ns 8.26M +combining_dedicated_notc_async_dup 306.86% 116.85ns 8.56M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 337.53% 106.24ns 9.41M +combining_dedicated_tc_sync_dup 347.98% 103.04ns 9.70M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 423.80% 84.61ns 11.82M +combining_dedicated_tc_async_dup 421.07% 85.16ns 11.74M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 321.94% 111.38ns 8.98M +combining_no_dedicated_notc_sync_dup 318.54% 112.57ns 8.88M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 364.71% 98.32ns 10.17M +combining_no_dedicated_notc_async_dup 364.22% 98.45ns 10.16M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 322.91% 111.04ns 9.01M +combining_no_dedicated_tc_sync_dup 322.42% 111.21ns 8.99M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 466.30% 76.90ns 13.00M +combining_no_dedicated_tc_async_dup 462.76% 77.49ns 12.91M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 24 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 348.54ns 2.87M +no_combining_dup 99.96% 348.69ns 2.87M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 260.21% 133.95ns 7.47M +combining_dedicated_notc_sync_dup 257.84% 135.18ns 7.40M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 242.25% 143.88ns 6.95M +combining_dedicated_notc_async_dup 235.88% 147.76ns 6.77M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 262.45% 132.80ns 7.53M +combining_dedicated_tc_sync_dup 251.14% 138.78ns 7.21M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 256.89% 135.68ns 7.37M +combining_dedicated_tc_async_dup 304.76% 114.37ns 8.74M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 270.20% 129.00ns 7.75M +combining_no_dedicated_notc_sync_dup 271.69% 128.29ns 7.80M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 298.35% 116.82ns 8.56M +combining_no_dedicated_notc_async_dup 289.04% 120.59ns 8.29M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 286.59% 121.62ns 8.22M +combining_no_dedicated_tc_sync_dup 292.21% 119.28ns 8.38M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 471.86% 73.87ns 13.54M +combining_no_dedicated_tc_async_dup 458.16% 76.08ns 13.14M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 32 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 337.61ns 2.96M +no_combining_dup 99.41% 339.60ns 2.94M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 204.50% 165.09ns 6.06M +combining_dedicated_notc_sync_dup 233.28% 144.72ns 6.91M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 187.20% 180.35ns 5.54M +combining_dedicated_notc_async_dup 192.76% 175.15ns 5.71M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 220.56% 153.07ns 6.53M +combining_dedicated_tc_sync_dup 207.62% 162.61ns 6.15M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 317.11% 106.46ns 9.39M +combining_dedicated_tc_async_dup 318.92% 105.86ns 9.45M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 259.29% 130.21ns 7.68M +combining_no_dedicated_notc_sync_dup 248.33% 135.95ns 7.36M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 290.40% 116.26ns 8.60M +combining_no_dedicated_notc_async_dup 299.92% 112.57ns 8.88M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 281.91% 119.76ns 8.35M +combining_no_dedicated_tc_sync_dup 284.19% 118.80ns 8.42M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 435.16% 77.58ns 12.89M +combining_no_dedicated_tc_async_dup 389.67% 86.64ns 11.54M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 48 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 334.48ns 2.99M +no_combining_dup 100.00% 334.46ns 2.99M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 257.01% 130.14ns 7.68M +combining_dedicated_notc_sync_dup 254.13% 131.62ns 7.60M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 189.56% 176.45ns 5.67M +combining_dedicated_notc_async_dup 247.68% 135.05ns 7.40M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 259.47% 128.91ns 7.76M +combining_dedicated_tc_sync_dup 281.34% 118.89ns 8.41M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 301.96% 110.77ns 9.03M +combining_dedicated_tc_async_dup 347.65% 96.21ns 10.39M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 268.45% 124.60ns 8.03M +combining_no_dedicated_notc_sync_dup 272.54% 122.73ns 8.15M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 306.04% 109.29ns 9.15M +combining_no_dedicated_notc_async_dup 294.38% 113.62ns 8.80M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 280.89% 119.08ns 8.40M +combining_no_dedicated_tc_sync_dup 276.01% 121.18ns 8.25M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 466.45% 71.71ns 13.95M +combining_no_dedicated_tc_async_dup 465.45% 71.86ns 13.92M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 64 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 335.68ns 2.98M +no_combining_dup 101.03% 332.25ns 3.01M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 272.91% 123.00ns 8.13M +combining_dedicated_notc_sync_dup 270.56% 124.07ns 8.06M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 200.44% 167.47ns 5.97M +combining_dedicated_notc_async_dup 208.36% 161.10ns 6.21M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 258.40% 129.91ns 7.70M +combining_dedicated_tc_sync_dup 249.16% 134.72ns 7.42M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 378.86% 88.60ns 11.29M +combining_dedicated_tc_async_dup 299.32% 112.15ns 8.92M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 272.18% 123.33ns 8.11M +combining_no_dedicated_notc_sync_dup 275.26% 121.95ns 8.20M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 296.23% 113.32ns 8.82M +combining_no_dedicated_notc_async_dup 311.17% 107.88ns 9.27M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 283.30% 118.49ns 8.44M +combining_no_dedicated_tc_sync_dup 263.86% 127.22ns 7.86M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 426.62% 78.68ns 12.71M +combining_no_dedicated_tc_async_dup 445.17% 75.40ns 13.26M +---------------------------------------------------------------------------- +============================================================================ + +------------------------------------ custom interface + +---------------------------------- Number of threads = 1 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 329.49ns 3.03M +no_combining_dup 99.91% 329.79ns 3.03M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 98.69% 333.88ns 3.00M +combining_dedicated_notc_sync_dup 98.70% 333.83ns 3.00M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 98.22% 335.47ns 2.98M +combining_dedicated_notc_async_dup 98.16% 335.66ns 2.98M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 98.70% 333.85ns 3.00M +combining_dedicated_tc_sync_dup 98.78% 333.58ns 3.00M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 98.14% 335.73ns 2.98M +combining_dedicated_tc_async_dup 97.92% 336.49ns 2.97M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 98.94% 333.00ns 3.00M +combining_no_dedicated_notc_sync_dup 98.86% 333.29ns 3.00M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 98.36% 334.99ns 2.99M +combining_no_dedicated_notc_async_dup 98.61% 334.15ns 2.99M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 99.07% 332.58ns 3.01M +combining_no_dedicated_tc_sync_dup 99.12% 332.41ns 3.01M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 97.08% 339.38ns 2.95M +combining_no_dedicated_tc_async_dup 97.54% 337.81ns 2.96M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 2 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 216.71ns 4.61M +no_combining_dup 100.34% 215.97ns 4.63M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 95.42% 227.11ns 4.40M +combining_dedicated_notc_sync_dup 94.16% 230.15ns 4.34M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 91.84% 235.97ns 4.24M +combining_dedicated_notc_async_dup 91.41% 237.08ns 4.22M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 96.79% 223.90ns 4.47M +combining_dedicated_tc_sync_dup 96.54% 224.47ns 4.45M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 90.90% 238.41ns 4.19M +combining_dedicated_tc_async_dup 95.45% 227.03ns 4.40M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 101.13% 214.28ns 4.67M +combining_no_dedicated_notc_sync_dup 100.11% 216.48ns 4.62M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 96.40% 224.80ns 4.45M +combining_no_dedicated_notc_async_dup 96.36% 224.90ns 4.45M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 100.86% 214.85ns 4.65M +combining_no_dedicated_tc_sync_dup 101.91% 212.65ns 4.70M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 95.66% 226.54ns 4.41M +combining_no_dedicated_tc_async_dup 95.88% 226.03ns 4.42M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 3 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 189.61ns 5.27M +no_combining_dup 100.22% 189.20ns 5.29M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 103.18% 183.76ns 5.44M +combining_dedicated_notc_sync_dup 103.66% 182.92ns 5.47M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 77.14% 245.81ns 4.07M +combining_dedicated_notc_async_dup 90.25% 210.10ns 4.76M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 89.88% 210.95ns 4.74M +combining_dedicated_tc_sync_dup 87.83% 215.90ns 4.63M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 89.33% 212.26ns 4.71M +combining_dedicated_tc_async_dup 85.19% 222.56ns 4.49M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 98.43% 192.64ns 5.19M +combining_no_dedicated_notc_sync_dup 101.15% 187.46ns 5.33M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 83.77% 226.36ns 4.42M +combining_no_dedicated_notc_async_dup 84.69% 223.89ns 4.47M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 85.47% 221.85ns 4.51M +combining_no_dedicated_tc_sync_dup 86.32% 219.65ns 4.55M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 105.62% 179.52ns 5.57M +combining_no_dedicated_tc_async_dup 105.26% 180.14ns 5.55M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 4 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 237.50ns 4.21M +no_combining_dup 99.80% 237.97ns 4.20M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 112.56% 210.99ns 4.74M +combining_dedicated_notc_sync_dup 104.08% 228.20ns 4.38M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 101.44% 234.12ns 4.27M +combining_dedicated_notc_async_dup 100.73% 235.77ns 4.24M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 111.70% 212.62ns 4.70M +combining_dedicated_tc_sync_dup 113.00% 210.18ns 4.76M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 131.11% 181.15ns 5.52M +combining_dedicated_tc_async_dup 132.65% 179.04ns 5.59M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 115.76% 205.17ns 4.87M +combining_no_dedicated_notc_sync_dup 114.70% 207.06ns 4.83M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 111.63% 212.76ns 4.70M +combining_no_dedicated_notc_async_dup 111.91% 212.22ns 4.71M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 120.07% 197.80ns 5.06M +combining_no_dedicated_tc_sync_dup 118.25% 200.85ns 4.98M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 153.73% 154.49ns 6.47M +combining_no_dedicated_tc_async_dup 153.08% 155.15ns 6.45M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 6 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 281.56ns 3.55M +no_combining_dup 99.97% 281.65ns 3.55M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 144.76% 194.50ns 5.14M +combining_dedicated_notc_sync_dup 149.96% 187.76ns 5.33M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 147.72% 190.61ns 5.25M +combining_dedicated_notc_async_dup 140.86% 199.89ns 5.00M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 154.17% 182.63ns 5.48M +combining_dedicated_tc_sync_dup 156.60% 179.80ns 5.56M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 202.42% 139.10ns 7.19M +combining_dedicated_tc_async_dup 203.44% 138.40ns 7.23M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 168.33% 167.27ns 5.98M +combining_no_dedicated_notc_sync_dup 166.02% 169.59ns 5.90M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 166.44% 169.16ns 5.91M +combining_no_dedicated_notc_async_dup 160.14% 175.82ns 5.69M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 181.79% 154.88ns 6.46M +combining_no_dedicated_tc_sync_dup 180.25% 156.20ns 6.40M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 240.56% 117.04ns 8.54M +combining_no_dedicated_tc_async_dup 240.74% 116.96ns 8.55M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 8 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 312.99ns 3.19M +no_combining_dup 98.93% 316.37ns 3.16M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 182.71% 171.30ns 5.84M +combining_dedicated_notc_sync_dup 183.23% 170.82ns 5.85M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 183.16% 170.88ns 5.85M +combining_dedicated_notc_async_dup 181.29% 172.64ns 5.79M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 191.49% 163.45ns 6.12M +combining_dedicated_tc_sync_dup 191.04% 163.84ns 6.10M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 302.89% 103.34ns 9.68M +combining_dedicated_tc_async_dup 304.07% 102.94ns 9.71M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 220.41% 142.00ns 7.04M +combining_no_dedicated_notc_sync_dup 219.90% 142.34ns 7.03M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 218.66% 143.14ns 6.99M +combining_no_dedicated_notc_async_dup 218.74% 143.09ns 6.99M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 241.82% 129.43ns 7.73M +combining_no_dedicated_tc_sync_dup 241.72% 129.48ns 7.72M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 352.39% 88.82ns 11.26M +combining_no_dedicated_tc_async_dup 350.17% 89.38ns 11.19M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 12 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 350.05ns 2.86M +no_combining_dup 99.06% 353.37ns 2.83M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 266.87% 131.17ns 7.62M +combining_dedicated_notc_sync_dup 245.79% 142.42ns 7.02M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 238.57% 146.73ns 6.82M +combining_dedicated_notc_async_dup 240.02% 145.84ns 6.86M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 316.70% 110.53ns 9.05M +combining_dedicated_tc_sync_dup 321.05% 109.03ns 9.17M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 403.10% 86.84ns 11.52M +combining_dedicated_tc_async_dup 409.94% 85.39ns 11.71M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 300.23% 116.59ns 8.58M +combining_no_dedicated_notc_sync_dup 299.07% 117.04ns 8.54M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 297.79% 117.55ns 8.51M +combining_no_dedicated_notc_async_dup 296.66% 118.00ns 8.47M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 328.07% 106.70ns 9.37M +combining_no_dedicated_tc_sync_dup 331.52% 105.59ns 9.47M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 424.57% 82.45ns 12.13M +combining_no_dedicated_tc_async_dup 409.47% 85.49ns 11.70M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 16 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 360.47ns 2.77M +no_combining_dup 100.11% 360.07ns 2.78M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 320.54% 112.46ns 8.89M +combining_dedicated_notc_sync_dup 313.31% 115.05ns 8.69M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 296.83% 121.44ns 8.23M +combining_dedicated_notc_async_dup 289.91% 124.34ns 8.04M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 364.27% 98.96ns 10.11M +combining_dedicated_tc_sync_dup 361.10% 99.82ns 10.02M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 424.43% 84.93ns 11.77M +combining_dedicated_tc_async_dup 418.07% 86.22ns 11.60M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 373.13% 96.60ns 10.35M +combining_no_dedicated_notc_sync_dup 364.35% 98.93ns 10.11M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 361.40% 99.74ns 10.03M +combining_no_dedicated_notc_async_dup 366.49% 98.36ns 10.17M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 382.22% 94.31ns 10.60M +combining_no_dedicated_tc_sync_dup 380.64% 94.70ns 10.56M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 461.14% 78.17ns 12.79M +combining_no_dedicated_tc_async_dup 481.50% 74.86ns 13.36M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 24 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 348.97ns 2.87M +no_combining_dup 100.12% 348.54ns 2.87M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 234.17% 149.02ns 6.71M +combining_dedicated_notc_sync_dup 205.54% 169.78ns 5.89M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 248.28% 140.55ns 7.11M +combining_dedicated_notc_async_dup 239.71% 145.58ns 6.87M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 272.87% 127.89ns 7.82M +combining_dedicated_tc_sync_dup 235.76% 148.02ns 6.76M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 295.71% 118.01ns 8.47M +combining_dedicated_tc_async_dup 265.87% 131.25ns 7.62M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 298.96% 116.73ns 8.57M +combining_no_dedicated_notc_sync_dup 297.67% 117.23ns 8.53M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 298.44% 116.93ns 8.55M +combining_no_dedicated_notc_async_dup 292.80% 119.18ns 8.39M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 316.44% 110.28ns 9.07M +combining_no_dedicated_tc_sync_dup 317.52% 109.90ns 9.10M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 432.64% 80.66ns 12.40M +combining_no_dedicated_tc_async_dup 441.55% 79.03ns 12.65M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 32 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 338.90ns 2.95M +no_combining_dup 100.01% 338.87ns 2.95M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 204.34% 165.85ns 6.03M +combining_dedicated_notc_sync_dup 202.84% 167.07ns 5.99M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 192.27% 176.26ns 5.67M +combining_dedicated_notc_async_dup 188.61% 179.68ns 5.57M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 247.57% 136.89ns 7.31M +combining_dedicated_tc_sync_dup 285.53% 118.69ns 8.43M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 277.97% 121.92ns 8.20M +combining_dedicated_tc_async_dup 231.11% 146.64ns 6.82M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 299.20% 113.27ns 8.83M +combining_no_dedicated_notc_sync_dup 289.53% 117.05ns 8.54M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 282.29% 120.05ns 8.33M +combining_no_dedicated_notc_async_dup 305.09% 111.08ns 9.00M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 312.52% 108.44ns 9.22M +combining_no_dedicated_tc_sync_dup 324.88% 104.31ns 9.59M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 420.99% 80.50ns 12.42M +combining_no_dedicated_tc_async_dup 406.58% 83.35ns 12.00M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 48 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 334.84ns 2.99M +no_combining_dup 99.57% 336.29ns 2.97M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 212.82% 157.34ns 6.36M +combining_dedicated_notc_sync_dup 198.39% 168.78ns 5.93M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 166.74% 200.82ns 4.98M +combining_dedicated_notc_async_dup 197.07% 169.91ns 5.89M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 246.35% 135.92ns 7.36M +combining_dedicated_tc_sync_dup 209.52% 159.81ns 6.26M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 293.94% 113.91ns 8.78M +combining_dedicated_tc_async_dup 280.74% 119.27ns 8.38M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 301.60% 111.02ns 9.01M +combining_no_dedicated_notc_sync_dup 296.10% 113.09ns 8.84M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 308.91% 108.40ns 9.23M +combining_no_dedicated_notc_async_dup 298.48% 112.18ns 8.91M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 331.11% 101.13ns 9.89M +combining_no_dedicated_tc_sync_dup 329.37% 101.66ns 9.84M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 451.58% 74.15ns 13.49M +combining_no_dedicated_tc_async_dup 431.37% 77.62ns 12.88M +---------------------------------------------------------------------------- +============================================================================ + +---------------------------------- Number of threads = 64 +============================================================================ +folly/experimental/flat_combining/test/FlatCombiningBenchmark.cpprelative time/iter iters/s +============================================================================ +no_combining_base 336.22ns 2.97M +no_combining_dup 100.69% 333.92ns 2.99M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_dedicated_notc_sync 230.57% 145.82ns 6.86M +combining_dedicated_notc_sync_dup 221.08% 152.08ns 6.58M +---------------------------------------------------------------------------- +combining_dedicated_notc_async 232.38% 144.69ns 6.91M +combining_dedicated_notc_async_dup 192.77% 174.41ns 5.73M +---------------------------------------------------------------------------- +combining_dedicated_tc_sync 284.07% 118.36ns 8.45M +combining_dedicated_tc_sync_dup 298.03% 112.81ns 8.86M +---------------------------------------------------------------------------- +combining_dedicated_tc_async 361.07% 93.12ns 10.74M +combining_dedicated_tc_async_dup 324.11% 103.74ns 9.64M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +combining_no_dedicated_notc_sync 284.58% 118.15ns 8.46M +combining_no_dedicated_notc_sync_dup 301.73% 111.43ns 8.97M +---------------------------------------------------------------------------- +combining_no_dedicated_notc_async 294.87% 114.02ns 8.77M +combining_no_dedicated_notc_async_dup 287.51% 116.94ns 8.55M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_sync 317.96% 105.74ns 9.46M +combining_no_dedicated_tc_sync_dup 332.45% 101.13ns 9.89M +---------------------------------------------------------------------------- +combining_no_dedicated_tc_async 441.96% 76.07ns 13.15M +combining_no_dedicated_tc_async_dup 393.82% 85.37ns 11.71M +---------------------------------------------------------------------------- +============================================================================ +[ OK ] FlatCombining.folly_benchmark (455269 ms) +[ RUN ] FlatCombining.direct_measurement + +Running benchmarks on machine with 32 logical cores + +------------------------------------ custom interface + +------------------------------------ Number of threads = 1 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 334 ns 331 ns 329 ns +no_combining - dup 335 ns 332 ns 331 ns 99% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 340 ns 335 ns 332 ns 99% +combining_notc_sync - dup 337 ns 335 ns 333 ns 98% +--------------------------------------- +combining_notc_async 360 ns 343 ns 338 ns 97% +combining_notc_async - dup 339 ns 337 ns 336 ns 98% +--------------------------------------- +combining_tc_sync 337 ns 335 ns 333 ns 98% +combining_tc_sync - dup 346 ns 336 ns 332 ns 99% +--------------------------------------- +combining_tc_async 338 ns 336 ns 335 ns 98% +combining_tc_async - dup 338 ns 336 ns 335 ns 98% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 338 ns 335 ns 333 ns 98% +combining_notc_sync - dup 337 ns 334 ns 333 ns 98% +--------------------------------------- +combining_notc_async 339 ns 336 ns 335 ns 98% +combining_notc_async - dup 347 ns 340 ns 336 ns 98% +--------------------------------------- +combining_tc_sync 337 ns 335 ns 333 ns 98% +combining_tc_sync - dup 436 ns 386 ns 333 ns 98% +--------------------------------------- +combining_tc_async 340 ns 337 ns 335 ns 98% +combining_tc_async - dup 338 ns 336 ns 335 ns 98% +--------------------------------------- + +------------------------------------ Number of threads = 2 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 315 ns 226 ns 211 ns +no_combining - dup 217 ns 216 ns 213 ns 98% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 251 ns 237 ns 229 ns 92% +combining_notc_sync - dup 250 ns 241 ns 226 ns 93% +--------------------------------------- +combining_notc_async 278 ns 268 ns 252 ns 83% +combining_notc_async - dup 297 ns 263 ns 245 ns 86% +--------------------------------------- +combining_tc_sync 254 ns 246 ns 234 ns 90% +combining_tc_sync - dup 335 ns 252 ns 230 ns 91% +--------------------------------------- +combining_tc_async 305 ns 282 ns 245 ns 86% +combining_tc_async - dup 284 ns 256 ns 239 ns 88% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 230 ns 222 ns 217 ns 97% +combining_notc_sync - dup 231 ns 225 ns 218 ns 96% +--------------------------------------- +combining_notc_async 244 ns 238 ns 233 ns 90% +combining_notc_async - dup 241 ns 236 ns 231 ns 91% +--------------------------------------- +combining_tc_sync 283 ns 239 ns 221 ns 95% +combining_tc_sync - dup 299 ns 247 ns 225 ns 93% +--------------------------------------- +combining_tc_async 290 ns 270 ns 244 ns 86% +combining_tc_async - dup 290 ns 251 ns 238 ns 88% +--------------------------------------- + +------------------------------------ Number of threads = 3 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 211 ns 197 ns 190 ns +no_combining - dup 209 ns 201 ns 195 ns 97% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 258 ns 197 ns 168 ns 112% +combining_notc_sync - dup 274 ns 200 ns 162 ns 117% +--------------------------------------- +combining_notc_async 307 ns 281 ns 260 ns 73% +combining_notc_async - dup 284 ns 258 ns 216 ns 88% +--------------------------------------- +combining_tc_sync 228 ns 215 ns 192 ns 98% +combining_tc_sync - dup 216 ns 203 ns 178 ns 107% +--------------------------------------- +combining_tc_async 246 ns 233 ns 220 ns 86% +combining_tc_async - dup 236 ns 221 ns 208 ns 91% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 204 ns 198 ns 184 ns 103% +combining_notc_sync - dup 203 ns 198 ns 193 ns 98% +--------------------------------------- +combining_notc_async 238 ns 225 ns 218 ns 87% +combining_notc_async - dup 231 ns 227 ns 223 ns 85% +--------------------------------------- +combining_tc_sync 220 ns 216 ns 211 ns 90% +combining_tc_sync - dup 227 ns 223 ns 219 ns 87% +--------------------------------------- +combining_tc_async 182 ns 181 ns 179 ns 106% +combining_tc_async - dup 186 ns 181 ns 180 ns 105% +--------------------------------------- + +------------------------------------ Number of threads = 4 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 258 ns 245 ns 238 ns +no_combining - dup 262 ns 249 ns 245 ns 97% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 264 ns 250 ns 220 ns 107% +combining_notc_sync - dup 260 ns 254 ns 231 ns 102% +--------------------------------------- +combining_notc_async 266 ns 255 ns 233 ns 102% +combining_notc_async - dup 268 ns 260 ns 252 ns 94% +--------------------------------------- +combining_tc_sync 250 ns 240 ns 215 ns 110% +combining_tc_sync - dup 252 ns 242 ns 217 ns 109% +--------------------------------------- +combining_tc_async 199 ns 190 ns 183 ns 129% +combining_tc_async - dup 199 ns 189 ns 178 ns 133% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 223 ns 211 ns 203 ns 116% +combining_notc_sync - dup 218 ns 211 ns 202 ns 117% +--------------------------------------- +combining_notc_async 222 ns 213 ns 207 ns 114% +combining_notc_async - dup 236 ns 222 ns 215 ns 110% +--------------------------------------- +combining_tc_sync 202 ns 199 ns 197 ns 120% +combining_tc_sync - dup 207 ns 199 ns 194 ns 122% +--------------------------------------- +combining_tc_async 162 ns 157 ns 152 ns 155% +combining_tc_async - dup 188 ns 161 ns 154 ns 154% +--------------------------------------- + +------------------------------------ Number of threads = 6 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 298 ns 292 ns 281 ns +no_combining - dup 296 ns 289 ns 270 ns 104% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 221 ns 211 ns 196 ns 143% +combining_notc_sync - dup 247 ns 211 ns 192 ns 146% +--------------------------------------- +combining_notc_async 216 ns 205 ns 194 ns 144% +combining_notc_async - dup 215 ns 206 ns 197 ns 142% +--------------------------------------- +combining_tc_sync 225 ns 204 ns 185 ns 151% +combining_tc_sync - dup 229 ns 210 ns 186 ns 151% +--------------------------------------- +combining_tc_async 165 ns 152 ns 144 ns 194% +combining_tc_async - dup 166 ns 150 ns 143 ns 195% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 184 ns 182 ns 180 ns 155% +combining_notc_sync - dup 176 ns 174 ns 172 ns 163% +--------------------------------------- +combining_notc_async 179 ns 177 ns 174 ns 161% +combining_notc_async - dup 186 ns 181 ns 177 ns 158% +--------------------------------------- +combining_tc_sync 164 ns 163 ns 160 ns 174% +combining_tc_sync - dup 171 ns 168 ns 161 ns 173% +--------------------------------------- +combining_tc_async 142 ns 139 ns 138 ns 202% +combining_tc_async - dup 141 ns 136 ns 119 ns 235% +--------------------------------------- + +------------------------------------ Number of threads = 8 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 333 ns 328 ns 315 ns +no_combining - dup 336 ns 330 ns 327 ns 96% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 203 ns 179 ns 172 ns 183% +combining_notc_sync - dup 190 ns 177 ns 171 ns 183% +--------------------------------------- +combining_notc_async 204 ns 183 ns 170 ns 185% +combining_notc_async - dup 201 ns 187 ns 176 ns 179% +--------------------------------------- +combining_tc_sync 177 ns 170 ns 165 ns 190% +combining_tc_sync - dup 178 ns 167 ns 164 ns 192% +--------------------------------------- +combining_tc_async 134 ns 115 ns 105 ns 300% +combining_tc_async - dup 132 ns 115 ns 103 ns 304% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 154 ns 145 ns 143 ns 220% +combining_notc_sync - dup 153 ns 144 ns 142 ns 222% +--------------------------------------- +combining_notc_async 145 ns 144 ns 143 ns 219% +combining_notc_async - dup 157 ns 148 ns 144 ns 218% +--------------------------------------- +combining_tc_sync 142 ns 134 ns 130 ns 241% +combining_tc_sync - dup 144 ns 136 ns 130 ns 241% +--------------------------------------- +combining_tc_async 118 ns 99 ns 91 ns 344% +combining_tc_async - dup 118 ns 95 ns 91 ns 344% +--------------------------------------- + +------------------------------------ Number of threads = 12 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 361 ns 357 ns 353 ns +no_combining - dup 361 ns 357 ns 355 ns 99% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 190 ns 157 ns 138 ns 255% +combining_notc_sync - dup 162 ns 149 ns 138 ns 255% +--------------------------------------- +combining_notc_async 163 ns 153 ns 145 ns 242% +combining_notc_async - dup 194 ns 158 ns 152 ns 231% +--------------------------------------- +combining_tc_sync 181 ns 128 ns 111 ns 316% +combining_tc_sync - dup 183 ns 148 ns 121 ns 289% +--------------------------------------- +combining_tc_async 92 ns 89 ns 87 ns 402% +combining_tc_async - dup 152 ns 105 ns 87 ns 405% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 120 ns 119 ns 118 ns 298% +combining_notc_sync - dup 120 ns 119 ns 118 ns 298% +--------------------------------------- +combining_notc_async 122 ns 120 ns 120 ns 294% +combining_notc_async - dup 121 ns 120 ns 118 ns 297% +--------------------------------------- +combining_tc_sync 110 ns 108 ns 106 ns 331% +combining_tc_sync - dup 110 ns 109 ns 107 ns 327% +--------------------------------------- +combining_tc_async 88 ns 87 ns 85 ns 411% +combining_tc_async - dup 90 ns 88 ns 85 ns 411% +--------------------------------------- + +------------------------------------ Number of threads = 16 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 363 ns 361 ns 360 ns +no_combining - dup 362 ns 361 ns 358 ns 100% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 177 ns 136 ns 111 ns 323% +combining_notc_sync - dup 185 ns 148 ns 112 ns 320% +--------------------------------------- +combining_notc_async 191 ns 151 ns 122 ns 294% +combining_notc_async - dup 179 ns 157 ns 118 ns 305% +--------------------------------------- +combining_tc_sync 154 ns 125 ns 100 ns 360% +combining_tc_sync - dup 166 ns 130 ns 98 ns 367% +--------------------------------------- +combining_tc_async 143 ns 107 ns 86 ns 418% +combining_tc_async - dup 132 ns 112 ns 88 ns 407% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 121 ns 103 ns 98 ns 367% +combining_notc_sync - dup 117 ns 104 ns 99 ns 362% +--------------------------------------- +combining_notc_async 116 ns 105 ns 99 ns 363% +combining_notc_async - dup 112 ns 104 ns 100 ns 359% +--------------------------------------- +combining_tc_sync 111 ns 101 ns 94 ns 381% +combining_tc_sync - dup 113 ns 98 ns 93 ns 387% +--------------------------------------- +combining_tc_async 97 ns 85 ns 74 ns 484% +combining_tc_async - dup 98 ns 86 ns 78 ns 457% +--------------------------------------- + +------------------------------------ Number of threads = 24 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 352 ns 351 ns 349 ns +no_combining - dup 352 ns 351 ns 348 ns 100% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 214 ns 173 ns 149 ns 234% +combining_notc_sync - dup 212 ns 166 ns 137 ns 254% +--------------------------------------- +combining_notc_async 232 ns 198 ns 161 ns 216% +combining_notc_async - dup 225 ns 191 ns 149 ns 234% +--------------------------------------- +combining_tc_sync 192 ns 152 ns 129 ns 270% +combining_tc_sync - dup 176 ns 156 ns 121 ns 286% +--------------------------------------- +combining_tc_async 202 ns 147 ns 118 ns 296% +combining_tc_async - dup 200 ns 158 ns 120 ns 291% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 161 ns 125 ns 115 ns 303% +combining_notc_sync - dup 144 ns 127 ns 116 ns 299% +--------------------------------------- +combining_notc_async 135 ns 122 ns 116 ns 298% +combining_notc_async - dup 341 ns 148 ns 117 ns 298% +--------------------------------------- +combining_tc_sync 130 ns 118 ns 109 ns 319% +combining_tc_sync - dup 116 ns 110 ns 105 ns 332% +--------------------------------------- +combining_tc_async 97 ns 86 ns 79 ns 442% +combining_tc_async - dup 95 ns 86 ns 79 ns 440% +--------------------------------------- + +------------------------------------ Number of threads = 32 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 337 ns 336 ns 333 ns +no_combining - dup 338 ns 336 ns 333 ns 99% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 193 ns 177 ns 162 ns 204% +combining_notc_sync - dup 211 ns 181 ns 156 ns 213% +--------------------------------------- +combining_notc_async 245 ns 200 ns 162 ns 205% +combining_notc_async - dup 216 ns 197 ns 149 ns 223% +--------------------------------------- +combining_tc_sync 195 ns 167 ns 121 ns 274% +combining_tc_sync - dup 179 ns 164 ns 143 ns 231% +--------------------------------------- +combining_tc_async 187 ns 152 ns 108 ns 307% +combining_tc_async - dup 182 ns 151 ns 125 ns 266% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 189 ns 127 ns 114 ns 290% +combining_notc_sync - dup 126 ns 118 ns 110 ns 302% +--------------------------------------- +combining_notc_async 233 ns 129 ns 112 ns 297% +combining_notc_async - dup 170 ns 126 ns 113 ns 293% +--------------------------------------- +combining_tc_sync 948 ns 212 ns 107 ns 309% +combining_tc_sync - dup 137 ns 112 ns 104 ns 318% +--------------------------------------- +combining_tc_async 90 ns 86 ns 79 ns 421% +combining_tc_async - dup 94 ns 87 ns 80 ns 414% +--------------------------------------- + +------------------------------------ Number of threads = 48 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 340 ns 336 ns 334 ns +no_combining - dup 336 ns 335 ns 334 ns 100% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 214 ns 176 ns 137 ns 243% +combining_notc_sync - dup 210 ns 173 ns 128 ns 260% +--------------------------------------- +combining_notc_async 217 ns 186 ns 162 ns 205% +combining_notc_async - dup 215 ns 186 ns 149 ns 224% +--------------------------------------- +combining_tc_sync 206 ns 171 ns 145 ns 230% +combining_tc_sync - dup 179 ns 149 ns 126 ns 265% +--------------------------------------- +combining_tc_async 175 ns 138 ns 108 ns 309% +combining_tc_async - dup 169 ns 134 ns 110 ns 301% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 1798 ns 293 ns 118 ns 282% +combining_notc_sync - dup 171 ns 122 ns 105 ns 318% +--------------------------------------- +combining_notc_async 227 ns 132 ns 110 ns 302% +combining_notc_async - dup 226 ns 137 ns 111 ns 301% +--------------------------------------- +combining_tc_sync 111 ns 106 ns 102 ns 327% +combining_tc_sync - dup 127 ns 110 ns 104 ns 321% +--------------------------------------- +combining_tc_async 297 ns 117 ns 77 ns 433% +combining_tc_async - dup 742 ns 149 ns 77 ns 432% +--------------------------------------- + +------------------------------------ Number of threads = 64 + +Test_name, Max time, Avg time, Min time, % base min / min + +no_combining - base 338 ns 333 ns 331 ns +no_combining - dup 335 ns 333 ns 331 ns 99% +--------------------------------------- +---- dedicated------------------------- +combining_notc_sync 198 ns 163 ns 148 ns 223% +combining_notc_sync - dup 172 ns 154 ns 124 ns 266% +--------------------------------------- +combining_notc_async 211 ns 177 ns 158 ns 209% +combining_notc_async - dup 182 ns 166 ns 152 ns 216% +--------------------------------------- +combining_tc_sync 195 ns 133 ns 112 ns 294% +combining_tc_sync - dup 158 ns 135 ns 108 ns 305% +--------------------------------------- +combining_tc_async 145 ns 119 ns 95 ns 347% +combining_tc_async - dup 159 ns 130 ns 95 ns 346% +--------------------------------------- +---- no dedicated---------------------- +combining_notc_sync 188 ns 123 ns 107 ns 308% +combining_notc_sync - dup 546 ns 159 ns 107 ns 307% +--------------------------------------- +combining_notc_async 558 ns 160 ns 108 ns 304% +combining_notc_async - dup 192 ns 127 ns 107 ns 308% +--------------------------------------- +combining_tc_sync 325 ns 130 ns 101 ns 325% +combining_tc_sync - dup 1766 ns 273 ns 101 ns 325% +--------------------------------------- +combining_tc_async 417 ns 118 ns 74 ns 446% +combining_tc_async - dup 838 ns 212 ns 72 ns 455% +--------------------------------------- +[ OK ] FlatCombining.direct_measurement (178622 ms) +[----------] 2 tests from FlatCombining (633891 ms total) + +[----------] Global test environment tear-down +[==========] 2 tests from 1 test case ran. (633891 ms total) +[ PASSED ] 2 tests. + +--- + +$ lscpu + +Architecture: x86_64 +CPU op-mode(s): 32-bit, 64-bit +Byte Order: Little Endian +CPU(s): 32 +On-line CPU(s) list: 0-31 +Thread(s) per core: 2 +Core(s) per socket: 8 +Socket(s): 2 +NUMA node(s): 2 +Vendor ID: GenuineIntel +CPU family: 6 +Model: 45 +Model name: Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz +Stepping: 6 +CPU MHz: 2200.000 +CPU max MHz: 2200.0000 +CPU min MHz: 1200.0000 +BogoMIPS: 4399.87 +Virtualization: VT-x +L1d cache: 32K +L1i cache: 32K +L2 cache: 256K +L3 cache: 20480K +NUMA node0 CPU(s): 0-7,16-23 +NUMA node1 CPU(s): 8-15,24-31 + +Flags: fpu vme de pse tsc msr pae mce cx8 apic sep +mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht +tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts +rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq +dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca +sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx lahf_lm +epb tpr_shadow vnmi flexpriority ept vpid xsaveopt dtherm arat pln pts + +--- + + */ diff --git a/folly/experimental/flat_combining/test/FlatCombiningExamples.h b/folly/experimental/flat_combining/test/FlatCombiningExamples.h new file mode 100644 index 00000000..17c25ca8 --- /dev/null +++ b/folly/experimental/flat_combining/test/FlatCombiningExamples.h @@ -0,0 +1,239 @@ +/* + * Copyright 2017 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace folly { + +struct Line { + FOLLY_ALIGN_TO_AVOID_FALSE_SHARING + uint64_t val_; +}; + +class Data { // Sequential data structure + public: + explicit Data(size_t size) : size_(size) { + x_ = std::make_unique(size_); + } + + uint64_t getVal() { + uint64_t val = x_[0].val_; + for (size_t i = 1; i < size_; ++i) { + assert(x_[i].val_ == val); + } + return val; + } + + // add + + void add(uint64_t val) { + uint64_t oldval = x_[0].val_; + for (size_t i = 0; i < size_; ++i) { + assert(x_[i].val_ == oldval); + x_[i].val_ = oldval + val; + } + } + + uint64_t fetchAdd(uint64_t val) { + uint64_t res = x_[0].val_; + for (size_t i = 0; i < size_; ++i) { + assert(x_[i].val_ == res); + x_[i].val_ += val; + } + return res; + } + + private: + size_t size_; + std::unique_ptr x_; +}; + +// Example of FC concurrent data structure using simple interface + +template < + typename Mutex = std::mutex, + template class Atom = std::atomic> +class FcSimpleExample + : public FlatCombining, Mutex, Atom> { + using FC = FlatCombining, Mutex, Atom>; + using Rec = typename FC::Rec; + + public: + explicit FcSimpleExample( + size_t size, + bool dedicated = true, + uint32_t numRecs = 0, + uint32_t maxOps = 0) + : FC(dedicated, numRecs, maxOps), data_(size) {} + + uint64_t getVal() { + return data_.getVal(); + } + + // add + + void addNoFC(uint64_t val) { + this->requestNoFC([&] { data_.add(val); }); + } + + void add(uint64_t val, Rec* rec = nullptr) { + auto opFn = [&, val] { // asynchronous -- capture val by value + data_.add(val); + }; + this->requestFC(opFn, rec, false); + } + + // fetchAdd + + uint64_t fetchAddNoFC(uint64_t val) { + uint64_t res; + auto opFn = [&] { res = data_.fetchAdd(val); }; + this->requestNoFC(opFn); + return res; + } + + uint64_t fetchAdd(uint64_t val, Rec* rec = nullptr) { + uint64_t res; + auto opFn = [&] { res = data_.fetchAdd(val); }; + this->requestFC(opFn, rec); + return res; + } + + private: + Data data_; +}; + +// Example of FC data structure using custom request processing + +class Req { + public: + enum class Type { ADD, FETCHADD }; + + void setType(Type type) { + type_ = type; + } + + Type getType() { + return type_; + } + + void setVal(uint64_t val) { + val_ = val; + } + + uint64_t getVal() { + return val_; + } + + void setRes(uint64_t res) { + res_ = res; + } + + uint64_t getRes() { + return res_; + } + + private: + Type type_; + uint64_t val_; + uint64_t res_; +}; + +template < + typename Req, + typename Mutex = std::mutex, + template class Atom = std::atomic> +class FcCustomExample : public FlatCombining< + FcCustomExample, + Mutex, + Atom, + Req> { + using FC = FlatCombining, Mutex, Atom, Req>; + using Rec = typename FC::Rec; + + public: + explicit FcCustomExample( + int size, + bool dedicated = true, + uint32_t numRecs = 0, + uint32_t maxOps = 0) + : FC(dedicated, numRecs, maxOps), data_(size) {} + + uint64_t getVal() { + return data_.getVal(); + } + + // add + + void addNoFC(uint64_t val) { + this->requestNoFC([&] { data_.add(val); }); + } + + void add(uint64_t val, Rec* rec = nullptr) { + auto opFn = [&, val] { data_.add(val); }; + auto fillFn = [&](Req& req) { + req.setType(Req::Type::ADD); + req.setVal(val); + }; + this->requestFC(opFn, fillFn, rec, false); // asynchronous + } + + // fetchAdd + + uint64_t fetchAddNoFC(uint64_t val) { + uint64_t res; + auto opFn = [&] { res = data_.fetchAdd(val); }; + this->requestNoFC(opFn); + return res; + } + + uint64_t fetchAdd(uint64_t val, Rec* rec = nullptr) { + uint64_t res; + auto opFn = [&] { res = data_.fetchAdd(val); }; + auto fillFn = [&](Req& req) { + req.setType(Req::Type::FETCHADD); + req.setVal(val); + }; + auto resFn = [&](Req& req) { res = req.getRes(); }; + this->requestFC(opFn, fillFn, resFn, rec); + return res; + } + + // custom combined op processing - overrides FlatCombining::combinedOp(Req&) + void combinedOp(Req& req) { + switch (req.getType()) { + case Req::Type::ADD: { + data_.add(req.getVal()); + } break; + case Req::Type::FETCHADD: { + req.setRes(data_.fetchAdd(req.getVal())); + } break; + default: { assert(false); } + } + } + + private: + Data data_; +}; + +} // namespace folly { diff --git a/folly/experimental/flat_combining/test/FlatCombiningTest.cpp b/folly/experimental/flat_combining/test/FlatCombiningTest.cpp new file mode 100644 index 00000000..ff745b26 --- /dev/null +++ b/folly/experimental/flat_combining/test/FlatCombiningTest.cpp @@ -0,0 +1,102 @@ +/* + * Copyright 2017 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +using namespace folly::test; + +constexpr int LINES = 5; +constexpr int NUM_RECS = 20; +constexpr int WORK = 0; +constexpr int ITERS = 100; + +static std::vector nthr = {1, 10, 20}; + +struct Params { + bool combining, simple, dedicated, tc, syncop; +}; + +class FlatCombiningTest : public ::testing::TestWithParam {}; + +TEST_P(FlatCombiningTest, combining) { + Params p = GetParam(); + for (auto n : nthr) { + run_test( + n, + LINES, + NUM_RECS, + WORK, + ITERS, + p.combining, + p.simple, + p.dedicated, + p.tc, + p.syncop, + true, + true); + } +} + +TEST_P(FlatCombiningTest, more_threads_than_records) { + int n = 20; + int num_recs = 1; + + Params p = GetParam(); + run_test( + n, + LINES, + num_recs, + WORK, + ITERS, + p.combining, + p.simple, + p.dedicated, + p.tc, + p.syncop, + true, + true); +} + +constexpr Params params[] = { + {false, false, false, false, false}, // no combining + // simple combining + // dedicated + {true, true, true, false, true}, // no-tc sync + {true, true, true, false, false}, // no-tc async + {true, true, true, true, true}, // tc sync + {true, true, true, true, false}, // tc async + // no dedicated + {true, true, false, false, true}, // no-tc sync + {true, true, false, false, false}, // no-tc async + {true, true, false, true, true}, // tc sync + {true, true, false, true, false}, // tc async + // custom combining + // dedicated + {true, false, true, false, true}, // no-tc sync + {true, false, true, false, false}, // no-tc async + {true, false, true, true, true}, // tc sync + {true, false, true, true, false}, // tc async + // no dedicated + {true, false, false, false, true}, // no-tc sync + {true, false, false, false, false}, // no-tc async + {true, false, false, true, true}, // tc sync + {true, false, false, true, false}, // tc async +}; + +INSTANTIATE_TEST_CASE_P(Foo, FlatCombiningTest, ::testing::ValuesIn(params)); diff --git a/folly/experimental/flat_combining/test/FlatCombiningTestHelpers.h b/folly/experimental/flat_combining/test/FlatCombiningTestHelpers.h new file mode 100644 index 00000000..f9b4980d --- /dev/null +++ b/folly/experimental/flat_combining/test/FlatCombiningTestHelpers.h @@ -0,0 +1,210 @@ +/* + * Copyright 2017 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include +#include +#include + +namespace folly { +namespace test { + +void doWork(int work) { + uint64_t a = 0; + for (int i = work; i > 0; --i) { + a += i; + } + folly::doNotOptimizeAway(a); +} + +template < + typename Example, + typename Req = bool, + typename Mutex = std::mutex, + template class Atom = std::atomic> +uint64_t fc_test( + int nthreads, + int lines, + int numRecs, + int work, + int ops, + bool combining, + bool dedicated, + bool tc, + bool syncops, + bool excl = false, + bool allocAll = false) { + using FC = FlatCombining; + using Rec = typename FC::Rec; + + folly::BenchmarkSuspender susp; + + std::atomic start{false}; + std::atomic started{0}; + Example ex(lines, dedicated, numRecs); + std::atomic total{0}; + bool mutex = false; + + if (allocAll) { + std::vector v(numRecs); + for (int i = 0; i < numRecs; ++i) { + v[i] = ex.allocRec(); + } + for (int i = numRecs; i > 0; --i) { + ex.freeRec(v[i - 1]); + } + } + + std::vector threads(nthreads); + for (int tid = 0; tid < nthreads; ++tid) { + threads[tid] = std::thread([&, tid] { + started.fetch_add(1); + Rec* myrec = (combining && tc) ? ex.allocRec() : nullptr; + uint64_t sum = 0; + while (!start.load()) + ; + + if (!combining) { + // no combining + for (int i = tid; i < ops; i += nthreads) { + sum += ex.fetchAddNoFC(1); + doWork(work); // unrelated work + } + } else if (syncops) { + // sync combining + for (int i = tid; i < ops; i += nthreads) { + sum += ex.fetchAdd(1, myrec); + doWork(work); // unrelated work + } + } else { + // async combining + for (int i = tid; i < ops; i += nthreads) { + ex.add(1, myrec); + doWork(work); // unrelated work + } + } + + if (excl) { + // test of unstructured exclusive access + ex.acquireExclusive(); + { + CHECK(!mutex); + mutex = true; + VLOG(2) << tid << " " << ex.getVal() << " ..........."; + using namespace std::chrono_literals; + /* sleep override */ // for coverage + std::this_thread::sleep_for(10ms); + VLOG(2) << tid << " " << ex.getVal() << " ==========="; + CHECK(mutex); + mutex = false; + } + ex.releaseExclusive(); + } + + total.fetch_add(sum); + if (combining && tc) { + ex.freeRec(myrec); + } + }); + } + + while (started.load() < nthreads) + ; + auto tbegin = std::chrono::steady_clock::now(); + + // begin time measurement + susp.dismiss(); + start.store(true); + + for (auto& t : threads) { + t.join(); + } + + if (!syncops) { + // complete any pending asynch ops + ex.drainAll(); + } + + // end time measurement + uint64_t duration = 0; + BENCHMARK_SUSPEND { + auto tend = std::chrono::steady_clock::now(); + CHECK_EQ(ops, ex.getVal()); + if (syncops) { + uint64_t n = (uint64_t)ops; + uint64_t expected = n * (n - 1) / 2; + CHECK_EQ(expected, total); + } + duration = + std::chrono::duration_cast(tend - tbegin) + .count(); + } + return duration; +} + +uint64_t run_test( + int nthreads, + int lines, + int numRecs, + int work, + int ops, + bool combining, + bool simple, + bool dedicated, + bool tc, + bool syncops, + bool excl = false, + bool allocAll = false) { + using M = std::mutex; + if (simple) { + using Example = FcSimpleExample; + return fc_test( + nthreads, + lines, + numRecs, + work, + ops, + combining, + dedicated, + tc, + syncops, + excl, + allocAll); + } else { + using Example = FcCustomExample; + return fc_test( + nthreads, + lines, + numRecs, + work, + ops, + combining, + dedicated, + tc, + syncops, + excl, + allocAll); + } +} + +} // namespace test { +} // namespace folly {