From efc07505fad326d69aa0f96b8d2e74c722dd98a5 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 8 Dec 2016 10:28:21 -0800 Subject: [PATCH] update small locks benchmark Summary: Add fairness benchmark, contended benchmark Reviewed By: davidtgoldblatt Differential Revision: D3706680 fbshipit-source-id: 01814fcd6b0cc3dd12815a2b2c248abc6a3771ec --- folly/test/SmallLocksBenchmark.cpp | 356 ++++++++++++++++++++++++++++- 1 file changed, 348 insertions(+), 8 deletions(-) diff --git a/folly/test/SmallLocksBenchmark.cpp b/folly/test/SmallLocksBenchmark.cpp index 7d67883d..9b86adf3 100644 --- a/folly/test/SmallLocksBenchmark.cpp +++ b/folly/test/SmallLocksBenchmark.cpp @@ -16,6 +16,225 @@ #include #include +#include +#include +#include +#include +#include + +/* "Work cycle" is just an additional nop loop iteration. + * A smaller number of work cyles will result in more contention, + * which is what we're trying to measure. The relative ratio of + * locked to unlocked cycles will simulate how big critical sections + * are in production code + */ +DEFINE_int32(work, 100, "Number of work cycles"); +DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles"); +DEFINE_int32( + threads, + std::thread::hardware_concurrency(), + "Number of threads for fairness test"); + +static void burn(size_t n) { + for (size_t i = 0; i < n; ++i) { + folly::doNotOptimizeAway(i); + } +} + +namespace { + +struct SimpleBarrier { + explicit SimpleBarrier(size_t count) : lock_(), cv_(), count_(count) {} + + void wait() { + std::unique_lock lockHeld(lock_); + if (++num_ == count_) { + cv_.notify_all(); + } else { + cv_.wait(lockHeld); + } + } + + private: + std::mutex lock_; + std::condition_variable cv_; + size_t num_{0}; + size_t count_; +}; +} + +template +class InitLock { + Lock lock_; + + public: + InitLock() { + lock_.init(); + } + void lock() { + lock_.lock(); + } + void unlock() { + lock_.unlock(); + } +}; + +template +static void runContended(size_t numOps, size_t numThreads) { + folly::BenchmarkSuspender braces; + size_t totalthreads = std::thread::hardware_concurrency(); + if (totalthreads < numThreads) { + totalthreads = numThreads; + } + size_t threadgroups = totalthreads / numThreads; + struct lockstruct { + char padding1[128]; + Lock lock; + char padding2[128]; + long value = 1; + }; + + auto locks = + (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct)); + + char padding3[128]; + (void)padding3; + std::vector threads(totalthreads); + + SimpleBarrier runbarrier(totalthreads + 1); + + for (size_t t = 0; t < totalthreads; ++t) { + threads[t] = std::thread([&, t, totalthreads] { + lockstruct* lock = &locks[t % threadgroups]; + runbarrier.wait(); + for (size_t op = 0; op < numOps; op += 1) { + lock->lock.lock(); + burn(FLAGS_work); + lock->value++; + lock->lock.unlock(); + burn(FLAGS_unlocked_work); + } + }); + } + + runbarrier.wait(); + braces.dismissing([&] { + for (auto& thr : threads) { + thr.join(); + } + }); +} + +template +static void runFairness() { + size_t numThreads = FLAGS_threads; + size_t totalthreads = std::thread::hardware_concurrency(); + if (totalthreads < numThreads) { + totalthreads = numThreads; + } + long threadgroups = totalthreads / numThreads; + struct lockstruct { + char padding1[128]; + Lock lock; + }; + + auto locks = + (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct)); + + char padding3[64]; + (void)padding3; + std::vector threads(totalthreads); + + std::atomic stop{false}; + + std::mutex rlock; + std::vector results; + std::vector maxes; + + std::vector aqTime; + std::vector aqTimeSq; + + SimpleBarrier runbarrier(totalthreads + 1); + + for (size_t t = 0; t < totalthreads; ++t) { + threads[t] = std::thread([&, t, totalthreads] { + lockstruct* lock = &locks[t % threadgroups]; + long value = 0; + std::chrono::microseconds max(0); + std::chrono::microseconds time(0); + unsigned long timeSq(0); + runbarrier.wait(); + while (!stop) { + std::chrono::steady_clock::time_point prelock = + std::chrono::steady_clock::now(); + lock->lock.lock(); + std::chrono::steady_clock::time_point postlock = + std::chrono::steady_clock::now(); + auto diff = std::chrono::duration_cast( + postlock - prelock); + time += diff; + timeSq += diff.count() * diff.count(); + if (diff > max) { + max = diff; + } + burn(FLAGS_work); + value++; + lock->lock.unlock(); + burn(FLAGS_unlocked_work); + } + { + std::lock_guard g(rlock); + results.push_back(value); + maxes.push_back(max); + aqTime.push_back(time); + aqTimeSq.push_back(timeSq); + } + }); + } + + runbarrier.wait(); + /* sleep override */ + std::this_thread::sleep_for(std::chrono::seconds(2)); + stop = true; + + for (auto& thr : threads) { + thr.join(); + } + + // Calulate some stats + unsigned long sum = std::accumulate(results.begin(), results.end(), 0.0); + double m = sum / results.size(); + + double accum = 0.0; + std::for_each(results.begin(), results.end(), [&](const double d) { + accum += (d - m) * (d - m); + }); + double stdev = sqrt(accum / (results.size() - 1)); + std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end()); + std::chrono::microseconds agAqTime = std::accumulate( + aqTime.begin(), aqTime.end(), std::chrono::microseconds(0)); + unsigned long agAqTimeSq = + std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0); + std::chrono::microseconds mean = agAqTime / sum; + double variance = (sum * agAqTimeSq - (agAqTime.count() * agAqTime.count())) / + sum / (sum - 1); + double stddev2 = sqrt(variance); + + printf("Sum: %li Mean: %.0f stddev: %.0f\n", sum, m, stdev); + printf( + "Lock time stats in us: mean %li stddev %.0f max %li\n", + mean.count(), + stddev2, + mx.count()); +} + +BENCHMARK(StdMutexUncontendedBenchmark, iters) { + std::mutex lock; + while (iters--) { + lock.lock(); + lock.unlock(); + } +} BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) { folly::MicroSpinLock lock; @@ -45,14 +264,6 @@ BENCHMARK(MicroLockUncontendedBenchmark, iters) { } } -BENCHMARK(StdMutexUncontendedBenchmark, iters) { - std::mutex lock; - while (iters--) { - lock.lock(); - lock.unlock(); - } -} - struct VirtualBase { virtual void foo() = 0; virtual ~VirtualBase() {} @@ -78,8 +289,137 @@ BENCHMARK(VirtualFunctionCall, iters) { } #endif +BENCHMARK_DRAW_LINE() + +#define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__)) +#define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__)) + +static void std_mutex(size_t numOps, size_t numThreads) { + runContended(numOps, numThreads); +} +static void folly_microspin(size_t numOps, size_t numThreads) { + runContended>(numOps, numThreads); +} +static void folly_picospin(size_t numOps, size_t numThreads) { + runContended>>(numOps, numThreads); +} +static void folly_microlock(size_t numOps, size_t numThreads) { + runContended(numOps, numThreads); +} + +BENCHMARK_DRAW_LINE() +BENCH_BASE(std_mutex, 1thread, 1) +BENCH_REL(folly_microspin, 1thread, 1) +BENCH_REL(folly_picospin, 1thread, 1) +BENCH_REL(folly_microlock, 1thread, 1) +BENCHMARK_DRAW_LINE() +BENCH_BASE(std_mutex, 2thread, 2) +BENCH_REL(folly_microspin, 2thread, 2) +BENCH_REL(folly_picospin, 2thread, 2) +BENCH_REL(folly_microlock, 2thread, 2) +BENCHMARK_DRAW_LINE() +BENCH_BASE(std_mutex, 4thread, 4) +BENCH_REL(folly_microspin, 4thread, 4) +BENCH_REL(folly_picospin, 4thread, 4) +BENCH_REL(folly_microlock, 4thread, 4) +BENCHMARK_DRAW_LINE() +BENCH_BASE(std_mutex, 8thread, 8) +BENCH_REL(folly_microspin, 8thread, 8) +BENCH_REL(folly_picospin, 8thread, 8) +BENCH_REL(folly_microlock, 8thread, 8) +BENCHMARK_DRAW_LINE() +BENCH_BASE(std_mutex, 16thread, 16) +BENCH_REL(folly_microspin, 16thread, 16) +BENCH_REL(folly_picospin, 16thread, 16) +BENCH_REL(folly_microlock, 16thread, 16) +BENCHMARK_DRAW_LINE() +BENCH_BASE(std_mutex, 32thread, 32) +BENCH_REL(folly_microspin, 32thread, 32) +BENCH_REL(folly_picospin, 32thread, 32) +BENCH_REL(folly_microlock, 32thread, 32) +BENCHMARK_DRAW_LINE() +BENCH_BASE(std_mutex, 64thread, 64) +BENCH_REL(folly_microspin, 64thread, 64) +BENCH_REL(folly_picospin, 64thread, 64) +BENCH_REL(folly_microlock, 64thread, 64) + +#define FairnessTest(type) \ + { \ + printf(#type ": \n"); \ + runFairness(); \ + } + int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); + + FairnessTest(std::mutex); + FairnessTest(InitLock); + FairnessTest(InitLock>); + FairnessTest(folly::MicroLock); + folly::runBenchmarks(); + return 0; } + +/* +locks_benchmark --bm_min_iters=1000000 + +std::mutex: +Sum: 2895849 Mean: 90495 stddev: 3147 +Lock time stats in us: mean 11 stddev 1483 max 17823 +InitLock: +Sum: 3114365 Mean: 97323 stddev: 92604 +Lock time stats in us: mean 19 stddev 1379 max 235710 +InitLock>: +Sum: 1189713 Mean: 37178 stddev: 23295 +Lock time stats in us: mean 51 stddev 3610 max 414093 +folly::MicroLock: +Sum: 1890150 Mean: 59067 stddev: 26795 +Lock time stats in us: mean 31 stddev 2272 max 70996 +============================================================================ +folly/test/SmallLocksBenchmark.cpp relative time/iter iters/s +============================================================================ +StdMutexUncontendedBenchmark 25.97ns 38.51M +MicroSpinLockUncontendedBenchmark 16.26ns 61.49M +PicoSpinLockUncontendedBenchmark 15.92ns 62.82M +MicroLockUncontendedBenchmark 28.54ns 35.03M +VirtualFunctionCall 1.73ns 576.51M +---------------------------------------------------------------------------- +---------------------------------------------------------------------------- +std_mutex(1thread) 898.49ns 1.11M +folly_microspin(1thread) 87.68% 1.02us 975.85K +folly_picospin(1thread) 95.06% 945.15ns 1.06M +folly_microlock(1thread) 87.34% 1.03us 972.05K +---------------------------------------------------------------------------- +std_mutex(2thread) 1.29us 773.03K +folly_microspin(2thread) 90.35% 1.43us 698.40K +folly_picospin(2thread) 126.11% 1.03us 974.90K +folly_microlock(2thread) 109.99% 1.18us 850.24K +---------------------------------------------------------------------------- +std_mutex(4thread) 3.03us 330.05K +folly_microspin(4thread) 94.03% 3.22us 310.33K +folly_picospin(4thread) 127.16% 2.38us 419.70K +folly_microlock(4thread) 64.39% 4.71us 212.51K +---------------------------------------------------------------------------- +std_mutex(8thread) 4.79us 208.81K +folly_microspin(8thread) 71.50% 6.70us 149.30K +folly_picospin(8thread) 54.58% 8.77us 113.96K +folly_microlock(8thread) 55.58% 8.62us 116.05K +---------------------------------------------------------------------------- +std_mutex(16thread) 12.14us 82.39K +folly_microspin(16thread) 102.02% 11.90us 84.05K +folly_picospin(16thread) 62.30% 19.48us 51.33K +folly_microlock(16thread) 64.54% 18.81us 53.17K +---------------------------------------------------------------------------- +std_mutex(32thread) 22.52us 44.41K +folly_microspin(32thread) 88.25% 25.52us 39.19K +folly_picospin(32thread) 46.04% 48.91us 20.45K +folly_microlock(32thread) 67.08% 33.57us 29.79K +---------------------------------------------------------------------------- +std_mutex(64thread) 43.29us 23.10K +folly_microspin(64thread) 98.01% 44.17us 22.64K +folly_picospin(64thread) 48.62% 89.04us 11.23K +folly_microlock(64thread) 62.68% 69.07us 14.48K +============================================================================ +*/ -- 2.34.1