#include <folly/Benchmark.h>
#include <folly/SmallLocks.h>
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <condition_variable>
+#include <cstdio>
+#include <mutex>
+#include <numeric>
+#include <thread>
+#include <vector>
+
+/* "Work cycle" is just an additional nop loop iteration.
+ * A smaller number of work cyles will result in more contention,
+ * which is what we're trying to measure. The relative ratio of
+ * locked to unlocked cycles will simulate how big critical sections
+ * are in production code
+ */
+DEFINE_int32(work, 100, "Number of work cycles");
+DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
+DEFINE_int32(
+ threads,
+ std::thread::hardware_concurrency(),
+ "Number of threads for fairness test");
+
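+// Spin for n iterations. doNotOptimizeAway keeps the compiler from
+// eliding the loop, so each iteration costs real work.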
+static void burn(size_t n) {
+ for (size_t i = 0; i < n; ++i) {
+ folly::doNotOptimizeAway(i);
+ }
+}
+
+namespace {
+
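+// One-shot barrier: the last thread to arrive wakes everyone blocked in
+// wait(). Used to line up all worker threads before measurement begins.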
+struct SimpleBarrier {
+  explicit SimpleBarrier(size_t count) : count_(count) {}
+
+ void wait() {
+ std::unique_lock<std::mutex> lockHeld(lock_);
+ if (++num_ == count_) {
+ cv_.notify_all();
+ } else {
+      cv_.wait(lockHeld, [&] { return num_ == count_; });
+ }
+ }
+
+ private:
+ std::mutex lock_;
+ std::condition_variable cv_;
+ size_t num_{0};
+ size_t count_;
+};
+} // namespace
+
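+// folly's small locks are PODs with no constructor and must be init()ed
+// explicitly; this wrapper gives them a std::mutex-style interface so the
+// same benchmark templates work for every lock type.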
+template <typename Lock>
+class InitLock {
+ Lock lock_;
+
+ public:
+ InitLock() {
+ lock_.init();
+ }
+ void lock() {
+ lock_.lock();
+ }
+ void unlock() {
+ lock_.unlock();
+ }
+};
+
+template <typename Lock>
+static void runContended(size_t numOps, size_t numThreads) {
+ folly::BenchmarkSuspender braces;
+ size_t totalthreads = std::thread::hardware_concurrency();
+ if (totalthreads < numThreads) {
+ totalthreads = numThreads;
+ }
+ size_t threadgroups = totalthreads / numThreads;
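+  // Pad generously so each lock and its counter sit on their own cache
+  // lines; otherwise false sharing between thread groups would distort
+  // the contention being measured.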
+ struct lockstruct {
+ char padding1[128];
+ Lock lock;
+ char padding2[128];
+ long value = 1;
+ };
+
+ auto locks =
+ (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
+
+ char padding3[128];
+ (void)padding3;
+ std::vector<std::thread> threads(totalthreads);
+
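+  // totalthreads + 1: the main thread joins the barrier too, so every
+  // worker starts at the same instant once setup is complete.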
+ SimpleBarrier runbarrier(totalthreads + 1);
+
+ for (size_t t = 0; t < totalthreads; ++t) {
+ threads[t] = std::thread([&, t, totalthreads] {
+ lockstruct* lock = &locks[t % threadgroups];
+ runbarrier.wait();
+ for (size_t op = 0; op < numOps; op += 1) {
+ lock->lock.lock();
+ burn(FLAGS_work);
+ lock->value++;
+ lock->lock.unlock();
+ burn(FLAGS_unlocked_work);
+ }
+ });
+ }
+
+ runbarrier.wait();
+ braces.dismissing([&] {
+ for (auto& thr : threads) {
+ thr.join();
+ }
+  });
+  free(locks);
+}
+
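+// Fairness test: threads hammer a shared lock for a fixed wall-clock
+// interval while we record per-thread acquisition counts and lock-wait
+// times; a wide spread in the counts indicates an unfair lock.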
+template <typename Lock>
+static void runFairness() {
+ size_t numThreads = FLAGS_threads;
+ size_t totalthreads = std::thread::hardware_concurrency();
+ if (totalthreads < numThreads) {
+ totalthreads = numThreads;
+ }
+ long threadgroups = totalthreads / numThreads;
+ struct lockstruct {
+ char padding1[128];
+ Lock lock;
+ };
+
+ auto locks =
+ (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
+
+ char padding3[64];
+ (void)padding3;
+ std::vector<std::thread> threads(totalthreads);
+
+ std::atomic<bool> stop{false};
+
+ std::mutex rlock;
+ std::vector<long> results;
+ std::vector<std::chrono::microseconds> maxes;
+
+ std::vector<std::chrono::microseconds> aqTime;
+ std::vector<unsigned long> aqTimeSq;
+
+ SimpleBarrier runbarrier(totalthreads + 1);
+
+ for (size_t t = 0; t < totalthreads; ++t) {
+ threads[t] = std::thread([&, t, totalthreads] {
+ lockstruct* lock = &locks[t % threadgroups];
+ long value = 0;
+ std::chrono::microseconds max(0);
+ std::chrono::microseconds time(0);
+ unsigned long timeSq(0);
+ runbarrier.wait();
+ while (!stop) {
+ std::chrono::steady_clock::time_point prelock =
+ std::chrono::steady_clock::now();
+ lock->lock.lock();
+ std::chrono::steady_clock::time_point postlock =
+ std::chrono::steady_clock::now();
+ auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
+ postlock - prelock);
+ time += diff;
+ timeSq += diff.count() * diff.count();
+ if (diff > max) {
+ max = diff;
+ }
+ burn(FLAGS_work);
+ value++;
+ lock->lock.unlock();
+ burn(FLAGS_unlocked_work);
+ }
+ {
+ std::lock_guard<std::mutex> g(rlock);
+ results.push_back(value);
+ maxes.push_back(max);
+ aqTime.push_back(time);
+ aqTimeSq.push_back(timeSq);
+ }
+ });
+ }
+
+ runbarrier.wait();
+ /* sleep override */
+ std::this_thread::sleep_for(std::chrono::seconds(2));
+ stop = true;
+
+ for (auto& thr : threads) {
+ thr.join();
+  }
+
+  free(locks);
+
+  // Calculate some stats
+  unsigned long sum = std::accumulate(results.begin(), results.end(), 0UL);
+  double m = static_cast<double>(sum) / results.size();
+
+ double accum = 0.0;
+ std::for_each(results.begin(), results.end(), [&](const double d) {
+ accum += (d - m) * (d - m);
+ });
+ double stdev = sqrt(accum / (results.size() - 1));
+ std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
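+  // Sample variance of lock-wait times via the computational formula
+  //   Var = (n * sum(x^2) - (sum(x))^2) / (n * (n - 1)),
+  // where n is the total number of lock acquisitions.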
+ std::chrono::microseconds agAqTime = std::accumulate(
+ aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
+  unsigned long agAqTimeSq =
+      std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0UL);
+ std::chrono::microseconds mean = agAqTime / sum;
+  double variance =
+      (static_cast<double>(sum) * agAqTimeSq -
+       static_cast<double>(agAqTime.count()) * agAqTime.count()) /
+      sum / (sum - 1);
+ double stddev2 = sqrt(variance);
+
+ printf("Sum: %li Mean: %.0f stddev: %.0f\n", sum, m, stdev);
+ printf(
+ "Lock time stats in us: mean %li stddev %.0f max %li\n",
+ mean.count(),
+ stddev2,
+ mx.count());
+}
+
+BENCHMARK(StdMutexUncontendedBenchmark, iters) {
+ std::mutex lock;
+ while (iters--) {
+ lock.lock();
+ lock.unlock();
+ }
+}
BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
  folly::MicroSpinLock lock;
  lock.init();
  while (iters--) {
    lock.lock();
    lock.unlock();
  }
}
-BENCHMARK(StdMutexUncontendedBenchmark, iters) {
- std::mutex lock;
- while (iters--) {
- lock.lock();
- lock.unlock();
- }
-}
-
struct VirtualBase {
virtual void foo() = 0;
virtual ~VirtualBase() {}
};
#endif
+BENCHMARK_DRAW_LINE()
+
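+// BENCH_BASE registers a baseline benchmark; BENCH_REL registers the folly
+// locks relative to it, so the results table reports them as a percentage
+// of the std::mutex baseline.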
+#define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
+#define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))
+
+static void std_mutex(size_t numOps, size_t numThreads) {
+ runContended<std::mutex>(numOps, numThreads);
+}
+static void folly_microspin(size_t numOps, size_t numThreads) {
+ runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
+}
+static void folly_picospin(size_t numOps, size_t numThreads) {
+ runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
+}
+static void folly_microlock(size_t numOps, size_t numThreads) {
+ runContended<folly::MicroLock>(numOps, numThreads);
+}
+
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(std_mutex, 1thread, 1)
+BENCH_REL(folly_microspin, 1thread, 1)
+BENCH_REL(folly_picospin, 1thread, 1)
+BENCH_REL(folly_microlock, 1thread, 1)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(std_mutex, 2thread, 2)
+BENCH_REL(folly_microspin, 2thread, 2)
+BENCH_REL(folly_picospin, 2thread, 2)
+BENCH_REL(folly_microlock, 2thread, 2)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(std_mutex, 4thread, 4)
+BENCH_REL(folly_microspin, 4thread, 4)
+BENCH_REL(folly_picospin, 4thread, 4)
+BENCH_REL(folly_microlock, 4thread, 4)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(std_mutex, 8thread, 8)
+BENCH_REL(folly_microspin, 8thread, 8)
+BENCH_REL(folly_picospin, 8thread, 8)
+BENCH_REL(folly_microlock, 8thread, 8)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(std_mutex, 16thread, 16)
+BENCH_REL(folly_microspin, 16thread, 16)
+BENCH_REL(folly_picospin, 16thread, 16)
+BENCH_REL(folly_microlock, 16thread, 16)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(std_mutex, 32thread, 32)
+BENCH_REL(folly_microspin, 32thread, 32)
+BENCH_REL(folly_picospin, 32thread, 32)
+BENCH_REL(folly_microlock, 32thread, 32)
+BENCHMARK_DRAW_LINE()
+BENCH_BASE(std_mutex, 64thread, 64)
+BENCH_REL(folly_microspin, 64thread, 64)
+BENCH_REL(folly_picospin, 64thread, 64)
+BENCH_REL(folly_microlock, 64thread, 64)
+
+#define FairnessTest(type) \
+ { \
+ printf(#type ": \n"); \
+ runFairness<type>(); \
+ }
+
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+ FairnessTest(std::mutex);
+ FairnessTest(InitLock<folly::MicroSpinLock>);
+ FairnessTest(InitLock<folly::PicoSpinLock<uint16_t>>);
+ FairnessTest(folly::MicroLock);
+
folly::runBenchmarks();
+
return 0;
}
+
+/*
+locks_benchmark --bm_min_iters=1000000
+
+std::mutex:
+Sum: 2895849 Mean: 90495 stddev: 3147
+Lock time stats in us: mean 11 stddev 1483 max 17823
+InitLock<folly::MicroSpinLock>:
+Sum: 3114365 Mean: 97323 stddev: 92604
+Lock time stats in us: mean 19 stddev 1379 max 235710
+InitLock<folly::PicoSpinLock<uint16_t>>:
+Sum: 1189713 Mean: 37178 stddev: 23295
+Lock time stats in us: mean 51 stddev 3610 max 414093
+folly::MicroLock:
+Sum: 1890150 Mean: 59067 stddev: 26795
+Lock time stats in us: mean 31 stddev 2272 max 70996
+============================================================================
+folly/test/SmallLocksBenchmark.cpp relative time/iter iters/s
+============================================================================
+StdMutexUncontendedBenchmark 25.97ns 38.51M
+MicroSpinLockUncontendedBenchmark 16.26ns 61.49M
+PicoSpinLockUncontendedBenchmark 15.92ns 62.82M
+MicroLockUncontendedBenchmark 28.54ns 35.03M
+VirtualFunctionCall 1.73ns 576.51M
+----------------------------------------------------------------------------
+----------------------------------------------------------------------------
+std_mutex(1thread) 898.49ns 1.11M
+folly_microspin(1thread) 87.68% 1.02us 975.85K
+folly_picospin(1thread) 95.06% 945.15ns 1.06M
+folly_microlock(1thread) 87.34% 1.03us 972.05K
+----------------------------------------------------------------------------
+std_mutex(2thread) 1.29us 773.03K
+folly_microspin(2thread) 90.35% 1.43us 698.40K
+folly_picospin(2thread) 126.11% 1.03us 974.90K
+folly_microlock(2thread) 109.99% 1.18us 850.24K
+----------------------------------------------------------------------------
+std_mutex(4thread) 3.03us 330.05K
+folly_microspin(4thread) 94.03% 3.22us 310.33K
+folly_picospin(4thread) 127.16% 2.38us 419.70K
+folly_microlock(4thread) 64.39% 4.71us 212.51K
+----------------------------------------------------------------------------
+std_mutex(8thread) 4.79us 208.81K
+folly_microspin(8thread) 71.50% 6.70us 149.30K
+folly_picospin(8thread) 54.58% 8.77us 113.96K
+folly_microlock(8thread) 55.58% 8.62us 116.05K
+----------------------------------------------------------------------------
+std_mutex(16thread) 12.14us 82.39K
+folly_microspin(16thread) 102.02% 11.90us 84.05K
+folly_picospin(16thread) 62.30% 19.48us 51.33K
+folly_microlock(16thread) 64.54% 18.81us 53.17K
+----------------------------------------------------------------------------
+std_mutex(32thread) 22.52us 44.41K
+folly_microspin(32thread) 88.25% 25.52us 39.19K
+folly_picospin(32thread) 46.04% 48.91us 20.45K
+folly_microlock(32thread) 67.08% 33.57us 29.79K
+----------------------------------------------------------------------------
+std_mutex(64thread) 43.29us 23.10K
+folly_microspin(64thread) 98.01% 44.17us 22.64K
+folly_picospin(64thread) 48.62% 89.04us 11.23K
+folly_microlock(64thread) 62.68% 69.07us 14.48K
+============================================================================
+*/