2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <condition_variable>
24 #include <folly/Benchmark.h>
25 #include <folly/SmallLocks.h>
27 /* "Work cycle" is just an additional nop loop iteration.
28 * A smaller number of work cycles will result in more contention,
29 * which is what we're trying to measure. The relative ratio of
30 * locked to unlocked cycles will simulate how big critical sections
31 * are in production code
// Command-line knobs: `work` cycles run while holding the lock and
// `unlocked_work` cycles run between acquisitions; their ratio models how
// large critical sections are relative to uncontended work.
33 DEFINE_int32(work, 100, "Number of work cycles");
34 DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
// NOTE(review): the opening `DEFINE_int32(threads, ...` line is elided in
// this excerpt — the two lines below are its default value (hardware
// concurrency) and help string.
37 std::thread::hardware_concurrency(),
38 "Number of threads for fairness test");
// burn(): spin for `n` iterations of trivial work. doNotOptimizeAway()
// keeps the compiler from deleting the otherwise dead loop, so every
// iteration really costs one "work cycle". (Closing braces fall on lines
// elided from this excerpt.)
40 static void burn(size_t n) {
41 for (size_t i = 0; i < n; ++i) {
42 folly::doNotOptimizeAway(i);
// One-shot start rendezvous: each caller blocks until `count_` threads have
// arrived, so all benchmark threads begin working at the same instant.
48 struct SimpleBarrier {
49 explicit SimpleBarrier(size_t count) : lock_(), cv_(), count_(count) {}
// wait() (signature elided): under lock_, the arrival that brings num_ up to
// count_ presumably notifies the others; everyone else blocks on cv_ until
// the predicate num_ >= count_ holds — TODO confirm against the full file.
52 std::unique_lock<std::mutex> lockHeld(lock_);
53 if (++num_ == count_) {
56 cv_.wait(lockHeld, [&]() { return num_ >= count_; });
// Members lock_/num_/count_ are declared on lines elided from this excerpt.
62 std::condition_variable cv_;
// NOTE(review): template header of a helper struct whose body is elided in
// this excerpt — presumably the `lockstruct`/`InitLock` wrapper referenced
// by runContended/runFairness below; verify against the full file.
68 template <typename Lock>
// runContended(): throughput of `Lock` under contention. Spawns
// max(hardware_concurrency, numThreads) threads split into `threadgroups`
// groups, each group hammering its own lock instance, so exactly
// `numThreads` threads contend on any single lock.
84 template <typename Lock>
85 static void runContended(size_t numOps, size_t numThreads) {
// Suspend benchmark timing while threads and locks are being set up.
86 folly::BenchmarkSuspender braces;
87 size_t totalthreads = std::thread::hardware_concurrency();
88 if (totalthreads < numThreads) {
89 totalthreads = numThreads;
91 size_t threadgroups = totalthreads / numThreads;
// One lockstruct per group; calloc zero-initializes the storage (presumably
// the zero state the POD-style folly locks require before init()).
100 (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
104 std::vector<std::thread> threads(totalthreads);
// +1 so the main thread also participates in the start rendezvous.
106 SimpleBarrier runbarrier(totalthreads + 1);
108 for (size_t t = 0; t < totalthreads; ++t) {
109 threads[t] = std::thread([&, t, totalthreads] {
// Threads are assigned to lock groups round-robin.
110 lockstruct* lock = &locks[t % threadgroups];
112 for (size_t op = 0; op < numOps; op += 1) {
// Unlocked "think time" between acquisitions; the lock/unlock pair and the
// locked burn(FLAGS_work) fall on lines elided from this excerpt.
117 burn(FLAGS_unlocked_work);
// Re-enable timing only while the workers run and are joined.
123 braces.dismissing([&] {
124 for (auto& thr : threads) {
// runFairness(): measure how evenly `Lock` hands itself out. Threads loop
// for ~2 seconds, each recording its acquisition count plus per-acquire
// latency stats (max, sum, sum of squares); a fair lock yields a low stddev
// of acquisition counts across threads.
130 template <typename Lock>
131 static void runFairness() {
132 size_t numThreads = FLAGS_threads;
133 size_t totalthreads = std::thread::hardware_concurrency();
134 if (totalthreads < numThreads) {
135 totalthreads = numThreads;
137 long threadgroups = totalthreads / numThreads;
// Zero-initialized lockstruct per group (allocation target elided above).
144 (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
148 std::vector<std::thread> threads(totalthreads);
// Signals workers to finish after the timed window below.
150 std::atomic<bool> stop{false};
// Per-thread results, gathered under rlock (declared on an elided line):
// acquisition counts, worst single acquire, total and squared acquire time.
153 std::vector<long> results;
154 std::vector<std::chrono::microseconds> maxes;
156 std::vector<std::chrono::microseconds> aqTime;
157 std::vector<unsigned long> aqTimeSq;
159 SimpleBarrier runbarrier(totalthreads + 1);
161 for (size_t t = 0; t < totalthreads; ++t) {
162 threads[t] = std::thread([&, t, totalthreads] {
163 lockstruct* lock = &locks[t % threadgroups];
165 std::chrono::microseconds max(0);
166 std::chrono::microseconds time(0);
167 unsigned long timeSq(0);
// Timestamp straddling the lock() call (elided) gives acquire latency.
170 std::chrono::steady_clock::time_point prelock =
171 std::chrono::steady_clock::now();
173 std::chrono::steady_clock::time_point postlock =
174 std::chrono::steady_clock::now();
175 auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
178 timeSq += diff.count() * diff.count();
185 burn(FLAGS_unlocked_work);
// Publish this thread's tallies under the shared results mutex.
188 std::lock_guard<std::mutex> g(rlock);
189 results.push_back(value);
190 maxes.push_back(max);
191 aqTime.push_back(time);
192 aqTimeSq.push_back(timeSq);
// Timed measurement window; stop flag and joins follow (partly elided).
199 std::this_thread::sleep_for(std::chrono::seconds(2));
202 for (auto& thr : threads) {
206 // Calculate summary stats over the per-thread acquisition counts.
207 unsigned long sum = std::accumulate(results.begin(), results.end(), 0.0);
208 double m = sum / results.size();
// Sample variance of acquisition counts (accum declared on an elided line).
211 std::for_each(results.begin(), results.end(), [&](const double d) {
212 accum += (d - m) * (d - m);
214 double stdev = std::sqrt(accum / (results.size() - 1));
215 std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
216 std::chrono::microseconds agAqTime = std::accumulate(
217 aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
// NOTE(review): the init value `0` is an int, so std::accumulate computes
// the sum in int — with unsigned long elements this can truncate/overflow;
// should be 0UL (or unsigned long{0}).
218 unsigned long agAqTimeSq =
219 std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0);
// Mean/stddev of per-acquire latency across all `sum` acquisitions.
220 std::chrono::microseconds mean = agAqTime / sum;
221 double variance = (sum * agAqTimeSq - (agAqTime.count() * agAqTime.count())) /
223 double stddev2 = std::sqrt(variance);
225 printf("Sum: %li Mean: %.0f stddev: %.0f\n", sum, m, stdev);
227 "Lock time stats in us: mean %li stddev %.0f max %li\n",
// Uncontended baselines: single-threaded cost of one lock()/unlock() pair
// for each lock type (loop bodies are elided in this excerpt).
233 BENCHMARK(StdMutexUncontendedBenchmark, iters) {
241 BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
242 folly::MicroSpinLock lock;
250 BENCHMARK(PicoSpinLockUncontendedBenchmark, iters) {
251 // uint8_t would be more fair, but PicoSpinLock needs at least two bytes
252 folly::PicoSpinLock<uint16_t> lock;
260 BENCHMARK(MicroLockUncontendedBenchmark, iters) {
261 folly::MicroLock lock;
// Baseline for scale: the cost of a plain virtual call, so lock overheads
// can be compared against a familiar unit. (The `struct VirtualBase {`
// opener is elided in this excerpt.)
270 virtual void foo() = 0;
271 virtual ~VirtualBase() {}
274 struct VirtualImpl : VirtualBase {
275 void foo() override { /* noop */
277 virtual ~VirtualImpl() {}
// noinline+noclone stops the compiler from inlining/devirtualizing, so the
// VirtualFunctionCall benchmark below really pays for an indirect dispatch.
281 __attribute__((noinline, noclone)) VirtualBase* makeVirtual() {
282 return new VirtualImpl();
// Times repeated vb->foo() calls (loop elided in this excerpt).
285 BENCHMARK(VirtualFunctionCall, iters) {
286 VirtualBase* vb = makeVirtual();
294 BENCHMARK_DRAW_LINE()
// Shorthand for named-parameter benchmarks: BENCH_BASE registers the 100%
// baseline of a group, BENCH_REL reports results relative to it.
296 #define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
297 #define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))
// Thin adapters binding each lock type to runContended so the BENCH_BASE /
// BENCH_REL macros below can reference them by name. InitLock presumably
// calls the lock's init() before first use — TODO confirm; its definition
// is elided from this excerpt.
299 static void std_mutex(size_t numOps, size_t numThreads) {
300 runContended<std::mutex>(numOps, numThreads);
302 static void folly_microspin(size_t numOps, size_t numThreads) {
303 runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
// uint16_t matches the PicoSpinLock uncontended benchmark above.
305 static void folly_picospin(size_t numOps, size_t numThreads) {
306 runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
308 static void folly_microlock(size_t numOps, size_t numThreads) {
309 runContended<folly::MicroLock>(numOps, numThreads);
// Contended benchmark matrix: the four lock types at 1..64 threads.
// Within each thread-count group, std_mutex is the 100% baseline and the
// folly locks are reported relative to it.
312 BENCHMARK_DRAW_LINE()
313 BENCH_BASE(std_mutex, 1thread, 1)
314 BENCH_REL(folly_microspin, 1thread, 1)
315 BENCH_REL(folly_picospin, 1thread, 1)
316 BENCH_REL(folly_microlock, 1thread, 1)
317 BENCHMARK_DRAW_LINE()
318 BENCH_BASE(std_mutex, 2thread, 2)
319 BENCH_REL(folly_microspin, 2thread, 2)
320 BENCH_REL(folly_picospin, 2thread, 2)
321 BENCH_REL(folly_microlock, 2thread, 2)
322 BENCHMARK_DRAW_LINE()
323 BENCH_BASE(std_mutex, 4thread, 4)
324 BENCH_REL(folly_microspin, 4thread, 4)
325 BENCH_REL(folly_picospin, 4thread, 4)
326 BENCH_REL(folly_microlock, 4thread, 4)
327 BENCHMARK_DRAW_LINE()
328 BENCH_BASE(std_mutex, 8thread, 8)
329 BENCH_REL(folly_microspin, 8thread, 8)
330 BENCH_REL(folly_picospin, 8thread, 8)
331 BENCH_REL(folly_microlock, 8thread, 8)
332 BENCHMARK_DRAW_LINE()
333 BENCH_BASE(std_mutex, 16thread, 16)
334 BENCH_REL(folly_microspin, 16thread, 16)
335 BENCH_REL(folly_picospin, 16thread, 16)
336 BENCH_REL(folly_microlock, 16thread, 16)
337 BENCHMARK_DRAW_LINE()
338 BENCH_BASE(std_mutex, 32thread, 32)
339 BENCH_REL(folly_microspin, 32thread, 32)
340 BENCH_REL(folly_picospin, 32thread, 32)
341 BENCH_REL(folly_microlock, 32thread, 32)
342 BENCHMARK_DRAW_LINE()
343 BENCH_BASE(std_mutex, 64thread, 64)
344 BENCH_REL(folly_microspin, 64thread, 64)
345 BENCH_REL(folly_picospin, 64thread, 64)
346 BENCH_REL(folly_microlock, 64thread, 64)
// Prints a banner with the stringized lock type, then runs the fairness
// measurement for it (macro body partly elided; continuation backslashes
// show more lines follow in the full file).
348 #define FairnessTest(type) \
350 printf(#type ": \n"); \
351 runFairness<type>(); \
// Entry point: parse gflags, print fairness stats for each lock type, then
// run the registered folly benchmarks (sample output appears in the trailing
// comment block of the full file).
354 int main(int argc, char** argv) {
355 gflags::ParseCommandLineFlags(&argc, &argv, true);
357 FairnessTest(std::mutex);
358 FairnessTest(InitLock<folly::MicroSpinLock>);
359 FairnessTest(InitLock<folly::PicoSpinLock<uint16_t>>);
360 FairnessTest(folly::MicroLock);
362 folly::runBenchmarks();
368 locks_benchmark --bm_min_iters=1000000
371 Sum: 2895849 Mean: 90495 stddev: 3147
372 Lock time stats in us: mean 11 stddev 1483 max 17823
373 InitLock<folly::MicroSpinLock>:
374 Sum: 3114365 Mean: 97323 stddev: 92604
375 Lock time stats in us: mean 19 stddev 1379 max 235710
376 InitLock<folly::PicoSpinLock<uint16_t>>:
377 Sum: 1189713 Mean: 37178 stddev: 23295
378 Lock time stats in us: mean 51 stddev 3610 max 414093
380 Sum: 1890150 Mean: 59067 stddev: 26795
381 Lock time stats in us: mean 31 stddev 2272 max 70996
382 ============================================================================
383 folly/test/SmallLocksBenchmark.cpp relative time/iter iters/s
384 ============================================================================
385 StdMutexUncontendedBenchmark 25.97ns 38.51M
386 MicroSpinLockUncontendedBenchmark 16.26ns 61.49M
387 PicoSpinLockUncontendedBenchmark 15.92ns 62.82M
388 MicroLockUncontendedBenchmark 28.54ns 35.03M
389 VirtualFunctionCall 1.73ns 576.51M
390 ----------------------------------------------------------------------------
391 ----------------------------------------------------------------------------
392 std_mutex(1thread) 898.49ns 1.11M
393 folly_microspin(1thread) 87.68% 1.02us 975.85K
394 folly_picospin(1thread) 95.06% 945.15ns 1.06M
395 folly_microlock(1thread) 87.34% 1.03us 972.05K
396 ----------------------------------------------------------------------------
397 std_mutex(2thread) 1.29us 773.03K
398 folly_microspin(2thread) 90.35% 1.43us 698.40K
399 folly_picospin(2thread) 126.11% 1.03us 974.90K
400 folly_microlock(2thread) 109.99% 1.18us 850.24K
401 ----------------------------------------------------------------------------
402 std_mutex(4thread) 3.03us 330.05K
403 folly_microspin(4thread) 94.03% 3.22us 310.33K
404 folly_picospin(4thread) 127.16% 2.38us 419.70K
405 folly_microlock(4thread) 64.39% 4.71us 212.51K
406 ----------------------------------------------------------------------------
407 std_mutex(8thread) 4.79us 208.81K
408 folly_microspin(8thread) 71.50% 6.70us 149.30K
409 folly_picospin(8thread) 54.58% 8.77us 113.96K
410 folly_microlock(8thread) 55.58% 8.62us 116.05K
411 ----------------------------------------------------------------------------
412 std_mutex(16thread) 12.14us 82.39K
413 folly_microspin(16thread) 102.02% 11.90us 84.05K
414 folly_picospin(16thread) 62.30% 19.48us 51.33K
415 folly_microlock(16thread) 64.54% 18.81us 53.17K
416 ----------------------------------------------------------------------------
417 std_mutex(32thread) 22.52us 44.41K
418 folly_microspin(32thread) 88.25% 25.52us 39.19K
419 folly_picospin(32thread) 46.04% 48.91us 20.45K
420 folly_microlock(32thread) 67.08% 33.57us 29.79K
421 ----------------------------------------------------------------------------
422 std_mutex(64thread) 43.29us 23.10K
423 folly_microspin(64thread) 98.01% 44.17us 22.64K
424 folly_picospin(64thread) 48.62% 89.04us 11.23K
425 folly_microlock(64thread) 62.68% 69.07us 14.48K
426 ============================================================================