2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <condition_variable>
24 #include <folly/Benchmark.h>
25 #include <folly/SmallLocks.h>
27 /* "Work cycle" is just an additional nop loop iteration.
28 * A smaller number of work cycles will result in more contention,
29 * which is what we're trying to measure. The relative ratio of
30 * locked to unlocked cycles will simulate how big critical sections
31 * are in production code
// Command-line knobs: `work` cycles run while holding the lock and
// `unlocked_work` cycles run between acquisitions; their ratio models how
// large critical sections are relative to uncontended work.
33 DEFINE_int32(work, 100, "Number of work cycles");
34 DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
// NOTE(review): the opening `DEFINE_int32(threads, ...` line is elided in
// this excerpt — the two lines below are its default value (hardware
// concurrency) and help string.
37 std::thread::hardware_concurrency(),
38 "Number of threads for fairness test");
// burn(): spin for `n` iterations of trivial work. doNotOptimizeAway()
// keeps the compiler from deleting the otherwise dead loop, so every
// iteration really costs one "work cycle". (Closing braces fall on lines
// elided from this excerpt.)
40 static void burn(size_t n) {
41 for (size_t i = 0; i < n; ++i) {
42 folly::doNotOptimizeAway(i);
// One-shot start rendezvous: each caller blocks until `count_` threads have
// arrived, so all benchmark threads begin working at the same instant.
48 struct SimpleBarrier {
49 explicit SimpleBarrier(size_t count) : lock_(), cv_(), count_(count) {}
// wait() (signature elided): under lock_, the arrival that brings num_ up to
// count_ presumably notifies the others; everyone else blocks on cv_ until
// the predicate num_ >= count_ holds — TODO confirm against the full file.
52 std::unique_lock<std::mutex> lockHeld(lock_);
53 if (++num_ == count_) {
56 cv_.wait(lockHeld, [&]() { return num_ >= count_; });
// Members lock_/num_/count_ are declared on lines elided from this excerpt.
62 std::condition_variable cv_;
// NOTE(review): template header of a helper struct whose body is elided in
// this excerpt — presumably the `lockstruct`/`InitLock` wrapper referenced
// by runContended/runFairness below; verify against the full file.
68 template <typename Lock>
// runContended(): throughput of `Lock` under contention. Spawns
// max(hardware_concurrency, numThreads) threads split into `threadgroups`
// groups, each group hammering its own lock instance, so exactly
// `numThreads` threads contend on any single lock.
84 template <typename Lock>
85 static void runContended(size_t numOps, size_t numThreads) {
// Suspend benchmark timing while threads and locks are being set up.
86 folly::BenchmarkSuspender braces;
87 size_t totalthreads = std::thread::hardware_concurrency();
88 if (totalthreads < numThreads) {
89 totalthreads = numThreads;
91 size_t threadgroups = totalthreads / numThreads;
// One lockstruct per group; calloc zero-initializes the storage (presumably
// the zero state the POD-style folly locks require before init()).
100 (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
104 std::vector<std::thread> threads(totalthreads);
// +1 so the main thread also participates in the start rendezvous.
106 SimpleBarrier runbarrier(totalthreads + 1);
108 for (size_t t = 0; t < totalthreads; ++t) {
109 threads[t] = std::thread([&, t, totalthreads] {
// Threads are assigned to lock groups round-robin.
110 lockstruct* lock = &locks[t % threadgroups];
112 for (size_t op = 0; op < numOps; op += 1) {
// Unlocked "think time" between acquisitions; the lock/unlock pair and the
// locked burn(FLAGS_work) fall on lines elided from this excerpt.
117 burn(FLAGS_unlocked_work);
// Re-enable timing only while the workers run and are joined.
123 braces.dismissing([&] {
124 for (auto& thr : threads) {
// runFairness(): measure how evenly `Lock` hands itself out. Threads loop
// for ~2 seconds, each recording its acquisition count plus per-acquire
// latency stats (max, sum, sum of squares); a fair lock yields a low stddev
// of acquisition counts across threads.
130 template <typename Lock>
131 static void runFairness() {
132 size_t numThreads = FLAGS_threads;
133 size_t totalthreads = std::thread::hardware_concurrency();
134 if (totalthreads < numThreads) {
135 totalthreads = numThreads;
137 long threadgroups = totalthreads / numThreads;
// Zero-initialized lockstruct per group (allocation target elided above).
144 (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));
148 std::vector<std::thread> threads(totalthreads);
// Signals workers to finish after the timed window below.
150 std::atomic<bool> stop{false};
// Per-thread results, gathered under rlock (declared on an elided line):
// acquisition counts, worst single acquire, total and squared acquire time.
153 std::vector<long> results;
154 std::vector<std::chrono::microseconds> maxes;
156 std::vector<std::chrono::microseconds> aqTime;
157 std::vector<unsigned long> aqTimeSq;
159 SimpleBarrier runbarrier(totalthreads + 1);
161 for (size_t t = 0; t < totalthreads; ++t) {
162 threads[t] = std::thread([&, t, totalthreads] {
163 lockstruct* lock = &locks[t % threadgroups];
165 std::chrono::microseconds max(0);
166 std::chrono::microseconds time(0);
167 unsigned long timeSq(0);
// Timestamp straddling the lock() call (elided) gives acquire latency.
170 std::chrono::steady_clock::time_point prelock =
171 std::chrono::steady_clock::now();
173 std::chrono::steady_clock::time_point postlock =
174 std::chrono::steady_clock::now();
175 auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
178 timeSq += diff.count() * diff.count();
185 burn(FLAGS_unlocked_work);
// Publish this thread's tallies under the shared results mutex.
188 std::lock_guard<std::mutex> g(rlock);
189 results.push_back(value);
190 maxes.push_back(max);
191 aqTime.push_back(time);
192 aqTimeSq.push_back(timeSq);
// Timed measurement window; stop flag and joins follow (partly elided).
199 std::this_thread::sleep_for(std::chrono::seconds(2));
202 for (auto& thr : threads) {
206 // Calculate summary stats over the per-thread acquisition counts.
207 unsigned long sum = std::accumulate(results.begin(), results.end(), 0.0);
208 double m = sum / results.size();
// Sample variance of acquisition counts (accum declared on an elided line).
211 std::for_each(results.begin(), results.end(), [&](const double d) {
212 accum += (d - m) * (d - m);
214 double stdev = std::sqrt(accum / (results.size() - 1));
215 std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
216 std::chrono::microseconds agAqTime = std::accumulate(
217 aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
// NOTE(review): the init value `0` is an int, so std::accumulate computes
// the sum in int — with unsigned long elements this can truncate/overflow;
// should be 0UL (or unsigned long{0}).
218 unsigned long agAqTimeSq =
219 std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0);
// Mean/stddev of per-acquire latency across all `sum` acquisitions.
220 std::chrono::microseconds mean = agAqTime / sum;
221 double variance = (sum * agAqTimeSq - (agAqTime.count() * agAqTime.count())) /
223 double stddev2 = std::sqrt(variance);
225 printf("Sum: %li Mean: %.0f stddev: %.0f\n", sum, m, stdev);
227 "Lock time stats in us: mean %li stddev %.0f max %li\n",
// Uncontended baselines: single-threaded cost of one lock()/unlock() pair
// for each lock type (loop bodies are elided in this excerpt).
233 BENCHMARK(StdMutexUncontendedBenchmark, iters) {
241 BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
242 folly::MicroSpinLock lock;
250 BENCHMARK(PicoSpinLockUncontendedBenchmark, iters) {
251 // uint8_t would be more fair, but PicoSpinLock needs at least two bytes
252 folly::PicoSpinLock<uint16_t> lock;
260 BENCHMARK(MicroLockUncontendedBenchmark, iters) {
261 folly::MicroLock lock;
// Baseline for scale: the cost of a plain virtual call, so lock overheads
// can be compared against a familiar unit. (The `struct VirtualBase {`
// opener is elided in this excerpt.)
270 virtual void foo() = 0;
271 virtual ~VirtualBase() {}
274 struct VirtualImpl : VirtualBase {
275 void foo() override { /* noop */
277 virtual ~VirtualImpl() {}
// noinline+noclone stops the compiler from inlining/devirtualizing, so the
// VirtualFunctionCall benchmark below really pays for an indirect dispatch.
281 __attribute__((noinline, noclone)) VirtualBase* makeVirtual() {
282 return new VirtualImpl();
// Times repeated vb->foo() calls (loop elided in this excerpt).
285 BENCHMARK(VirtualFunctionCall, iters) {
286 VirtualBase* vb = makeVirtual();
294 BENCHMARK_DRAW_LINE()
// Shorthand for named-parameter benchmarks: BENCH_BASE registers the 100%
// baseline of a group, BENCH_REL reports results relative to it.
296 #define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
297 #define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))
// Thin adapters binding each lock type to runContended so the BENCH_BASE /
// BENCH_REL macros below can reference them by name. InitLock presumably
// calls the lock's init() before first use — TODO confirm; its definition
// is elided from this excerpt.
299 static void std_mutex(size_t numOps, size_t numThreads) {
300 runContended<std::mutex>(numOps, numThreads);
302 static void folly_microspin(size_t numOps, size_t numThreads) {
303 runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
// uint16_t matches the PicoSpinLock uncontended benchmark above.
305 static void folly_picospin(size_t numOps, size_t numThreads) {
306 runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
308 static void folly_microlock(size_t numOps, size_t numThreads) {
309 runContended<folly::MicroLock>(numOps, numThreads);
// Contended benchmark matrix: the four lock types at 1..64 threads.
// Within each thread-count group, std_mutex is the 100% baseline and the
// folly locks are reported relative to it.
312 BENCHMARK_DRAW_LINE()
313 BENCH_BASE(std_mutex, 1thread, 1)
314 BENCH_REL(folly_microspin, 1thread, 1)
315 BENCH_REL(folly_picospin, 1thread, 1)
316 BENCH_REL(folly_microlock, 1thread, 1)
317 BENCHMARK_DRAW_LINE()
318 BENCH_BASE(std_mutex, 2thread, 2)
319 BENCH_REL(folly_microspin, 2thread, 2)
320 BENCH_REL(folly_picospin, 2thread, 2)
321 BENCH_REL(folly_microlock, 2thread, 2)
322 BENCHMARK_DRAW_LINE()
323 BENCH_BASE(std_mutex, 4thread, 4)
324 BENCH_REL(folly_microspin, 4thread, 4)
325 BENCH_REL(folly_picospin, 4thread, 4)
326 BENCH_REL(folly_microlock, 4thread, 4)
327 BENCHMARK_DRAW_LINE()
328 BENCH_BASE(std_mutex, 8thread, 8)
329 BENCH_REL(folly_microspin, 8thread, 8)
330 BENCH_REL(folly_picospin, 8thread, 8)
331 BENCH_REL(folly_microlock, 8thread, 8)
332 BENCHMARK_DRAW_LINE()
333 BENCH_BASE(std_mutex, 16thread, 16)
334 BENCH_REL(folly_microspin, 16thread, 16)
335 BENCH_REL(folly_picospin, 16thread, 16)
336 BENCH_REL(folly_microlock, 16thread, 16)
337 BENCHMARK_DRAW_LINE()
338 BENCH_BASE(std_mutex, 32thread, 32)
339 BENCH_REL(folly_microspin, 32thread, 32)
340 BENCH_REL(folly_picospin, 32thread, 32)
341 BENCH_REL(folly_microlock, 32thread, 32)
342 BENCHMARK_DRAW_LINE()
343 BENCH_BASE(std_mutex, 64thread, 64)
344 BENCH_REL(folly_microspin, 64thread, 64)
345 BENCH_REL(folly_picospin, 64thread, 64)
346 BENCH_REL(folly_microlock, 64thread, 64)
// Prints a banner with the stringized lock type, then runs the fairness
// measurement for it (macro body partly elided; continuation backslashes
// show more lines follow in the full file).
348 #define FairnessTest(type) \
350 printf(#type ": \n"); \
351 runFairness<type>(); \
// Entry point: parse gflags, print fairness stats for each lock type, then
// run the registered folly benchmarks (sample output appears in the trailing
// comment block of the full file).
354 int main(int argc, char** argv) {
355 gflags::ParseCommandLineFlags(&argc, &argv, true);
357 FairnessTest(std::mutex);
358 FairnessTest(InitLock<folly::MicroSpinLock>);
359 FairnessTest(InitLock<folly::PicoSpinLock<uint16_t>>);
360 FairnessTest(folly::MicroLock);
362 folly::runBenchmarks();
368 locks_benchmark --bm_min_iters=1000000
371 Sum: 2895849 Mean: 90495 stddev: 3147
372 Lock time stats in us: mean 11 stddev 1483 max 17823
373 InitLock<folly::MicroSpinLock>:
374 Sum: 3114365 Mean: 97323 stddev: 92604
375 Lock time stats in us: mean 19 stddev 1379 max 235710
376 InitLock<folly::PicoSpinLock<uint16_t>>:
377 Sum: 1189713 Mean: 37178 stddev: 23295
378 Lock time stats in us: mean 51 stddev 3610 max 414093
380 Sum: 1890150 Mean: 59067 stddev: 26795
381 Lock time stats in us: mean 31 stddev 2272 max 70996
382 ============================================================================
383 folly/test/SmallLocksBenchmark.cpp relative time/iter iters/s
384 ============================================================================
385 StdMutexUncontendedBenchmark 25.97ns 38.51M
386 MicroSpinLockUncontendedBenchmark 16.26ns 61.49M
387 PicoSpinLockUncontendedBenchmark 15.92ns 62.82M
388 MicroLockUncontendedBenchmark 28.54ns 35.03M
389 VirtualFunctionCall 1.73ns 576.51M
390 ----------------------------------------------------------------------------
391 ----------------------------------------------------------------------------
392 std_mutex(1thread) 898.49ns 1.11M
393 folly_microspin(1thread) 87.68% 1.02us 975.85K
394 folly_picospin(1thread) 95.06% 945.15ns 1.06M
395 folly_microlock(1thread) 87.34% 1.03us 972.05K
396 ----------------------------------------------------------------------------
397 std_mutex(2thread) 1.29us 773.03K
398 folly_microspin(2thread) 90.35% 1.43us 698.40K
399 folly_picospin(2thread) 126.11% 1.03us 974.90K
400 folly_microlock(2thread) 109.99% 1.18us 850.24K
401 ----------------------------------------------------------------------------
402 std_mutex(4thread) 3.03us 330.05K
403 folly_microspin(4thread) 94.03% 3.22us 310.33K
404 folly_picospin(4thread) 127.16% 2.38us 419.70K
405 folly_microlock(4thread) 64.39% 4.71us 212.51K
406 ----------------------------------------------------------------------------
407 std_mutex(8thread) 4.79us 208.81K
408 folly_microspin(8thread) 71.50% 6.70us 149.30K
409 folly_picospin(8thread) 54.58% 8.77us 113.96K
410 folly_microlock(8thread) 55.58% 8.62us 116.05K
411 ----------------------------------------------------------------------------
412 std_mutex(16thread) 12.14us 82.39K
413 folly_microspin(16thread) 102.02% 11.90us 84.05K
414 folly_picospin(16thread) 62.30% 19.48us 51.33K
415 folly_microlock(16thread) 64.54% 18.81us 53.17K
416 ----------------------------------------------------------------------------
417 std_mutex(32thread) 22.52us 44.41K
418 folly_microspin(32thread) 88.25% 25.52us 39.19K
419 folly_picospin(32thread) 46.04% 48.91us 20.45K
420 folly_microlock(32thread) 67.08% 33.57us 29.79K
421 ----------------------------------------------------------------------------
422 std_mutex(64thread) 43.29us 23.10K
423 folly_microspin(64thread) 98.01% 44.17us 22.64K
424 folly_picospin(64thread) 48.62% 89.04us 11.23K
425 folly_microlock(64thread) 62.68% 69.07us 14.48K
426 ============================================================================