/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <folly/ThreadCachedInt.h>

#include <atomic>
#include <condition_variable>
#include <mutex>
#include <thread>

#include <glog/logging.h>

#include <folly/Benchmark.h>
#include <folly/Hash.h>
#include <folly/portability/GFlags.h>
#include <folly/portability/GTest.h>
#include <folly/portability/Unistd.h>

using namespace folly;

using std::unique_ptr;
using std::vector;
using Counter = ThreadCachedInt<int64_t>;

class ThreadCachedIntTest : public testing::Test {
 public:
  // Reads the total that has been flushed into the counter's shared target by
  // threads that have already died; readFast() does not include values still
  // sitting in live threads' caches.
  uint32_t GetDeadThreadsTotal(const Counter& counter) {
    return counter.readFast();
  }
};
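
// Illustrative sketch (the test name is hypothetical, not part of the
// original suite): with a huge cacheSize, increments are never flushed on
// their own, so readFast() only observes totals folded in at thread death.
// That is exactly the property GetDeadThreadsTotal() above relies on.
TEST_F(ThreadCachedIntTest, DeadThreadTotalSketch) {
  Counter c(0, UINT32_MAX);                // cache threshold too large to flush
  std::thread t([&] { c.increment(5); }); // stays in t's local cache...
  t.join(); // ...until thread death flushes it into the shared target
  EXPECT_EQ(5, GetDeadThreadsTotal(c));   // readFast() now sees the 5
}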
// Multithreaded tests.  Creates a specified number of threads each of
// which iterates a different amount and dies.

namespace {
// Set cacheSize to be large so cached data moves to target_ only when
// the thread dies.
Counter g_counter_for_mt_slow(0, UINT32_MAX);
Counter g_counter_for_mt_fast(0, UINT32_MAX);

// Used to sync between threads.  The value of this variable is the
// maximum iteration index up to which Runner() is allowed to go.
uint32_t g_sync_for_mt(0);
std::condition_variable cv;
std::mutex cv_m;
// Performs the specified number of iterations.  Within each
// iteration, it increments the counter 10 times.  At the beginning of
// each iteration it checks g_sync_for_mt to see if it can proceed,
// otherwise it blocks on the condition variable until it can.
void Runner(Counter* counter, uint32_t iterations) {
  for (uint32_t i = 0; i < iterations; ++i) {
    std::unique_lock<std::mutex> lk(cv_m);
    cv.wait(lk, [i] { return i < g_sync_for_mt; });
    for (uint32_t j = 0; j < 10; ++j) {
      counter->increment(1);
    }
  }
}
} // namespace
// Slow test with fewer threads where there are more busy waits and
// many calls to readFull().  This attempts to exercise as many of the
// code paths in Counter as possible to ensure that counter values are
// properly passed from thread local state, both at calls to
// readFull() and at thread death.
TEST_F(ThreadCachedIntTest, MultithreadedSlow) {
  static constexpr uint32_t kNumThreads = 20;
  g_sync_for_mt = 0;
  vector<unique_ptr<std::thread>> threads(kNumThreads);
  // Creates kNumThreads threads.  Each thread performs a different
  // number of iterations in Runner() - threads[0] performs 1
  // iteration, threads[1] performs 2 iterations, threads[2] performs
  // 3 iterations, and so on.
  for (uint32_t i = 0; i < kNumThreads; ++i) {
    threads[i].reset(new std::thread(Runner, &g_counter_for_mt_slow, i + 1));
  }
  // Variable to grab the current counter value.
  int32_t counter_value;
  // The expected value of the counter.
  int32_t total = 0;
  // The expected value of GetDeadThreadsTotal().
  int32_t dead_total = 0;
  // Each iteration of the following loop allows one additional
  // iteration of the threads.  Given that the threads perform
  // different numbers of iterations from 1 through kNumThreads, one
  // thread will complete in each of the iterations of the loop below.
  for (uint32_t i = 0; i < kNumThreads; ++i) {
    // Allow up to iteration i on all threads.
    {
      std::lock_guard<std::mutex> lk(cv_m);
      g_sync_for_mt = i + 1;
    }
    cv.notify_all();
    total += (kNumThreads - i) * 10;
    // Loop until the counter reaches its expected value.
    do {
      counter_value = g_counter_for_mt_slow.readFull();
    } while (counter_value < total);
    // All threads have done what they can until iteration i, now make
    // sure they don't go further by checking 10 more times in the
    // following loop.
    for (uint32_t j = 0; j < 10; ++j) {
      counter_value = g_counter_for_mt_slow.readFull();
      EXPECT_EQ(total, counter_value);
    }

    dead_total += (i + 1) * 10;
    EXPECT_GE(dead_total, GetDeadThreadsTotal(g_counter_for_mt_slow));
  }
  // All threads are done.
  for (uint32_t i = 0; i < kNumThreads; ++i) {
    threads[i]->join();
  }

  counter_value = g_counter_for_mt_slow.readFull();
  EXPECT_EQ(total, counter_value);
  EXPECT_EQ(total, dead_total);
  EXPECT_EQ(dead_total, GetDeadThreadsTotal(g_counter_for_mt_slow));
}
// Fast test with lots of threads and only one call to readFull()
// at the end.
TEST_F(ThreadCachedIntTest, MultithreadedFast) {
  static constexpr uint32_t kNumThreads = 1000;
  g_sync_for_mt = 0;
  vector<unique_ptr<std::thread>> threads(kNumThreads);
  // Creates kNumThreads threads.  Each thread performs a different
  // number of iterations in Runner() - threads[0] performs 1
  // iteration, threads[1] performs 2 iterations, threads[2] performs
  // 3 iterations, and so on.
  for (uint32_t i = 0; i < kNumThreads; ++i) {
    threads[i].reset(new std::thread(Runner, &g_counter_for_mt_fast, i + 1));
  }
  // Let the threads run to completion.
  {
    std::lock_guard<std::mutex> lk(cv_m);
    g_sync_for_mt = kNumThreads;
  }
  cv.notify_all();
  // The expected value of the counter.
  uint32_t total = 0;
  for (uint32_t i = 0; i < kNumThreads; ++i) {
    total += (kNumThreads - i) * 10;
  }
  // Wait for all threads to complete.
  for (uint32_t i = 0; i < kNumThreads; ++i) {
    threads[i]->join();
  }
  int32_t counter_value = g_counter_for_mt_fast.readFull();
  EXPECT_EQ(total, counter_value);
  EXPECT_EQ(total, GetDeadThreadsTotal(g_counter_for_mt_fast));
}
TEST(ThreadCachedInt, SingleThreadedNotCached) {
  ThreadCachedInt<int64_t> val(0, 0);
  EXPECT_EQ(0, val.readFast());
  ++val;
  EXPECT_EQ(1, val.readFast());
  for (int i = 0; i < 41; ++i) {
    val.increment(1);
  }
  EXPECT_EQ(42, val.readFast());
  --val;
  EXPECT_EQ(41, val.readFast());
}
// Note: This is somewhat fragile to the implementation.  If this causes
// problems, feel free to remove it.
TEST(ThreadCachedInt, SingleThreadedCached) {
  ThreadCachedInt<int64_t> val(0, 10);
  EXPECT_EQ(0, val.readFast());
  ++val;
  EXPECT_EQ(0, val.readFast());
  for (int i = 0; i < 7; ++i) {
    ++val;
  }
  // 8 increments total, all below the cacheSize of 10: nothing has been
  // flushed to the shared target, so only readFull() sees them.
  EXPECT_EQ(0, val.readFast());
  EXPECT_EQ(0, val.readFastAndReset());
  EXPECT_EQ(8, val.readFull());
  EXPECT_EQ(8, val.readFullAndReset());
  EXPECT_EQ(0, val.readFull());
  EXPECT_EQ(0, val.readFast());
}
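
// A sketch of the usage pattern ThreadCachedInt targets, based on the API
// exercised above (the names here are hypothetical): increment cheaply on the
// hot path from many threads, and pay for aggregation only on the rare read.
ThreadCachedInt<int64_t> g_requestsServed(0, 1000); // flush ~every 1000 updates

void onRequest() {
  g_requestsServed.increment(1); // usually touches only a thread-local cache
}

int64_t requestsServedSoFar() {
  return g_requestsServed.readFull(); // slower, but aggregates all caches
}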
ThreadCachedInt<int32_t> globalInt32(0, 11);
ThreadCachedInt<int64_t> globalInt64(0, 11);
int kNumInserts = 100000;
DEFINE_int32(numThreads, 8, "Number of simultaneous threads for benchmarks.");
#define CREATE_INC_FUNC(size)                       \
  void incFunc ## size () {                         \
    const int num = kNumInserts / FLAGS_numThreads; \
    for (int i = 0; i < num; ++i) {                 \
      ++globalInt ## size ;                         \
    }                                               \
  }
CREATE_INC_FUNC(64);
CREATE_INC_FUNC(32);
// Confirms counts are accurate with competing threads.
TEST(ThreadCachedInt, MultiThreadedCached) {
  kNumInserts = 100000;
  CHECK_EQ(0, kNumInserts % FLAGS_numThreads) <<
    "FLAGS_numThreads must evenly divide kNumInserts (" << kNumInserts << ").";
  const int numPerThread = kNumInserts / FLAGS_numThreads;
  ThreadCachedInt<int64_t> TCInt64(0, numPerThread - 2);
  {
    std::atomic<bool> run(true);
    std::atomic<int> threadsDone(0);
    std::vector<std::thread> threads;
    for (int i = 0; i < FLAGS_numThreads; ++i) {
      threads.push_back(std::thread([&] {
        FOR_EACH_RANGE(k, 0, numPerThread) {
          ++TCInt64;
        }
        std::atomic_fetch_add(&threadsDone, 1);
        while (run.load()) { usleep(100); }
      }));
    }

    // We create and increment another ThreadCachedInt here to make sure it
    // doesn't interact with the other instances.
    ThreadCachedInt<int64_t> otherTCInt64(0, 10);
    otherTCInt64.set(33);
    ++otherTCInt64;

    while (threadsDone.load() < FLAGS_numThreads) { usleep(100); }

    ++otherTCInt64;

    // Threads are done incrementing, but caches have not been flushed yet, so
    // we have to readFull.
    EXPECT_NE(kNumInserts, TCInt64.readFast());
    EXPECT_EQ(kNumInserts, TCInt64.readFull());

    run.store(false);
    for (auto& t : threads) {
      t.join();
    }
  } // Caches are flushed when threads finish.
  EXPECT_EQ(kNumInserts, TCInt64.readFast());
}
#define MAKE_MT_CACHE_SIZE_BM(size)                          \
  void BM_mt_cache_size ## size (int iters, int cacheSize) { \
    kNumInserts = iters;                                     \
    globalInt ## size.set(0);                                \
    globalInt ## size.setCacheSize(cacheSize);               \
    std::vector<std::thread> threads;                        \
    for (int i = 0; i < FLAGS_numThreads; ++i) {             \
      threads.push_back(std::thread(incFunc ## size));       \
    }                                                        \
    for (auto& t : threads) {                                \
      t.join();                                              \
    }                                                        \
  }
MAKE_MT_CACHE_SIZE_BM(64);
MAKE_MT_CACHE_SIZE_BM(32);
#define REG_BASELINE(name, inc_stmt)                        \
  BENCHMARK(FB_CONCATENATE(BM_mt_baseline_, name), iters) { \
    const int iterPerThread = iters / FLAGS_numThreads;     \
    std::vector<std::thread> threads;                       \
    for (int i = 0; i < FLAGS_numThreads; ++i) {            \
      threads.push_back(std::thread([&]() {                 \
        for (int j = 0; j < iterPerThread; ++j) {           \
          inc_stmt;                                         \
        }                                                   \
      }));                                                  \
    }                                                       \
    for (auto& t : threads) {                               \
      t.join();                                             \
    }                                                       \
  }
ThreadLocal<int64_t> globalTL64Baseline;
ThreadLocal<int32_t> globalTL32Baseline;
std::atomic<int64_t> globalInt64Baseline(0);
std::atomic<int32_t> globalInt32Baseline(0);
FOLLY_TLS int64_t global__thread64;
FOLLY_TLS int32_t global__thread32;
// Alternate lock-free implementation.  Achieves about the same performance,
// but uses about 20x more memory than ThreadCachedInt with 24 threads.
struct ShardedAtomicInt {
  static const int64_t kBuckets_ = 2048;
  std::atomic<int64_t> ints_[kBuckets_];

  inline void inc(int64_t val = 1) {
    int bucket = hash::twang_mix64(
        uint64_t(pthread_self())) & (kBuckets_ - 1);
    std::atomic_fetch_add(&ints_[bucket], val);
  }

  // Read the first few buckets and extrapolate.
  int64_t readFast() {
    int64_t ret = 0;
    static const int numToRead = 8;
    FOR_EACH_RANGE(i, 0, numToRead) {
      ret += ints_[i].load(std::memory_order_relaxed);
    }
    return ret * (kBuckets_ / numToRead);
  }

  // readFull is lock-free, but has to do thousands of loads...
  int64_t readFull() {
    int64_t ret = 0;
    for (auto& i : ints_) {
      // Fun fact - using memory_order_consume below reduces perf 30-40% in
      // high contention benchmarks.
      ret += i.load(std::memory_order_relaxed);
    }
    return ret;
  }
};
ShardedAtomicInt shd_int64;
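
// Back-of-envelope for the "20x more memory" comment above: the shards alone
// occupy kBuckets_ * sizeof(std::atomic<int64_t>) = 2048 * 8 bytes = 16 KB per
// counter regardless of thread count, versus roughly one cached int64_t per
// live thread (plus ThreadLocal bookkeeping) for ThreadCachedInt.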
REG_BASELINE(_thread64, global__thread64 += 1);
REG_BASELINE(_thread32, global__thread32 += 1);
REG_BASELINE(ThreadLocal64, *globalTL64Baseline += 1);
REG_BASELINE(ThreadLocal32, *globalTL32Baseline += 1);
REG_BASELINE(atomic_inc64,
             std::atomic_fetch_add(&globalInt64Baseline, int64_t(1)));
REG_BASELINE(atomic_inc32,
             std::atomic_fetch_add(&globalInt32Baseline, int32_t(1)));
REG_BASELINE(ShardedAtm64, shd_int64.inc());
BENCHMARK_PARAM(BM_mt_cache_size64, 0);
BENCHMARK_PARAM(BM_mt_cache_size64, 10);
BENCHMARK_PARAM(BM_mt_cache_size64, 100);
BENCHMARK_PARAM(BM_mt_cache_size64, 1000);
BENCHMARK_PARAM(BM_mt_cache_size32, 0);
BENCHMARK_PARAM(BM_mt_cache_size32, 10);
BENCHMARK_PARAM(BM_mt_cache_size32, 100);
BENCHMARK_PARAM(BM_mt_cache_size32, 1000);
BENCHMARK_DRAW_LINE();
// Single-threaded read benchmarks.
BENCHMARK(Atomic_readFull) {
  doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed));
}
BENCHMARK(ThrCache_readFull) {
  doNotOptimizeAway(globalInt64.readFull());
}
BENCHMARK(Sharded_readFull) {
  doNotOptimizeAway(shd_int64.readFull());
}
BENCHMARK(ThrCache_readFast) {
  doNotOptimizeAway(globalInt64.readFast());
}
BENCHMARK(Sharded_readFast) {
  doNotOptimizeAway(shd_int64.readFast());
}
BENCHMARK_DRAW_LINE();
// Multi-threaded read benchmarks (FLAGS_numThreads concurrent readers).
REG_BASELINE(Atomic_readFull,
    doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed)));
REG_BASELINE(ThrCache_readFull, doNotOptimizeAway(globalInt64.readFull()));
REG_BASELINE(Sharded_readFull, doNotOptimizeAway(shd_int64.readFull()));
REG_BASELINE(ThrCache_readFast, doNotOptimizeAway(globalInt64.readFast()));
REG_BASELINE(Sharded_readFast, doNotOptimizeAway(shd_int64.readFast()));
BENCHMARK_DRAW_LINE();
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  gflags::SetCommandLineOptionWithMode(
      "bm_min_usec", "10000", gflags::SET_FLAG_IF_DEFAULT);
  if (FLAGS_benchmark) {
    folly::runBenchmarks();
  }
  return RUN_ALL_TESTS();
}
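
// Usage sketch (binary name illustrative; --benchmark comes from
// folly/Benchmark.h, --numThreads is defined above): running the binary
// executes the tests, and passing --benchmark also runs the benchmark suite:
//   ./thread_cached_int_test --benchmark --numThreads=20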
/*
 Ran with 20 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches

 Benchmark                                 Iters   Total t    t/iter  iter/sec
 ------------------------------------------------------------------------------
 + 103% BM_mt_baseline__thread64          10000000  13.54 ms  1.354 ns  704.4 M
 *      BM_mt_baseline__thread32          10000000  6.651 ms  665.1 ps    1.4 G
 +50.3% BM_mt_baseline_ThreadLocal64      10000000  9.994 ms  999.4 ps  954.2 M
 +49.9% BM_mt_baseline_ThreadLocal32      10000000  9.972 ms  997.2 ps  956.4 M
 +2650% BM_mt_baseline_atomic_inc64       10000000  182.9 ms  18.29 ns  52.13 M
 +2665% BM_mt_baseline_atomic_inc32       10000000  183.9 ms  18.39 ns  51.85 M
 +75.3% BM_mt_baseline_ShardedAtm64       10000000  11.66 ms  1.166 ns  817.8 M
 +6670% BM_mt_cache_size64/0              10000000  450.3 ms  45.03 ns  21.18 M
 +1644% BM_mt_cache_size64/10             10000000    116 ms   11.6 ns   82.2 M
 + 381% BM_mt_cache_size64/100            10000000  32.04 ms  3.204 ns  297.7 M
 + 129% BM_mt_cache_size64/1000           10000000  15.24 ms  1.524 ns  625.8 M
 +6052% BM_mt_cache_size32/0              10000000  409.2 ms  40.92 ns  23.31 M
 +1304% BM_mt_cache_size32/10             10000000  93.39 ms  9.339 ns  102.1 M
 + 298% BM_mt_cache_size32/100            10000000  26.52 ms  2.651 ns  359.7 M
 +68.1% BM_mt_cache_size32/1000           10000000  11.18 ms  1.118 ns  852.9 M
 ------------------------------------------------------------------------------
 +10.4% Atomic_readFull                   10000000  36.05 ms  3.605 ns  264.5 M
 + 619% ThrCache_readFull                 10000000  235.1 ms  23.51 ns  40.57 M
 SLOW   Sharded_readFull                   1981093      2 s    1.01 us  967.3 k
 *      ThrCache_readFast                 10000000  32.65 ms  3.265 ns  292.1 M
 +10.0% Sharded_readFast                  10000000  35.92 ms  3.592 ns  265.5 M
 ------------------------------------------------------------------------------
 +4.54% BM_mt_baseline_Atomic_readFull    10000000  8.672 ms  867.2 ps  1.074 G
 SLOW   BM_mt_baseline_ThrCache_readFull  10000000  996.9 ms  99.69 ns  9.567 M
 SLOW   BM_mt_baseline_Sharded_readFull   10000000  891.5 ms  89.15 ns   10.7 M
 *      BM_mt_baseline_ThrCache_readFast  10000000  8.295 ms  829.5 ps  1.123 G
 +12.7% BM_mt_baseline_Sharded_readFast   10000000  9.348 ms  934.8 ps   1020 M
 ------------------------------------------------------------------------------
*/