/*
 * Copyright 2013 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 #include "folly/ThreadCachedInt.h"
18 #include "folly/Hash.h"
22 #include <gtest/gtest.h>
23 #include <gflags/gflags.h>
24 #include <glog/logging.h>
25 #include "folly/Benchmark.h"
27 using namespace folly;
TEST(ThreadCachedInt, SingleThreadedNotCached) {
  ThreadCachedInt<int64_t> val(0, 0);
  EXPECT_EQ(0, val.readFast());
  ++val;
  EXPECT_EQ(1, val.readFast());
  for (int i = 0; i < 41; ++i) {
    val.increment(1);
  }
  EXPECT_EQ(42, val.readFast());
  --val;
  EXPECT_EQ(41, val.readFast());
}
// Note: This is somewhat fragile to the implementation. If this causes
// problems, feel free to remove it.
TEST(ThreadCachedInt, SingleThreadedCached) {
  ThreadCachedInt<int64_t> val(0, 10);
  EXPECT_EQ(0, val.readFast());
  ++val;
  EXPECT_EQ(0, val.readFast());
  for (int i = 0; i < 7; ++i) {
    val.increment(1);
  }
  // With a cache size of 10, the 8 increments above are still buffered in the
  // per-thread cache: readFast() only sees the flushed global value, while
  // readFull() also sums the pending per-thread counts.
  EXPECT_EQ(0, val.readFast());
  EXPECT_EQ(0, val.readFastAndReset());
  EXPECT_EQ(8, val.readFull());
  EXPECT_EQ(8, val.readFullAndReset());
  EXPECT_EQ(0, val.readFull());
  EXPECT_EQ(0, val.readFast());
}
ThreadCachedInt<int32_t> globalInt32(0, 11);
ThreadCachedInt<int64_t> globalInt64(0, 11);
int kNumInserts = 100000;
DEFINE_int32(numThreads, 8, "Number of simultaneous threads for benchmarks.");
#define CREATE_INC_FUNC(size)                            \
  void incFunc ## size () {                              \
    const int num = kNumInserts / FLAGS_numThreads;      \
    for (int i = 0; i < num; ++i) {                      \
      ++globalInt ## size ;                              \
    }                                                    \
  }
CREATE_INC_FUNC(64);
CREATE_INC_FUNC(32);
// Confirms counts are accurate with competing threads.
TEST(ThreadCachedInt, MultiThreadedCached) {
  CHECK_EQ(0, kNumInserts % FLAGS_numThreads) <<
    "FLAGS_numThreads must evenly divide kNumInserts (" << kNumInserts << ").";
  const int numPerThread = kNumInserts / FLAGS_numThreads;
  ThreadCachedInt<int64_t> TCInt64(0, numPerThread - 2);
  {
    std::atomic<bool> run(true);
    std::atomic<int> threadsDone(0);
    std::vector<std::thread> threads;
    for (int i = 0; i < FLAGS_numThreads; ++i) {
      threads.push_back(std::thread([&] {
        FOR_EACH_RANGE(k, 0, numPerThread) {
          ++TCInt64;
        }
        std::atomic_fetch_add(&threadsDone, 1);
        while (run.load()) { usleep(100); }
      }));
    }

    // We create and increment another ThreadCachedInt here to make sure it
    // doesn't interact with the other instances.
    ThreadCachedInt<int64_t> otherTCInt64(0, 10);
    ++otherTCInt64;

    while (threadsDone.load() < FLAGS_numThreads) { usleep(100); }

    // Threads are done incrementing, but caches have not been flushed yet, so
    // we have to readFull.
    EXPECT_NE(kNumInserts, TCInt64.readFast());
    EXPECT_EQ(kNumInserts, TCInt64.readFull());

    // Release the worker threads and join them.
    run.store(false);
    for (auto& t : threads) {
      t.join();
    }
  }  // Caches are flushed when threads finish
  EXPECT_EQ(kNumInserts, TCInt64.readFast());
}
#define MAKE_MT_CACHE_SIZE_BM(size)                          \
  void BM_mt_cache_size ## size (int iters, int cacheSize) { \
    kNumInserts = iters;                                      \
    globalInt ## size.set(0);                                 \
    globalInt ## size.setCacheSize(cacheSize);                \
    std::vector<std::thread> threads;                         \
    for (int i = 0; i < FLAGS_numThreads; ++i) {              \
      threads.push_back(std::thread(incFunc ## size));        \
    }                                                         \
    for (auto& t : threads) {                                 \
      t.join();                                               \
    }                                                         \
  }
MAKE_MT_CACHE_SIZE_BM(64);
MAKE_MT_CACHE_SIZE_BM(32);
#define REG_BASELINE(name, inc_stmt)                          \
  BENCHMARK(FB_CONCATENATE(BM_mt_baseline_, name), iters) {   \
    const int iterPerThread = iters / FLAGS_numThreads;       \
    std::vector<std::thread> threads;                         \
    for (int i = 0; i < FLAGS_numThreads; ++i) {              \
      threads.push_back(std::thread([&]() {                   \
        for (int i = 0; i < iterPerThread; ++i) {             \
          inc_stmt;                                           \
        }                                                     \
      }));                                                    \
    }                                                         \
    for (auto& t : threads) {                                 \
      t.join();                                               \
    }                                                         \
  }
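// Illustrative note (not in the original source): an invocation such as
// REG_BASELINE(ShardedAtm64, shd_int64.inc()) registers a benchmark named
// BM_mt_baseline_ShardedAtm64 that splits the iteration budget evenly across
// FLAGS_numThreads threads, each running shd_int64.inc() in a tight loop.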
ThreadLocal<int64_t> globalTL64Baseline;
ThreadLocal<int32_t> globalTL32Baseline;
std::atomic<int64_t> globalInt64Baseline(0);
std::atomic<int32_t> globalInt32Baseline(0);
__thread int64_t global__thread64;
__thread int32_t global__thread32;
// Alternate lock-free implementation. Achieves about the same performance,
// but uses about 20x more memory than ThreadCachedInt with 24 threads.
struct ShardedAtomicInt {
  static const int64_t kBuckets_ = 2048;
  std::atomic<int64_t> ints_[kBuckets_];

  inline void inc(int64_t val = 1) {
    int bucket = hash::twang_mix64(pthread_self()) & (kBuckets_ - 1);
    std::atomic_fetch_add(&ints_[bucket], val);
  }

  // Read the first few buckets and extrapolate.
  int64_t readFast() {
    int64_t ret = 0;
    static const int numToRead = 8;
    FOR_EACH_RANGE(i, 0, numToRead) {
      ret += ints_[i].load(std::memory_order_relaxed);
    }
    return ret * (kBuckets_ / numToRead);
  }

  // readFull is lock-free, but has to do thousands of loads...
  int64_t readFull() {
    int64_t ret = 0;
    for (auto& i : ints_) {
      // Fun fact - using memory_order_consume below reduces perf 30-40% in
      // high contention benchmarks.
      ret += i.load(std::memory_order_relaxed);
    }
    return ret;
  }
};
ShardedAtomicInt shd_int64;
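// Rough arithmetic behind the "20x more memory" comment above (my estimate,
// assuming 8-byte std::atomic<int64_t> entries with no padding): the shard
// array is always kBuckets_ * 8 bytes = 2048 * 8 = 16 KB regardless of thread
// count; taking the 20x figure at face value, ThreadCachedInt's per-thread
// caches for 24 threads would total roughly 16 KB / 20 ~= 800 bytes.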
REG_BASELINE(_thread64, global__thread64 += 1);
REG_BASELINE(_thread32, global__thread32 += 1);
REG_BASELINE(ThreadLocal64, *globalTL64Baseline += 1);
REG_BASELINE(ThreadLocal32, *globalTL32Baseline += 1);
REG_BASELINE(atomic_inc64, std::atomic_fetch_add(&globalInt64Baseline, 1L));
REG_BASELINE(atomic_inc32, std::atomic_fetch_add(&globalInt32Baseline, 1));
REG_BASELINE(ShardedAtm64, shd_int64.inc());
BENCHMARK_PARAM(BM_mt_cache_size64, 0);
BENCHMARK_PARAM(BM_mt_cache_size64, 10);
BENCHMARK_PARAM(BM_mt_cache_size64, 100);
BENCHMARK_PARAM(BM_mt_cache_size64, 1000);
BENCHMARK_PARAM(BM_mt_cache_size32, 0);
BENCHMARK_PARAM(BM_mt_cache_size32, 10);
BENCHMARK_PARAM(BM_mt_cache_size32, 100);
BENCHMARK_PARAM(BM_mt_cache_size32, 1000);
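// Note (added for clarity): the second argument of each BENCHMARK_PARAM above
// is forwarded as the cacheSize parameter of BM_mt_cache_size{64,32}, so these
// entries sweep the ThreadCachedInt cache size from 0 (effectively uncached,
// as in the SingleThreadedNotCached test) up to 1000.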
BENCHMARK_DRAW_LINE();
BENCHMARK(Atomic_readFull) {
  doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed));
}
BENCHMARK(ThrCache_readFull) {
  doNotOptimizeAway(globalInt64.readFull());
}
BENCHMARK(Sharded_readFull) {
  doNotOptimizeAway(shd_int64.readFull());
}
BENCHMARK(ThrCache_readFast) {
  doNotOptimizeAway(globalInt64.readFast());
}
BENCHMARK(Sharded_readFast) {
  doNotOptimizeAway(shd_int64.readFast());
}
BENCHMARK_DRAW_LINE();
REG_BASELINE(Atomic_readFull,
             doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed)));
REG_BASELINE(ThrCache_readFull, doNotOptimizeAway(globalInt64.readFull()));
REG_BASELINE(Sharded_readFull, doNotOptimizeAway(shd_int64.readFull()));
REG_BASELINE(ThrCache_readFast, doNotOptimizeAway(globalInt64.readFast()));
REG_BASELINE(Sharded_readFast, doNotOptimizeAway(shd_int64.readFast()));
BENCHMARK_DRAW_LINE();
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  google::ParseCommandLineFlags(&argc, &argv, true);
  google::SetCommandLineOptionWithMode(
    "bm_min_usec", "10000", google::SET_FLAG_IF_DEFAULT
  );
  if (FLAGS_benchmark) {
    folly::runBenchmarks();
  }
  return RUN_ALL_TESTS();
}
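// Example invocation (binary name is hypothetical; --benchmark is defined by
// folly/Benchmark.h and --numThreads above):
//
//   ./thread_cached_int_test --benchmark --numThreads=20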
/*
 Ran with 20 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches

 Benchmark                                   Iters   Total t    t/iter  iter/sec
 -------------------------------------------------------------------------------
 + 103% BM_mt_baseline__thread64          10000000  13.54 ms  1.354 ns   704.4 M
 *      BM_mt_baseline__thread32          10000000  6.651 ms  665.1 ps     1.4 G
 +50.3% BM_mt_baseline_ThreadLocal64      10000000  9.994 ms  999.4 ps   954.2 M
 +49.9% BM_mt_baseline_ThreadLocal32      10000000  9.972 ms  997.2 ps   956.4 M
 +2650% BM_mt_baseline_atomic_inc64       10000000  182.9 ms  18.29 ns   52.13 M
 +2665% BM_mt_baseline_atomic_inc32       10000000  183.9 ms  18.39 ns   51.85 M
 +75.3% BM_mt_baseline_ShardedAtm64       10000000  11.66 ms  1.166 ns   817.8 M
 +6670% BM_mt_cache_size64/0              10000000  450.3 ms  45.03 ns   21.18 M
 +1644% BM_mt_cache_size64/10             10000000    116 ms   11.6 ns    82.2 M
 + 381% BM_mt_cache_size64/100            10000000  32.04 ms  3.204 ns   297.7 M
 + 129% BM_mt_cache_size64/1000           10000000  15.24 ms  1.524 ns   625.8 M
 +6052% BM_mt_cache_size32/0              10000000  409.2 ms  40.92 ns   23.31 M
 +1304% BM_mt_cache_size32/10             10000000  93.39 ms  9.339 ns   102.1 M
 + 298% BM_mt_cache_size32/100            10000000  26.52 ms  2.651 ns   359.7 M
 +68.1% BM_mt_cache_size32/1000           10000000  11.18 ms  1.118 ns   852.9 M
 -------------------------------------------------------------------------------
 +10.4% Atomic_readFull                   10000000  36.05 ms  3.605 ns   264.5 M
 + 619% ThrCache_readFull                 10000000  235.1 ms  23.51 ns   40.57 M
 SLOW   Sharded_readFull                   1981093       2 s   1.01 us   967.3 k
 *      ThrCache_readFast                 10000000  32.65 ms  3.265 ns   292.1 M
 +10.0% Sharded_readFast                  10000000  35.92 ms  3.592 ns   265.5 M
 -------------------------------------------------------------------------------
 +4.54% BM_mt_baseline_Atomic_readFull    10000000  8.672 ms  867.2 ps   1.074 G
 SLOW   BM_mt_baseline_ThrCache_readFull  10000000  996.9 ms  99.69 ns   9.567 M
 SLOW   BM_mt_baseline_Sharded_readFull   10000000  891.5 ms  89.15 ns    10.7 M
 *      BM_mt_baseline_ThrCache_readFast  10000000  8.295 ms  829.5 ps   1.123 G
 +12.7% BM_mt_baseline_Sharded_readFast   10000000  9.348 ms  934.8 ps    1020 M
 -------------------------------------------------------------------------------
*/