2 * Copyright 2014 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/ThreadCachedInt.h>
18 #include <folly/Hash.h>
22 #include <gtest/gtest.h>
23 #include <gflags/gflags.h>
24 #include <glog/logging.h>
25 #include <folly/Benchmark.h>
27 using namespace folly;
29 TEST(ThreadCachedInt, SingleThreadedNotCached) {
30 ThreadCachedInt<int64_t> val(0, 0);
31 EXPECT_EQ(0, val.readFast());
33 EXPECT_EQ(1, val.readFast());
34 for (int i = 0; i < 41; ++i) {
37 EXPECT_EQ(42, val.readFast());
39 EXPECT_EQ(41, val.readFast());
42 // Note: This is somewhat fragile to the implementation. If this causes
43 // problems, feel free to remove it.
44 TEST(ThreadCachedInt, SingleThreadedCached) {
45 ThreadCachedInt<int64_t> val(0, 10);
46 EXPECT_EQ(0, val.readFast());
48 EXPECT_EQ(0, val.readFast());
49 for (int i = 0; i < 7; ++i) {
52 EXPECT_EQ(0, val.readFast());
53 EXPECT_EQ(0, val.readFastAndReset());
54 EXPECT_EQ(8, val.readFull());
55 EXPECT_EQ(8, val.readFullAndReset());
56 EXPECT_EQ(0, val.readFull());
57 EXPECT_EQ(0, val.readFast());
60 ThreadCachedInt<int32_t> globalInt32(0, 11);
61 ThreadCachedInt<int64_t> globalInt64(0, 11);
62 int kNumInserts = 100000;
63 DEFINE_int32(numThreads, 8, "Number simultaneous threads for benchmarks.");
64 #define CREATE_INC_FUNC(size) \
65 void incFunc ## size () { \
66 const int num = kNumInserts / FLAGS_numThreads; \
67 for (int i = 0; i < num; ++i) { \
68 ++globalInt ## size ; \
74 // Confirms counts are accurate with competing threads
75 TEST(ThreadCachedInt, MultiThreadedCached) {
77 CHECK_EQ(0, kNumInserts % FLAGS_numThreads) <<
78 "FLAGS_numThreads must evenly divide kNumInserts (" << kNumInserts << ").";
79 const int numPerThread = kNumInserts / FLAGS_numThreads;
80 ThreadCachedInt<int64_t> TCInt64(0, numPerThread - 2);
82 std::atomic<bool> run(true);
83 std::atomic<int> threadsDone(0);
84 std::vector<std::thread> threads;
85 for (int i = 0; i < FLAGS_numThreads; ++i) {
86 threads.push_back(std::thread([&] {
87 FOR_EACH_RANGE(k, 0, numPerThread) {
90 std::atomic_fetch_add(&threadsDone, 1);
91 while (run.load()) { usleep(100); }
95 // We create and increment another ThreadCachedInt here to make sure it
96 // doesn't interact with the other instances
97 ThreadCachedInt<int64_t> otherTCInt64(0, 10);
101 while (threadsDone.load() < FLAGS_numThreads) { usleep(100); }
105 // Threads are done incrementing, but caches have not been flushed yet, so
106 // we have to readFull.
107 EXPECT_NE(kNumInserts, TCInt64.readFast());
108 EXPECT_EQ(kNumInserts, TCInt64.readFull());
111 for (auto& t : threads) {
115 } // Caches are flushed when threads finish
116 EXPECT_EQ(kNumInserts, TCInt64.readFast());
119 #define MAKE_MT_CACHE_SIZE_BM(size) \
120 void BM_mt_cache_size ## size (int iters, int cacheSize) { \
121 kNumInserts = iters; \
122 globalInt ## size.set(0); \
123 globalInt ## size.setCacheSize(cacheSize); \
124 std::vector<std::thread> threads; \
125 for (int i = 0; i < FLAGS_numThreads; ++i) { \
126 threads.push_back(std::thread(incFunc ## size)); \
128 for (auto& t : threads) { \
132 MAKE_MT_CACHE_SIZE_BM(64);
133 MAKE_MT_CACHE_SIZE_BM(32);
135 #define REG_BASELINE(name, inc_stmt) \
136 BENCHMARK(FB_CONCATENATE(BM_mt_baseline_, name), iters) { \
137 const int iterPerThread = iters / FLAGS_numThreads; \
138 std::vector<std::thread> threads; \
139 for (int i = 0; i < FLAGS_numThreads; ++i) { \
140 threads.push_back(std::thread([&]() { \
141 for (int i = 0; i < iterPerThread; ++i) { \
146 for (auto& t : threads) { \
151 ThreadLocal<int64_t> globalTL64Baseline;
152 ThreadLocal<int32_t> globalTL32Baseline;
153 std::atomic<int64_t> globalInt64Baseline(0);
154 std::atomic<int32_t> globalInt32Baseline(0);
155 FOLLY_TLS int64_t global__thread64;
156 FOLLY_TLS int32_t global__thread32;
158 // Alternate lock-free implementation. Achieves about the same performance,
159 // but uses about 20x more memory than ThreadCachedInt with 24 threads.
160 struct ShardedAtomicInt {
161 static const int64_t kBuckets_ = 2048;
162 std::atomic<int64_t> ints_[kBuckets_];
164 inline void inc(int64_t val = 1) {
165 int bucket = hash::twang_mix64(
166 uint64_t(pthread_self())) & (kBuckets_ - 1);
167 std::atomic_fetch_add(&ints_[bucket], val);
170 // read the first few and extrapolate
173 static const int numToRead = 8;
174 FOR_EACH_RANGE(i, 0, numToRead) {
175 ret += ints_[i].load(std::memory_order_relaxed);
177 return ret * (kBuckets_ / numToRead);
180 // readFull is lock-free, but has to do thousands of loads...
183 for (auto& i : ints_) {
184 // Fun fact - using memory_order_consume below reduces perf 30-40% in high
185 // contention benchmarks.
186 ret += i.load(std::memory_order_relaxed);
191 ShardedAtomicInt shd_int64;
193 REG_BASELINE(_thread64, global__thread64 += 1);
194 REG_BASELINE(_thread32, global__thread32 += 1);
195 REG_BASELINE(ThreadLocal64, *globalTL64Baseline += 1);
196 REG_BASELINE(ThreadLocal32, *globalTL32Baseline += 1);
197 REG_BASELINE(atomic_inc64,
198 std::atomic_fetch_add(&globalInt64Baseline, int64_t(1)));
199 REG_BASELINE(atomic_inc32,
200 std::atomic_fetch_add(&globalInt32Baseline, int32_t(1)));
201 REG_BASELINE(ShardedAtm64, shd_int64.inc());
203 BENCHMARK_PARAM(BM_mt_cache_size64, 0);
204 BENCHMARK_PARAM(BM_mt_cache_size64, 10);
205 BENCHMARK_PARAM(BM_mt_cache_size64, 100);
206 BENCHMARK_PARAM(BM_mt_cache_size64, 1000);
207 BENCHMARK_PARAM(BM_mt_cache_size32, 0);
208 BENCHMARK_PARAM(BM_mt_cache_size32, 10);
209 BENCHMARK_PARAM(BM_mt_cache_size32, 100);
210 BENCHMARK_PARAM(BM_mt_cache_size32, 1000);
211 BENCHMARK_DRAW_LINE();
214 BENCHMARK(Atomic_readFull) {
215 doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed));
217 BENCHMARK(ThrCache_readFull) {
218 doNotOptimizeAway(globalInt64.readFull());
220 BENCHMARK(Sharded_readFull) {
221 doNotOptimizeAway(shd_int64.readFull());
223 BENCHMARK(ThrCache_readFast) {
224 doNotOptimizeAway(globalInt64.readFast());
226 BENCHMARK(Sharded_readFast) {
227 doNotOptimizeAway(shd_int64.readFast());
229 BENCHMARK_DRAW_LINE();
232 REG_BASELINE(Atomic_readFull,
233 doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed)));
234 REG_BASELINE(ThrCache_readFull, doNotOptimizeAway(globalInt64.readFull()));
235 REG_BASELINE(Sharded_readFull, doNotOptimizeAway(shd_int64.readFull()));
236 REG_BASELINE(ThrCache_readFast, doNotOptimizeAway(globalInt64.readFast()));
237 REG_BASELINE(Sharded_readFast, doNotOptimizeAway(shd_int64.readFast()));
238 BENCHMARK_DRAW_LINE();
240 int main(int argc, char** argv) {
241 testing::InitGoogleTest(&argc, argv);
242 gflags::ParseCommandLineFlags(&argc, &argv, true);
243 gflags::SetCommandLineOptionWithMode(
244 "bm_min_usec", "10000", gflags::SET_FLAG_IF_DEFAULT
246 if (FLAGS_benchmark) {
247 folly::runBenchmarks();
249 return RUN_ALL_TESTS();
253 Ran with 20 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches
255 Benchmark Iters Total t t/iter iter/sec
256 ------------------------------------------------------------------------------
257 + 103% BM_mt_baseline__thread64 10000000 13.54 ms 1.354 ns 704.4 M
258 * BM_mt_baseline__thread32 10000000 6.651 ms 665.1 ps 1.4 G
259 +50.3% BM_mt_baseline_ThreadLocal64 10000000 9.994 ms 999.4 ps 954.2 M
260 +49.9% BM_mt_baseline_ThreadLocal32 10000000 9.972 ms 997.2 ps 956.4 M
261 +2650% BM_mt_baseline_atomic_inc64 10000000 182.9 ms 18.29 ns 52.13 M
262 +2665% BM_mt_baseline_atomic_inc32 10000000 183.9 ms 18.39 ns 51.85 M
263 +75.3% BM_mt_baseline_ShardedAtm64 10000000 11.66 ms 1.166 ns 817.8 M
264 +6670% BM_mt_cache_size64/0 10000000 450.3 ms 45.03 ns 21.18 M
265 +1644% BM_mt_cache_size64/10 10000000 116 ms 11.6 ns 82.2 M
266 + 381% BM_mt_cache_size64/100 10000000 32.04 ms 3.204 ns 297.7 M
267 + 129% BM_mt_cache_size64/1000 10000000 15.24 ms 1.524 ns 625.8 M
268 +6052% BM_mt_cache_size32/0 10000000 409.2 ms 40.92 ns 23.31 M
269 +1304% BM_mt_cache_size32/10 10000000 93.39 ms 9.339 ns 102.1 M
270 + 298% BM_mt_cache_size32/100 10000000 26.52 ms 2.651 ns 359.7 M
271 +68.1% BM_mt_cache_size32/1000 10000000 11.18 ms 1.118 ns 852.9 M
272 ------------------------------------------------------------------------------
273 +10.4% Atomic_readFull 10000000 36.05 ms 3.605 ns 264.5 M
274 + 619% ThrCache_readFull 10000000 235.1 ms 23.51 ns 40.57 M
275 SLOW Sharded_readFull 1981093 2 s 1.01 us 967.3 k
276 * ThrCache_readFast 10000000 32.65 ms 3.265 ns 292.1 M
277 +10.0% Sharded_readFast 10000000 35.92 ms 3.592 ns 265.5 M
278 ------------------------------------------------------------------------------
279 +4.54% BM_mt_baseline_Atomic_readFull 10000000 8.672 ms 867.2 ps 1.074 G
280 SLOW BM_mt_baseline_ThrCache_readFull 10000000 996.9 ms 99.69 ns 9.567 M
281 SLOW BM_mt_baseline_Sharded_readFull 10000000 891.5 ms 89.15 ns 10.7 M
282 * BM_mt_baseline_ThrCache_readFast 10000000 8.295 ms 829.5 ps 1.123 G
283 +12.7% BM_mt_baseline_Sharded_readFast 10000000 9.348 ms 934.8 ps 1020 M
284 ------------------------------------------------------------------------------