/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/ThreadCachedInt.h>

#include <atomic>
#include <thread>
#include <vector>

#include <glog/logging.h>

#include <folly/Benchmark.h>
#include <folly/Foreach.h>
#include <folly/Hash.h>
#include <folly/portability/GFlags.h>
#include <folly/portability/GTest.h>
#include <folly/portability/Unistd.h>

using namespace folly;
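
// ThreadCachedInt<T>(initialVal, cacheSize) batches increments in a
// per-thread cache and only folds them into the shared counter once a
// thread's cache reaches the cache size.  readFast() looks only at the
// shared counter (cheap, may lag behind); readFull() also sums the
// per-thread caches (exact, but slower).  The *AndReset() variants zero out
// the portion they read.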
TEST(ThreadCachedInt, SingleThreadedNotCached) {
  ThreadCachedInt<int64_t> val(0, 0);
  EXPECT_EQ(0, val.readFast());
  ++val;
  EXPECT_EQ(1, val.readFast());
  for (int i = 0; i < 41; ++i) {
    val.increment(1);
  }
  EXPECT_EQ(42, val.readFast());
  --val;
  EXPECT_EQ(41, val.readFast());
}

// Note: This is somewhat fragile to the implementation.  If this causes
// problems, feel free to remove it.
TEST(ThreadCachedInt, SingleThreadedCached) {
  ThreadCachedInt<int64_t> val(0, 10);
  EXPECT_EQ(0, val.readFast());
  ++val;
  EXPECT_EQ(0, val.readFast());
  for (int i = 0; i < 7; ++i) {
    val.increment(1);
  }
  EXPECT_EQ(0, val.readFast());
  EXPECT_EQ(0, val.readFastAndReset());
  EXPECT_EQ(8, val.readFull());
  EXPECT_EQ(8, val.readFullAndReset());
  EXPECT_EQ(0, val.readFull());
  EXPECT_EQ(0, val.readFast());
}

ThreadCachedInt<int32_t> globalInt32(0, 11);
ThreadCachedInt<int64_t> globalInt64(0, 11);
int kNumInserts = 100000;
DEFINE_int32(numThreads, 8, "Number of simultaneous threads for benchmarks.");
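
// CREATE_INC_FUNC(size) defines incFunc64() / incFunc32(); each bumps
// globalInt<size> kNumInserts / FLAGS_numThreads times, and the benchmarks
// below run one copy per thread.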
#define CREATE_INC_FUNC(size)                       \
  void incFunc ## size () {                         \
    const int num = kNumInserts / FLAGS_numThreads; \
    for (int i = 0; i < num; ++i) {                 \
      ++globalInt ## size ;                         \
    }                                               \
  }
CREATE_INC_FUNC(64);
CREATE_INC_FUNC(32);

// Confirms counts are accurate with competing threads
TEST(ThreadCachedInt, MultiThreadedCached) {
  kNumInserts = 100000;  // The benchmarks overwrite this global; reset it.
  CHECK_EQ(0, kNumInserts % FLAGS_numThreads) <<
    "FLAGS_numThreads must evenly divide kNumInserts (" << kNumInserts << ").";
  const int numPerThread = kNumInserts / FLAGS_numThreads;
  ThreadCachedInt<int64_t> TCInt64(0, numPerThread - 2);
  {
    std::atomic<bool> run(true);
    std::atomic<int> threadsDone(0);
    std::vector<std::thread> threads;
    for (int i = 0; i < FLAGS_numThreads; ++i) {
      threads.push_back(std::thread([&] {
        FOR_EACH_RANGE(k, 0, numPerThread) {
          ++TCInt64;
        }
        std::atomic_fetch_add(&threadsDone, 1);
        while (run.load()) { usleep(100); }
      }));
    }

    // We create and increment another ThreadCachedInt here to make sure it
    // doesn't interact with the other instances
    ThreadCachedInt<int64_t> otherTCInt64(0, 10);
    otherTCInt64.set(33);
    ++otherTCInt64;

    while (threadsDone.load() < FLAGS_numThreads) { usleep(100); }

    // Threads are done incrementing, but caches have not been flushed yet, so
    // we have to readFull.
    EXPECT_NE(kNumInserts, TCInt64.readFast());
    EXPECT_EQ(kNumInserts, TCInt64.readFull());

    run.store(false);
    for (auto& t : threads) {
      t.join();
    }
  }  // Caches are flushed when threads finish
  EXPECT_EQ(kNumInserts, TCInt64.readFast());
}
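
// BM_mt_cache_size<size>(iters, cacheSize) spreads iters increments of
// globalInt<size> across FLAGS_numThreads threads with the given cache size;
// the cacheSize=0 variants show the cost of flushing on every increment.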
#define MAKE_MT_CACHE_SIZE_BM(size)                             \
  void BM_mt_cache_size ## size (int iters, int cacheSize) {    \
    kNumInserts = iters;                                        \
    globalInt ## size.set(0);                                   \
    globalInt ## size.setCacheSize(cacheSize);                  \
    std::vector<std::thread> threads;                           \
    for (int i = 0; i < FLAGS_numThreads; ++i) {                \
      threads.push_back(std::thread(incFunc ## size));          \
    }                                                           \
    for (auto& t : threads) {                                   \
      t.join();                                                 \
    }                                                           \
  }
MAKE_MT_CACHE_SIZE_BM(64);
MAKE_MT_CACHE_SIZE_BM(32);

#define REG_BASELINE(name, inc_stmt)                          \
  BENCHMARK(FB_CONCATENATE(BM_mt_baseline_, name), iters) {   \
    const int iterPerThread = iters / FLAGS_numThreads;       \
    std::vector<std::thread> threads;                         \
    for (int i = 0; i < FLAGS_numThreads; ++i) {              \
      threads.push_back(std::thread([&]() {                   \
        for (int i = 0; i < iterPerThread; ++i) {             \
          inc_stmt;                                           \
        }                                                     \
      }));                                                    \
    }                                                         \
    for (auto& t : threads) {                                 \
      t.join();                                               \
    }                                                         \
  }
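
// Baseline counters for comparison: raw thread-local integers, folly
// ThreadLocal wrappers, and plain shared atomics.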
ThreadLocal<int64_t> globalTL64Baseline;
ThreadLocal<int32_t> globalTL32Baseline;
std::atomic<int64_t> globalInt64Baseline(0);
std::atomic<int32_t> globalInt32Baseline(0);
FOLLY_TLS int64_t global__thread64;
FOLLY_TLS int32_t global__thread32;

// Alternate lock-free implementation.  Achieves about the same performance,
// but uses about 20x more memory than ThreadCachedInt with 24 threads.
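// (2048 buckets of std::atomic<int64_t> is roughly 16KB per counter no matter
// how many threads touch it, versus one small per-thread cache for
// ThreadCachedInt.)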
struct ShardedAtomicInt {
  static const int64_t kBuckets_ = 2048;
  std::atomic<int64_t> ints_[kBuckets_];

  inline void inc(int64_t val = 1) {
    int bucket = hash::twang_mix64(
        uint64_t(pthread_self())) & (kBuckets_ - 1);
    std::atomic_fetch_add(&ints_[bucket], val);
  }

  // read the first few and extrapolate
  int64_t readFast() {
    int64_t ret = 0;
    static const int numToRead = 8;
    FOR_EACH_RANGE(i, 0, numToRead) {
      ret += ints_[i].load(std::memory_order_relaxed);
    }
    return ret * (kBuckets_ / numToRead);
  }

  // readFull is lock-free, but has to do thousands of loads...
  int64_t readFull() {
    int64_t ret = 0;
    for (auto& i : ints_) {
      // Fun fact - using memory_order_consume below reduces perf 30-40% in high
      // contention benchmarks.
      ret += i.load(std::memory_order_relaxed);
    }
    return ret;
  }
};

ShardedAtomicInt shd_int64;

REG_BASELINE(_thread64, global__thread64 += 1);
REG_BASELINE(_thread32, global__thread32 += 1);
REG_BASELINE(ThreadLocal64, *globalTL64Baseline += 1);
REG_BASELINE(ThreadLocal32, *globalTL32Baseline += 1);
REG_BASELINE(atomic_inc64,
             std::atomic_fetch_add(&globalInt64Baseline, int64_t(1)));
REG_BASELINE(atomic_inc32,
             std::atomic_fetch_add(&globalInt32Baseline, int32_t(1)));
REG_BASELINE(ShardedAtm64, shd_int64.inc());

BENCHMARK_PARAM(BM_mt_cache_size64, 0);
BENCHMARK_PARAM(BM_mt_cache_size64, 10);
BENCHMARK_PARAM(BM_mt_cache_size64, 100);
BENCHMARK_PARAM(BM_mt_cache_size64, 1000);
BENCHMARK_PARAM(BM_mt_cache_size32, 0);
BENCHMARK_PARAM(BM_mt_cache_size32, 10);
BENCHMARK_PARAM(BM_mt_cache_size32, 100);
BENCHMARK_PARAM(BM_mt_cache_size32, 1000);
BENCHMARK_DRAW_LINE();
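
// Single-threaded cost of each read strategy.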
BENCHMARK(Atomic_readFull) {
  doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed));
}
BENCHMARK(ThrCache_readFull) {
  doNotOptimizeAway(globalInt64.readFull());
}
BENCHMARK(Sharded_readFull) {
  doNotOptimizeAway(shd_int64.readFull());
}
BENCHMARK(ThrCache_readFast) {
  doNotOptimizeAway(globalInt64.readFast());
}
BENCHMARK(Sharded_readFast) {
  doNotOptimizeAway(shd_int64.readFast());
}
BENCHMARK_DRAW_LINE();
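
// The same reads, issued concurrently from FLAGS_numThreads threads via
// REG_BASELINE.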
REG_BASELINE(Atomic_readFull,
    doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed)));
REG_BASELINE(ThrCache_readFull, doNotOptimizeAway(globalInt64.readFull()));
REG_BASELINE(Sharded_readFull, doNotOptimizeAway(shd_int64.readFull()));
REG_BASELINE(ThrCache_readFast, doNotOptimizeAway(globalInt64.readFast()));
REG_BASELINE(Sharded_readFast, doNotOptimizeAway(shd_int64.readFast()));
BENCHMARK_DRAW_LINE();

int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  gflags::SetCommandLineOptionWithMode(
      "bm_min_usec", "10000", gflags::SET_FLAG_IF_DEFAULT);
  if (FLAGS_benchmark) {
    folly::runBenchmarks();
  }
  return RUN_ALL_TESTS();
}
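
// Example invocation (the binary name is illustrative; --benchmark comes from
// folly/Benchmark.h and --numThreads is defined above):
//   ./thread_cached_int_test --benchmark --numThreads=20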

/*
Ran with 20 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches

Benchmark                                     Iters   Total t    t/iter  iter/sec
----------------------------------------------------------------------------------
+ 103%  BM_mt_baseline__thread64           10000000  13.54 ms  1.354 ns   704.4 M
*       BM_mt_baseline__thread32           10000000  6.651 ms  665.1 ps     1.4 G
+50.3%  BM_mt_baseline_ThreadLocal64       10000000  9.994 ms  999.4 ps   954.2 M
+49.9%  BM_mt_baseline_ThreadLocal32       10000000  9.972 ms  997.2 ps   956.4 M
+2650%  BM_mt_baseline_atomic_inc64        10000000  182.9 ms  18.29 ns   52.13 M
+2665%  BM_mt_baseline_atomic_inc32        10000000  183.9 ms  18.39 ns   51.85 M
+75.3%  BM_mt_baseline_ShardedAtm64        10000000  11.66 ms  1.166 ns   817.8 M
+6670%  BM_mt_cache_size64/0               10000000  450.3 ms  45.03 ns   21.18 M
+1644%  BM_mt_cache_size64/10              10000000    116 ms   11.6 ns    82.2 M
+ 381%  BM_mt_cache_size64/100             10000000  32.04 ms  3.204 ns   297.7 M
+ 129%  BM_mt_cache_size64/1000            10000000  15.24 ms  1.524 ns   625.8 M
+6052%  BM_mt_cache_size32/0               10000000  409.2 ms  40.92 ns   23.31 M
+1304%  BM_mt_cache_size32/10              10000000  93.39 ms  9.339 ns   102.1 M
+ 298%  BM_mt_cache_size32/100             10000000  26.52 ms  2.651 ns   359.7 M
+68.1%  BM_mt_cache_size32/1000            10000000  11.18 ms  1.118 ns   852.9 M
----------------------------------------------------------------------------------
+10.4%  Atomic_readFull                    10000000  36.05 ms  3.605 ns   264.5 M
+ 619%  ThrCache_readFull                  10000000  235.1 ms  23.51 ns   40.57 M
SLOW    Sharded_readFull                    1981093       2 s   1.01 us   967.3 k
*       ThrCache_readFast                  10000000  32.65 ms  3.265 ns   292.1 M
+10.0%  Sharded_readFast                   10000000  35.92 ms  3.592 ns   265.5 M
----------------------------------------------------------------------------------
+4.54%  BM_mt_baseline_Atomic_readFull     10000000  8.672 ms  867.2 ps   1.074 G
SLOW    BM_mt_baseline_ThrCache_readFull   10000000  996.9 ms  99.69 ns   9.567 M
SLOW    BM_mt_baseline_Sharded_readFull    10000000  891.5 ms  89.15 ns    10.7 M
*       BM_mt_baseline_ThrCache_readFast   10000000  8.295 ms  829.5 ps   1.123 G
+12.7%  BM_mt_baseline_Sharded_readFast    10000000  9.348 ms  934.8 ps    1020 M
----------------------------------------------------------------------------------
*/