/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <folly/detail/CacheLocality.h>

#include <atomic>
#include <cassert>
#include <memory>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include <glog/logging.h>

#include <folly/Benchmark.h>

using namespace folly::detail;

#define DECLARE_SPREADER_TAG(tag, locality, func)      \
  namespace {                                          \
  template <typename dummy>                            \
  struct tag {};                                       \
  }                                                    \
  namespace folly {                                    \
  namespace detail {                                   \
  template <>                                          \
  const CacheLocality& CacheLocality::system<tag>() {  \
    static auto* inst = new CacheLocality(locality);   \
    return *inst;                                      \
  }                                                    \
  template <>                                          \
  Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
    return func;                                       \
  }                                                    \
  }                                                    \
  }

DECLARE_SPREADER_TAG(
    ThreadLocalTag,
    CacheLocality::system<>(),
    folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
DECLARE_SPREADER_TAG(PthreadSelfTag,
                     CacheLocality::system<>(),
                     folly::detail::FallbackGetcpu<HashingThreadId>::getcpu)
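
// A rough sketch (illustrative only, not folly's implementation) of the idea
// behind the PthreadSelfTag fallback wired up above: when neither the vdso
// getcpu nor usable thread-local state is available, derive a stable
// per-thread number and let it stand in for the cpu when picking a stripe.
// The name illustrativeThreadStripe is invented for this sketch.
inline size_t illustrativeThreadStripe(size_t numStripes) {
  // hash the thread's id into a stable pseudo-"cpu", then fold it onto a stripe
  auto h = std::hash<std::thread::id>()(std::this_thread::get_id());
  return numStripes == 0 ? 0 : h % numStripes;
}
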
BENCHMARK(AccessSpreaderUse, iters) {
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::current(16);
    folly::doNotOptimizeAway(x);
  }
}

// Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2 GHz. Surprisingly,
// if we don't separate the counters onto unique 128 byte stripes the
// 1_stripe and 2_stripe results are identical, even though the L3 is
// claimed to have 64 byte cache lines.
//
// Getcpu refers to the vdso getcpu implementation. ThreadLocal refers
// to execution using SequentialThreadId, the fallback if the vdso
// getcpu isn't available. PthreadSelf hashes the value returned from
// pthread_self() as a fallback-fallback for systems that don't have
// thread-local support.
//
// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
// so since stripe selection takes about 12 nanos, the atomic increment
// in the L1 costs about 17 nanos (~29ns total, consistent with the
// AccessSpreaderUse and local_incr_0_work baselines below). At width
// 8_stripe_0_work the line is expected to ping-pong on almost every
// operation, since the loops have the same duration. Widths 4 and 2
// have the same behavior, but each tour of the cache line is 4 and 8
// cores long, respectively. These all suggest a lower bound of about
// 60 nanos for intra-chip handoff and increment between threads.
//
// With 420 nanos of busywork per contended increment, the system can
// hide all of the latency of a tour of length 4, but not quite one of
// length 8. I was a bit surprised at how much worse the non-striped
// version got. It seems that the inter-chip traffic also interferes
// with the L1-only localWork.load(). When the local work is doubled
// to about 1 microsecond we see that the inter-chip contention is
// still very important, but subdivisions on the same chip don't matter.
//
// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
//     --benchmark --bm_min_iters=1000000
// ============================================================================
// folly/test/CacheLocalityTest.cpp relative time/iter iters/s
// ============================================================================
// AccessSpreaderUse 11.94ns 83.79M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_0_work) 985.75ns 1.01M
// contentionAtWidthGetcpu(2_stripe_0_work) 424.02ns 2.36M
// contentionAtWidthGetcpu(4_stripe_0_work) 190.13ns 5.26M
// contentionAtWidthGetcpu(8_stripe_0_work) 91.86ns 10.89M
// contentionAtWidthGetcpu(16_stripe_0_work) 29.31ns 34.12M
// contentionAtWidthGetcpu(32_stripe_0_work) 29.53ns 33.86M
// contentionAtWidthGetcpu(64_stripe_0_work) 29.93ns 33.41M
// contentionAtWidthThreadLocal(2_stripe_0_work) 609.21ns 1.64M
// contentionAtWidthThreadLocal(4_stripe_0_work) 303.60ns 3.29M
// contentionAtWidthThreadLocal(8_stripe_0_work) 246.57ns 4.06M
// contentionAtWidthThreadLocal(16_stripe_0_work) 154.84ns 6.46M
// contentionAtWidthThreadLocal(32_stripe_0_work) 24.14ns 41.43M
// contentionAtWidthThreadLocal(64_stripe_0_work) 23.95ns 41.75M
// contentionAtWidthPthreadSelf(2_stripe_0_work) 722.01ns 1.39M
// contentionAtWidthPthreadSelf(4_stripe_0_work) 501.56ns 1.99M
// contentionAtWidthPthreadSelf(8_stripe_0_work) 474.58ns 2.11M
// contentionAtWidthPthreadSelf(16_stripe_0_work) 300.90ns 3.32M
// contentionAtWidthPthreadSelf(32_stripe_0_work) 175.77ns 5.69M
// contentionAtWidthPthreadSelf(64_stripe_0_work) 174.88ns 5.72M
// atomicIncrBaseline(local_incr_0_work) 16.81ns 59.51M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_500_work) 1.82us 549.97K
// contentionAtWidthGetcpu(2_stripe_500_work) 533.71ns 1.87M
// contentionAtWidthGetcpu(4_stripe_500_work) 424.64ns 2.35M
// contentionAtWidthGetcpu(8_stripe_500_work) 451.85ns 2.21M
// contentionAtWidthGetcpu(16_stripe_500_work) 425.54ns 2.35M
// contentionAtWidthGetcpu(32_stripe_500_work) 501.66ns 1.99M
// atomicIncrBaseline(local_incr_500_work) 438.46ns 2.28M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_1000_work) 1.88us 532.20K
// contentionAtWidthGetcpu(2_stripe_1000_work) 824.62ns 1.21M
// contentionAtWidthGetcpu(4_stripe_1000_work) 803.56ns 1.24M
// contentionAtWidthGetcpu(8_stripe_1000_work) 926.65ns 1.08M
// contentionAtWidthGetcpu(16_stripe_1000_work) 900.10ns 1.11M
// contentionAtWidthGetcpu(32_stripe_1000_work) 890.75ns 1.12M
// atomicIncrBaseline(local_incr_1000_work) 774.47ns 1.29M
// ============================================================================
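
// A minimal sketch, separate from the benchmark below, of the usage pattern
// the numbers above are modeling: each thread picks a stripe with
// AccessSpreader and bumps a counter padded out to 128 bytes so neighbouring
// stripes never share (or prefetch-pair with) a cache line. PaddedCounter,
// kIllustrativeStripes and stripedIncrement are names invented for this
// sketch rather than folly APIs.
namespace {
struct alignas(128) PaddedCounter {
  std::atomic<size_t> value{0};
};
constexpr size_t kIllustrativeStripes = 16;
PaddedCounter illustrativeCounters[kIllustrativeStripes];

inline void stripedIncrement() {
  // current(n) maps the calling thread's cpu (or fallback id) into [0, n)
  auto idx = AccessSpreader<>::current(kIllustrativeStripes);
  illustrativeCounters[idx].value.fetch_add(1, std::memory_order_relaxed);
}
} // namespace
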
template <template <typename> class Tag>
static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
  const size_t counterAlignment = 128;
  const size_t numThreads = 32;

  folly::BenchmarkSuspender braces;

  std::atomic<size_t> ready(0);
  std::atomic<bool> go(false);

  // While in theory the cache line size is 64 bytes, experiments show
  // that we get contention on 128 byte boundaries for Ivy Bridge. The
  // extra indirection adds 1 or 2 nanos.
  assert(counterAlignment >= sizeof(std::atomic<size_t>));
  std::vector<char> raw(counterAlignment * stripes);

  // If we happen to be using the tlsRoundRobin, then sequentially
  // assigning the thread identifiers is the unlikely best-case scenario.
  // We don't want to unfairly benefit or penalize. Computing the exact
  // maximum likelihood of the probability distributions is annoying, so
  // I approximate as 2/5 of the ids that have no threads, 2/5 that have
  // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
  // wrapping back to slot 0 when we hit 1/15 and 1/5.

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&, iters, stripes, work]() {
      auto counters = std::vector<std::atomic<size_t>*>(stripes);
      for (size_t i = 0; i < stripes; ++i) {
        counters[i] =
            new (raw.data() + counterAlignment * i) std::atomic<size_t>();
      }

      ready++;
      while (!go.load()) {
        ;
      }
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
        for (size_t j = work; j > 0; --j) {
          localWork.load();
        }
      }
    }));

    if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
      // create a few dummy threads to wrap back around to 0 mod numCpus
      for (size_t i = threads.size(); i != numThreads; ++i) {
        std::thread([&]() { AccessSpreader<Tag>::current(stripes); }).join();
      }
    }
  }

  while (ready < numThreads) {
    ;
  }
  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}

static void atomicIncrBaseline(size_t iters,
                               size_t work,
                               size_t numThreads = 32) {
  folly::BenchmarkSuspender braces;

  std::atomic<bool> go(false);

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&]() {
      while (!go.load()) {
        ;
      }
      std::atomic<size_t> localCounter(0);
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        localCounter++;
        for (size_t j = work; j > 0; --j) {
          localWork.load();
        }
      }
    }));
  }

  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}

static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<std::atomic>(iters, stripes, work);
}

static void contentionAtWidthThreadLocal(size_t iters,
                                         size_t stripes,
                                         size_t work) {
  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
}

static void contentionAtWidthPthreadSelf(size_t iters,
                                         size_t stripes,
                                         size_t work) {
  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
}

BENCHMARK_DRAW_LINE()
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
BENCHMARK_DRAW_LINE()
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
BENCHMARK_DRAW_LINE()
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
  return 0;
}