/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/ThreadCachedInt.h>

#include <atomic>
#include <thread>
#include <vector>

#include <glog/logging.h>

#include <folly/Benchmark.h>
#include <folly/Foreach.h>
#include <folly/Hash.h>
#include <folly/portability/GFlags.h>
#include <folly/portability/GTest.h>
#include <folly/portability/Unistd.h>

using namespace folly;
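
// ThreadCachedInt<T>(initialVal, cacheSize) batches increments in a
// per-thread cache and only folds them into the shared counter once a
// thread's cache reaches the cache size.  readFast() looks only at the
// shared counter (cheap, may lag behind); readFull() also sums the
// per-thread caches (exact, but slower).  The *AndReset() variants zero out
// the portion they read.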
TEST(ThreadCachedInt, SingleThreadedNotCached) {
  ThreadCachedInt<int64_t> val(0, 0);
  EXPECT_EQ(0, val.readFast());
  ++val;
  EXPECT_EQ(1, val.readFast());
  for (int i = 0; i < 41; ++i) {
    val.increment(1);
  }
  EXPECT_EQ(42, val.readFast());
  --val;
  EXPECT_EQ(41, val.readFast());
}

// Note: This is somewhat fragile to the implementation.  If this causes
// problems, feel free to remove it.
TEST(ThreadCachedInt, SingleThreadedCached) {
  ThreadCachedInt<int64_t> val(0, 10);
  EXPECT_EQ(0, val.readFast());
  ++val;
  EXPECT_EQ(0, val.readFast());
  for (int i = 0; i < 7; ++i) {
    val.increment(1);
  }
  EXPECT_EQ(0, val.readFast());
  EXPECT_EQ(0, val.readFastAndReset());
  EXPECT_EQ(8, val.readFull());
  EXPECT_EQ(8, val.readFullAndReset());
  EXPECT_EQ(0, val.readFull());
  EXPECT_EQ(0, val.readFast());
}

ThreadCachedInt<int32_t> globalInt32(0, 11);
ThreadCachedInt<int64_t> globalInt64(0, 11);
int kNumInserts = 100000;
DEFINE_int32(numThreads, 8, "Number of simultaneous threads for benchmarks.");
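
// CREATE_INC_FUNC(size) defines incFunc64() / incFunc32(); each bumps
// globalInt<size> kNumInserts / FLAGS_numThreads times, and the benchmarks
// below run one copy per thread.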
#define CREATE_INC_FUNC(size)                       \
  void incFunc ## size () {                         \
    const int num = kNumInserts / FLAGS_numThreads; \
    for (int i = 0; i < num; ++i) {                 \
      ++globalInt ## size ;                         \
    }                                               \
  }
CREATE_INC_FUNC(64);
CREATE_INC_FUNC(32);

// Confirms counts are accurate with competing threads
TEST(ThreadCachedInt, MultiThreadedCached) {
  kNumInserts = 100000;  // The benchmarks overwrite this global; reset it.
  CHECK_EQ(0, kNumInserts % FLAGS_numThreads) <<
    "FLAGS_numThreads must evenly divide kNumInserts (" << kNumInserts << ").";
  const int numPerThread = kNumInserts / FLAGS_numThreads;
  ThreadCachedInt<int64_t> TCInt64(0, numPerThread - 2);
  {
    std::atomic<bool> run(true);
    std::atomic<int> threadsDone(0);
    std::vector<std::thread> threads;
    for (int i = 0; i < FLAGS_numThreads; ++i) {
      threads.push_back(std::thread([&] {
        FOR_EACH_RANGE(k, 0, numPerThread) {
          ++TCInt64;
        }
        std::atomic_fetch_add(&threadsDone, 1);
        while (run.load()) { usleep(100); }
      }));
    }

    // We create and increment another ThreadCachedInt here to make sure it
    // doesn't interact with the other instances
    ThreadCachedInt<int64_t> otherTCInt64(0, 10);
    otherTCInt64.set(33);
    ++otherTCInt64;

    while (threadsDone.load() < FLAGS_numThreads) { usleep(100); }

    // Threads are done incrementing, but caches have not been flushed yet, so
    // we have to readFull.
    EXPECT_NE(kNumInserts, TCInt64.readFast());
    EXPECT_EQ(kNumInserts, TCInt64.readFull());

    run.store(false);
    for (auto& t : threads) {
      t.join();
    }
  }  // Caches are flushed when threads finish
  EXPECT_EQ(kNumInserts, TCInt64.readFast());
}
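
// BM_mt_cache_size<size>(iters, cacheSize) spreads iters increments of
// globalInt<size> across FLAGS_numThreads threads with the given cache size;
// the cacheSize=0 variants show the cost of flushing on every increment.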
#define MAKE_MT_CACHE_SIZE_BM(size)                             \
  void BM_mt_cache_size ## size (int iters, int cacheSize) {    \
    kNumInserts = iters;                                        \
    globalInt ## size.set(0);                                   \
    globalInt ## size.setCacheSize(cacheSize);                  \
    std::vector<std::thread> threads;                           \
    for (int i = 0; i < FLAGS_numThreads; ++i) {                \
      threads.push_back(std::thread(incFunc ## size));          \
    }                                                           \
    for (auto& t : threads) {                                   \
      t.join();                                                 \
    }                                                           \
  }
MAKE_MT_CACHE_SIZE_BM(64);
MAKE_MT_CACHE_SIZE_BM(32);

#define REG_BASELINE(name, inc_stmt)                          \
  BENCHMARK(FB_CONCATENATE(BM_mt_baseline_, name), iters) {   \
    const int iterPerThread = iters / FLAGS_numThreads;       \
    std::vector<std::thread> threads;                         \
    for (int i = 0; i < FLAGS_numThreads; ++i) {              \
      threads.push_back(std::thread([&]() {                   \
        for (int i = 0; i < iterPerThread; ++i) {             \
          inc_stmt;                                           \
        }                                                     \
      }));                                                    \
    }                                                         \
    for (auto& t : threads) {                                 \
      t.join();                                               \
    }                                                         \
  }
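
// Baseline counters for comparison: raw thread-local integers, folly
// ThreadLocal wrappers, and plain shared atomics.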
ThreadLocal<int64_t> globalTL64Baseline;
ThreadLocal<int32_t> globalTL32Baseline;
std::atomic<int64_t> globalInt64Baseline(0);
std::atomic<int32_t> globalInt32Baseline(0);
FOLLY_TLS int64_t global__thread64;
FOLLY_TLS int32_t global__thread32;

// Alternate lock-free implementation.  Achieves about the same performance,
// but uses about 20x more memory than ThreadCachedInt with 24 threads.
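// (2048 buckets of std::atomic<int64_t> is roughly 16KB per counter no matter
// how many threads touch it, versus one small per-thread cache for
// ThreadCachedInt.)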
struct ShardedAtomicInt {
  static const int64_t kBuckets_ = 2048;
  std::atomic<int64_t> ints_[kBuckets_];

  inline void inc(int64_t val = 1) {
    int bucket = hash::twang_mix64(
        uint64_t(pthread_self())) & (kBuckets_ - 1);
    std::atomic_fetch_add(&ints_[bucket], val);
  }

  // read the first few and extrapolate
  int64_t readFast() {
    int64_t ret = 0;
    static const int numToRead = 8;
    FOR_EACH_RANGE(i, 0, numToRead) {
      ret += ints_[i].load(std::memory_order_relaxed);
    }
    return ret * (kBuckets_ / numToRead);
  }

  // readFull is lock-free, but has to do thousands of loads...
  int64_t readFull() {
    int64_t ret = 0;
    for (auto& i : ints_) {
      // Fun fact - using memory_order_consume below reduces perf 30-40% in high
      // contention benchmarks.
      ret += i.load(std::memory_order_relaxed);
    }
    return ret;
  }
};

ShardedAtomicInt shd_int64;

REG_BASELINE(_thread64, global__thread64 += 1);
REG_BASELINE(_thread32, global__thread32 += 1);
REG_BASELINE(ThreadLocal64, *globalTL64Baseline += 1);
REG_BASELINE(ThreadLocal32, *globalTL32Baseline += 1);
REG_BASELINE(atomic_inc64,
             std::atomic_fetch_add(&globalInt64Baseline, int64_t(1)));
REG_BASELINE(atomic_inc32,
             std::atomic_fetch_add(&globalInt32Baseline, int32_t(1)));
REG_BASELINE(ShardedAtm64, shd_int64.inc());

BENCHMARK_PARAM(BM_mt_cache_size64, 0);
BENCHMARK_PARAM(BM_mt_cache_size64, 10);
BENCHMARK_PARAM(BM_mt_cache_size64, 100);
BENCHMARK_PARAM(BM_mt_cache_size64, 1000);
BENCHMARK_PARAM(BM_mt_cache_size32, 0);
BENCHMARK_PARAM(BM_mt_cache_size32, 10);
BENCHMARK_PARAM(BM_mt_cache_size32, 100);
BENCHMARK_PARAM(BM_mt_cache_size32, 1000);
BENCHMARK_DRAW_LINE();
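
// Single-threaded cost of each read strategy.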
BENCHMARK(Atomic_readFull) {
  doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed));
}
BENCHMARK(ThrCache_readFull) {
  doNotOptimizeAway(globalInt64.readFull());
}
BENCHMARK(Sharded_readFull) {
  doNotOptimizeAway(shd_int64.readFull());
}
BENCHMARK(ThrCache_readFast) {
  doNotOptimizeAway(globalInt64.readFast());
}
BENCHMARK(Sharded_readFast) {
  doNotOptimizeAway(shd_int64.readFast());
}
BENCHMARK_DRAW_LINE();
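
// The same reads, issued concurrently from FLAGS_numThreads threads via
// REG_BASELINE.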
REG_BASELINE(Atomic_readFull,
    doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed)));
REG_BASELINE(ThrCache_readFull, doNotOptimizeAway(globalInt64.readFull()));
REG_BASELINE(Sharded_readFull, doNotOptimizeAway(shd_int64.readFull()));
REG_BASELINE(ThrCache_readFast, doNotOptimizeAway(globalInt64.readFast()));
REG_BASELINE(Sharded_readFast, doNotOptimizeAway(shd_int64.readFast()));
BENCHMARK_DRAW_LINE();

int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  gflags::SetCommandLineOptionWithMode(
      "bm_min_usec", "10000", gflags::SET_FLAG_IF_DEFAULT);
  if (FLAGS_benchmark) {
    folly::runBenchmarks();
  }
  return RUN_ALL_TESTS();
}
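
// Example invocation (the binary name is illustrative; --benchmark comes from
// folly/Benchmark.h and --numThreads is defined above):
//   ./thread_cached_int_test --benchmark --numThreads=20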

/*
Ran with 20 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches

Benchmark                                     Iters   Total t    t/iter  iter/sec
----------------------------------------------------------------------------------
+ 103%  BM_mt_baseline__thread64           10000000  13.54 ms  1.354 ns   704.4 M
*       BM_mt_baseline__thread32           10000000  6.651 ms  665.1 ps     1.4 G
+50.3%  BM_mt_baseline_ThreadLocal64       10000000  9.994 ms  999.4 ps   954.2 M
+49.9%  BM_mt_baseline_ThreadLocal32       10000000  9.972 ms  997.2 ps   956.4 M
+2650%  BM_mt_baseline_atomic_inc64        10000000  182.9 ms  18.29 ns   52.13 M
+2665%  BM_mt_baseline_atomic_inc32        10000000  183.9 ms  18.39 ns   51.85 M
+75.3%  BM_mt_baseline_ShardedAtm64        10000000  11.66 ms  1.166 ns   817.8 M
+6670%  BM_mt_cache_size64/0               10000000  450.3 ms  45.03 ns   21.18 M
+1644%  BM_mt_cache_size64/10              10000000    116 ms   11.6 ns    82.2 M
+ 381%  BM_mt_cache_size64/100             10000000  32.04 ms  3.204 ns   297.7 M
+ 129%  BM_mt_cache_size64/1000            10000000  15.24 ms  1.524 ns   625.8 M
+6052%  BM_mt_cache_size32/0               10000000  409.2 ms  40.92 ns   23.31 M
+1304%  BM_mt_cache_size32/10              10000000  93.39 ms  9.339 ns   102.1 M
+ 298%  BM_mt_cache_size32/100             10000000  26.52 ms  2.651 ns   359.7 M
+68.1%  BM_mt_cache_size32/1000            10000000  11.18 ms  1.118 ns   852.9 M
----------------------------------------------------------------------------------
+10.4%  Atomic_readFull                    10000000  36.05 ms  3.605 ns   264.5 M
+ 619%  ThrCache_readFull                  10000000  235.1 ms  23.51 ns   40.57 M
SLOW    Sharded_readFull                    1981093       2 s   1.01 us   967.3 k
*       ThrCache_readFast                  10000000  32.65 ms  3.265 ns   292.1 M
+10.0%  Sharded_readFast                   10000000  35.92 ms  3.592 ns   265.5 M
----------------------------------------------------------------------------------
+4.54%  BM_mt_baseline_Atomic_readFull     10000000  8.672 ms  867.2 ps   1.074 G
SLOW    BM_mt_baseline_ThrCache_readFull   10000000  996.9 ms  99.69 ns   9.567 M
SLOW    BM_mt_baseline_Sharded_readFull    10000000  891.5 ms  89.15 ns    10.7 M
*       BM_mt_baseline_ThrCache_readFast   10000000  8.295 ms  829.5 ps   1.123 G
+12.7%  BM_mt_baseline_Sharded_readFast    10000000  9.348 ms  934.8 ps    1020 M
----------------------------------------------------------------------------------
*/