Portability.h \
portability/Constexpr.h \
portability/Environment.h \
+ portability/GFlags.h \
portability/Syscall.h \
portability/SysUio.h \
Preprocessor.h \
--- /dev/null
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/io/IOBuf.h>
+#include <folly/Benchmark.h>
+#include <folly/Format.h>
+#include <folly/Range.h>
+#include <folly/io/Cursor.h>
+#include <folly/io/Cursor-defs.h>
+using folly::ByteRange;
+using folly::format;
+using folly::IOBuf;
+using folly::StringPiece;
+using std::unique_ptr;
+using namespace folly::io;
+int benchmark_size = 1000; // Bytes written/read per benchmark pass; also sizes the buffers built in main().
+unique_ptr<IOBuf> iobuf_benchmark; // Target buffer for the write-cursor benchmarks; allocated in main().
+unique_ptr<IOBuf> iobuf_read_benchmark; // Chain of one-byte buffers walked by the Cursor benchmarks; built in main().
+template <class CursClass>
+void runBenchmark() { // Writes benchmark_size zero bytes through a CursClass cursor over iobuf_benchmark.
+ CursClass c(iobuf_benchmark.get());
+ for (int i = 0; i < benchmark_size; i++) {
+ c.write((uint8_t)0);
+ }
+BENCHMARK(rwPrivateCursorBenchmark, iters) { // Runs runBenchmark with RWPrivateCursor once per requested iteration.
+ while (iters--) {
+ runBenchmark<RWPrivateCursor>();
+ }
+BENCHMARK(rwUnshareCursorBenchmark, iters) { // Runs runBenchmark with RWUnshareCursor once per requested iteration.
+ while (iters--) {
+ runBenchmark<RWUnshareCursor>();
+ }
+BENCHMARK(cursorBenchmark, iters) { // Reads benchmark_size single bytes through a Cursor, once per iteration.
+ while (iters--) {
+ Cursor c(iobuf_read_benchmark.get()); // new cursor over iobuf_read_benchmark for every iteration
+ for (int i = 0; i < benchmark_size; i++) {
+ c.read<uint8_t>();
+ }
+ }
+BENCHMARK(skipBenchmark, iters) { // Calls peek() then skip(1) benchmark_size times on a Cursor, once per iteration.
+ while (iters--) {
+ Cursor c(iobuf_read_benchmark.get()); // new cursor over iobuf_read_benchmark for every iteration
+ for (int i = 0; i < benchmark_size; i++) {
+ c.peek();
+ c.skip(1);
+ }
+ }
+// fbmake opt
+// _bin/folly/experimental/io/test/iobuf_cursor_test -benchmark
+// Benchmark Iters Total t t/iter iter/sec
+// ---------------------------------------------------------------------------
+// rwPrivateCursorBenchmark 100000 142.9 ms 1.429 us 683.5 k
+// rwUnshareCursorBenchmark 100000 309.3 ms 3.093 us 315.7 k
+// cursorBenchmark 100000 741.4 ms 7.414 us 131.7 k
+// skipBenchmark 100000 738.9 ms 7.389 us 132.2 k
+// uname -a:
+// Linux dev2159.snc6.facebook.com 2.6.33-7_fbk15_104e4d0 #1 SMP
+// Tue Oct 19 22:40:30 PDT 2010 x86_64 x86_64 x86_64 GNU/Linux
+// 72GB RAM, 2 CPUs (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz)
+// hyperthreading disabled
+int main(int argc, char** argv) { // Parses flags, builds the two shared buffers, then runs the registered benchmarks.
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ iobuf_benchmark = IOBuf::create(benchmark_size);
+ iobuf_benchmark->append(benchmark_size); // presumably extends the data length to benchmark_size so the writes fit — see IOBuf::append
+ iobuf_read_benchmark = IOBuf::create(1);
+ for (int i = 0; i < benchmark_size; i++) { // chain benchmark_size one-byte buffers onto the read buffer
+ unique_ptr<IOBuf> iobuf2(IOBuf::create(1));
+ iobuf2->append(1);
+ iobuf_read_benchmark->prependChain(std::move(iobuf2));
+ }
+ folly::runBenchmarks();
+ return 0;
#include <folly/io/IOBuf.h>
-#include <gflags/gflags.h>
-#include <boost/random.hpp>
-#include <gtest/gtest.h>
-#include <folly/Benchmark.h>
#include <folly/Format.h>
#include <folly/Range.h>
#include <folly/io/Cursor.h>
#include <folly/io/Cursor-defs.h>
+#include <gtest/gtest.h>
using folly::ByteRange;
using folly::format;
EXPECT_STREQ("hello", curs.readFixedString(5).c_str());
-int benchmark_size = 1000;
-unique_ptr<IOBuf> iobuf_benchmark;
-unique_ptr<IOBuf> iobuf_read_benchmark;
-template <class CursClass>
-void runBenchmark() {
- CursClass c(iobuf_benchmark.get());
- for(int i = 0; i < benchmark_size; i++) {
- c.write((uint8_t)0);
- }
-BENCHMARK(rwPrivateCursorBenchmark, iters) {
- while (iters--) {
- runBenchmark<RWPrivateCursor>();
- }
-BENCHMARK(rwUnshareCursorBenchmark, iters) {
- while (iters--) {
- runBenchmark<RWUnshareCursor>();
- }
-BENCHMARK(cursorBenchmark, iters) {
- while (iters--) {
- Cursor c(iobuf_read_benchmark.get());
- for(int i = 0; i < benchmark_size ; i++) {
- c.read<uint8_t>();
- }
- }
-BENCHMARK(skipBenchmark, iters) {
- while (iters--) {
- Cursor c(iobuf_read_benchmark.get());
- for(int i = 0; i < benchmark_size ; i++) {
- c.peek();
- c.skip(1);
- }
- }
-// fbmake opt
-// _bin/folly/experimental/io/test/iobuf_cursor_test -benchmark
-// Benchmark Iters Total t t/iter iter/sec
-// ---------------------------------------------------------------------------
-// rwPrivateCursorBenchmark 100000 142.9 ms 1.429 us 683.5 k
-// rwUnshareCursorBenchmark 100000 309.3 ms 3.093 us 315.7 k
-// cursorBenchmark 100000 741.4 ms 7.414 us 131.7 k
-// skipBenchmark 100000 738.9 ms 7.389 us 132.2 k
-// uname -a:
-// Linux dev2159.snc6.facebook.com 2.6.33-7_fbk15_104e4d0 #1 SMP
-// Tue Oct 19 22:40:30 PDT 2010 x86_64 x86_64 x86_64 GNU/Linux
-// 72GB RAM, 2 CPUs (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz)
-// hyperthreading disabled
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- gflags::ParseCommandLineFlags(&argc, &argv, true);
- auto ret = RUN_ALL_TESTS();
- if (ret == 0 && FLAGS_benchmark) {
- iobuf_benchmark = IOBuf::create(benchmark_size);
- iobuf_benchmark->append(benchmark_size);
- iobuf_read_benchmark = IOBuf::create(1);
- for (int i = 0; i < benchmark_size; i++) {
- unique_ptr<IOBuf> iobuf2(IOBuf::create(1));
- iobuf2->append(1);
- iobuf_read_benchmark->prependChain(std::move(iobuf2));
- }
- folly::runBenchmarks();
- }
- return ret;
--- /dev/null
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef FOLLY_GFLAGS_H_
+#define FOLLY_GFLAGS_H_
+#include <folly/Portability.h>
+#include <gflags/gflags.h>
+#define DEFINE_int32(_name, _default, _description) int FLAGS_##_name = _default // Defines a plain int FLAGS_<name> set to _default; _description is unused. NOTE(review): presumably a fallback for builds without gflags — confirm the surrounding guard.
--- /dev/null
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// @author Tudor Bosman (tudorb@fb.com)
+#include <folly/Bits.h>
+#include <folly/Benchmark.h>
+using namespace folly;
+BENCHMARK(nextPowTwoClz, iters) { // Calls folly::nextPowTwo iters times, handing each result to doNotOptimizeAway.
+ for (unsigned long i = 0; i < iters; ++i) {
+ auto x = folly::nextPowTwo(iters); // NOTE(review): the argument is iters, not i, so every pass uses the same input — confirm this is intended
+ folly::doNotOptimizeAway(x);
+ }
+BENCHMARK(isPowTwo, iters) { // Calls folly::isPowTwo on each value in [0, iters), handing each result to doNotOptimizeAway.
+ bool b;
+ for (unsigned long i = 0; i < iters; ++i) {
+ b = folly::isPowTwo(i);
+ folly::doNotOptimizeAway(b);
+ }
+int main(int argc, char** argv) { // Benchmark-only entry point: parses flags and runs the registered benchmarks.
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ folly::runBenchmarks();
+ return 0;
+Benchmarks run on dual Xeon X5650's @ 2.67GHz w/hyperthreading enabled
+ (12 physical cores, 12 MB cache, 72 GB RAM)
+Benchmark Iters Total t t/iter iter/sec
+* nextPowTwoClz 1000000 1.659 ms 1.659 ns 574.8 M
// @author Tudor Bosman (tudorb@fb.com)
-#include <gflags/gflags.h>
#include <folly/Bits.h>
-#include <folly/Benchmark.h>
#include <gtest/gtest.h>
using namespace folly;
-BENCHMARK(nextPowTwoClz, iters) {
- for (unsigned long i = 0; i < iters; ++i) {
- auto x = folly::nextPowTwo(iters);
- folly::doNotOptimizeAway(x);
- }
TEST(Bits, isPowTwo) {
EXPECT_FALSE(isPowTwo((1ull<<63) + 1));
-BENCHMARK(isPowTwo, iters) {
- bool b;
- for (unsigned long i = 0; i < iters; ++i) {
- b = folly::isPowTwo(i);
- folly::doNotOptimizeAway(b);
- }
TEST(Bits, popcount) {
EXPECT_EQ(0, popcount(0U));
EXPECT_EQ(1, popcount(1U));
EXPECT_EQ(32, popcount(uint32_t(-1)));
EXPECT_EQ(64, popcount(uint64_t(-1)));
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- gflags::ParseCommandLineFlags(&argc, &argv, true);
- auto ret = RUN_ALL_TESTS();
- if (!ret && FLAGS_benchmark) {
- folly::runBenchmarks();
- }
- return ret;
-Benchmarks run on dual Xeon X5650's @ 2.67GHz w/hyperthreading enabled
- (12 physical cores, 12 MB cache, 72 GB RAM)
-Benchmark Iters Total t t/iter iter/sec
-* nextPowTwoClz 1000000 1.659 ms 1.659 ns 574.8 M
--- /dev/null
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/detail/CacheLocality.h>
+#include <sched.h>
+#include <memory>
+#include <thread>
+#include <type_traits>
+#include <unordered_map>
+#include <glog/logging.h>
+#include <folly/Benchmark.h>
+using namespace folly::detail;
+#define DECLARE_SPREADER_TAG(tag, locality, func) /* defines tag template `tag` plus the two specializations below */ \
+ namespace { \
+ template <typename dummy> \
+ struct tag {}; \
+ } \
+ namespace folly { \
+ namespace detail { \
+ template <> \
+ const CacheLocality& CacheLocality::system<tag>() { /* returns the CacheLocality built from `locality` */ \
+ static auto* inst = new CacheLocality(locality); /* built on first call; never deleted here */ \
+ return *inst; \
+ } \
+ template <> \
+ Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { /* returns `func` as the getcpu function for this tag */ \
+ return func; \
+ } \
+ } \
+ }
+ ThreadLocalTag,
+ CacheLocality::system<>(),
+ folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
+ CacheLocality::system<>(),
+ folly::detail::FallbackGetcpu<HashingThreadId>::getcpu)
+BENCHMARK(AccessSpreaderUse, iters) { // Calls AccessSpreader<>::current(16) iters times, handing each result to doNotOptimizeAway.
+ for (unsigned long i = 0; i < iters; ++i) {
+ auto x = AccessSpreader<>::current(16);
+ folly::doNotOptimizeAway(x);
+ }
+// Benchmark scores here reflect the time for 32 threads to perform an
+// atomic increment on a dual-socket E5-2660 @ 2.2GHz. Surprisingly,
+// if we don't separate the counters onto unique 128 byte stripes the
+// 1_stripe and 2_stripe results are identical, even though the L3 is
+// claimed to have 64 byte cache lines.
+// Getcpu refers to the vdso getcpu implementation. ThreadLocal refers
+// to execution using SequentialThreadId, the fallback if the vdso
+// getcpu isn't available. PthreadSelf hashes the value returned from
+// pthread_self() as a fallback-fallback for systems that don't have
+// thread-local support.
+// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
+// so since the stripe selection is 12 nanos the atomic increment in
+// the L1 is ~17 nanos. At width 8_stripe_0_work the line is expected
+// to ping-pong almost every operation, since the loops have the same
+// duration. Widths 4 and 2 have the same behavior, but each tour of the
+// cache line is 4 and 8 cores long, respectively. These all suggest a
+// lower bound of 60 nanos for intra-chip handoff and increment between
+// the L1s.
+// With 420 nanos of busywork per contended increment, the system can
+// hide all of the latency of a tour of length 4, but not quite one of
+// length 8. I was a bit surprised at how much worse the non-striped
+// version got. It seems that the inter-chip traffic also interferes
+// with the L1-only localWork.load(). When the local work is doubled
+// to about 1 microsecond we see that the inter-chip contention is still
+// very important, but subdivisions on the same chip don't matter.
+// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
+// --benchmark --bm_min_iters=1000000
+// ============================================================================
+// folly/test/CacheLocalityTest.cpp relative time/iter iters/s
+// ============================================================================
+// AccessSpreaderUse 11.94ns 83.79M
+// ----------------------------------------------------------------------------
+// contentionAtWidthGetcpu(1_stripe_0_work) 985.75ns 1.01M
+// contentionAtWidthGetcpu(2_stripe_0_work) 424.02ns 2.36M
+// contentionAtWidthGetcpu(4_stripe_0_work) 190.13ns 5.26M
+// contentionAtWidthGetcpu(8_stripe_0_work) 91.86ns 10.89M
+// contentionAtWidthGetcpu(16_stripe_0_work) 29.31ns 34.12M
+// contentionAtWidthGetcpu(32_stripe_0_work) 29.53ns 33.86M
+// contentionAtWidthGetcpu(64_stripe_0_work) 29.93ns 33.41M
+// contentionAtWidthThreadLocal(2_stripe_0_work) 609.21ns 1.64M
+// contentionAtWidthThreadLocal(4_stripe_0_work) 303.60ns 3.29M
+// contentionAtWidthThreadLocal(8_stripe_0_work) 246.57ns 4.06M
+// contentionAtWidthThreadLocal(16_stripe_0_work) 154.84ns 6.46M
+// contentionAtWidthThreadLocal(32_stripe_0_work) 24.14ns 41.43M
+// contentionAtWidthThreadLocal(64_stripe_0_work) 23.95ns 41.75M
+// contentionAtWidthPthreadSelf(2_stripe_0_work) 722.01ns 1.39M
+// contentionAtWidthPthreadSelf(4_stripe_0_work) 501.56ns 1.99M
+// contentionAtWidthPthreadSelf(8_stripe_0_work) 474.58ns 2.11M
+// contentionAtWidthPthreadSelf(16_stripe_0_work) 300.90ns 3.32M
+// contentionAtWidthPthreadSelf(32_stripe_0_work) 175.77ns 5.69M
+// contentionAtWidthPthreadSelf(64_stripe_0_work) 174.88ns 5.72M
+// atomicIncrBaseline(local_incr_0_work) 16.81ns 59.51M
+// ----------------------------------------------------------------------------
+// contentionAtWidthGetcpu(1_stripe_500_work) 1.82us 549.97K
+// contentionAtWidthGetcpu(2_stripe_500_work) 533.71ns 1.87M
+// contentionAtWidthGetcpu(4_stripe_500_work) 424.64ns 2.35M
+// contentionAtWidthGetcpu(8_stripe_500_work) 451.85ns 2.21M
+// contentionAtWidthGetcpu(16_stripe_500_work) 425.54ns 2.35M
+// contentionAtWidthGetcpu(32_stripe_500_work) 501.66ns 1.99M
+// atomicIncrBaseline(local_incr_500_work) 438.46ns 2.28M
+// ----------------------------------------------------------------------------
+// contentionAtWidthGetcpu(1_stripe_1000_work) 1.88us 532.20K
+// contentionAtWidthGetcpu(2_stripe_1000_work) 824.62ns 1.21M
+// contentionAtWidthGetcpu(4_stripe_1000_work) 803.56ns 1.24M
+// contentionAtWidthGetcpu(8_stripe_1000_work) 926.65ns 1.08M
+// contentionAtWidthGetcpu(16_stripe_1000_work) 900.10ns 1.11M
+// contentionAtWidthGetcpu(32_stripe_1000_work) 890.75ns 1.12M
+// atomicIncrBaseline(local_incr_1000_work) 774.47ns 1.29M
+// ============================================================================
+// Benchmark body: numThreads threads each perform `iters` atomic increments
+// on one of `stripes` counters, picked per increment by
+// AccessSpreader<Tag>::current(stripes), followed by `work` loads of a
+// thread-local atomic. Threads and counters are set up before
+// braces.dismiss(); the workers are released together via `go`.
+template <template <typename> class Tag>
+static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
+ const size_t counterAlignment = 128;
+ const size_t numThreads = 32;
+ folly::BenchmarkSuspender braces;
+ std::atomic<size_t> ready(0);
+ std::atomic<bool> go(false);
+ // while in theory the cache line size is 64 bytes, experiments show
+ // that we get contention on 128 byte boundaries for Ivy Bridge. The
+ // extra indirection adds 1 or 2 nanos
+ assert(counterAlignment >= sizeof(std::atomic<size_t>));
+ std::vector<char> raw(counterAlignment * stripes);
+ // if we happen to be using the tlsRoundRobin, then sequentially
+ // assigning the thread identifiers is the unlikely best-case scenario.
+ // We don't want to unfairly benefit or penalize. Computing the exact
+ // maximum likelihood of the probability distributions is annoying, so
+ // I approximate as 2/5 of the ids that have no threads, 2/5 that have
+ // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
+ // wrapping back to slot 0 when we hit 1/15 and 1/5.
+ std::vector<std::thread> threads;
+ while (threads.size() < numThreads) {
+ threads.push_back(std::thread([&, iters, stripes, work]() {
+ // std::vector instead of a runtime-sized array: variable-length
+ // arrays are a compiler extension, not standard C++. It is built
+ // before `go` is set, so it is outside the measured loop.
+ std::vector<std::atomic<size_t>*> counters(stripes);
+ for (size_t i = 0; i < stripes; ++i) {
+ counters[i] =
+ new (raw.data() + counterAlignment * i) std::atomic<size_t>();
+ }
+ ready++;
+ while (!go.load()) {
+ sched_yield();
+ }
+ std::atomic<int> localWork(0);
+ for (size_t i = iters; i > 0; --i) {
+ ++*(counters[AccessSpreader<Tag>::current(stripes)]);
+ for (size_t j = work; j > 0; --j) {
+ localWork.load();
+ }
+ }
+ }));
+ if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
+ // create a few dummy threads to wrap back around to 0 mod numCpus
+ for (size_t i = threads.size(); i != numThreads; ++i) {
+ std::thread([&]() { AccessSpreader<Tag>::current(stripes); }).join();
+ }
+ }
+ }
+ // wait until every worker has finished setting up its counters
+ while (ready < numThreads) {
+ sched_yield();
+ }
+ braces.dismiss();
+ go = true;
+ for (auto& thr : threads) {
+ thr.join();
+ }
+static void atomicIncrBaseline(size_t iters, // increments performed by each thread
+ size_t work, // loads of a thread-local atomic after each increment
+ size_t numThreads = 32) { // Baseline: each thread increments its own local counter instead of a shared one.
+ folly::BenchmarkSuspender braces;
+ std::atomic<bool> go(false);
+ std::vector<std::thread> threads;
+ while (threads.size() < numThreads) {
+ threads.push_back(std::thread([&]() {
+ while (!go.load()) { // wait until the main thread sets go
+ sched_yield();
+ }
+ std::atomic<size_t> localCounter(0);
+ std::atomic<int> localWork(0);
+ for (size_t i = iters; i > 0; --i) {
+ localCounter++;
+ for (size_t j = work; j > 0; --j) {
+ localWork.load();
+ }
+ }
+ }));
+ }
+ braces.dismiss();
+ go = true;
+ for (auto& thr : threads) {
+ thr.join();
+ }
+static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) { // contentionAtWidth with the std::atomic tag
+ contentionAtWidth<std::atomic>(iters, stripes, work);
+static void contentionAtWidthThreadLocal(size_t iters,
+ size_t stripes,
+ size_t work) { // contentionAtWidth with ThreadLocalTag
+ contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
+static void contentionAtWidthPthreadSelf(size_t iters,
+ size_t stripes,
+ size_t work) { // contentionAtWidth with PthreadSelfTag
+ contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0) // Registrations: arguments after the name are (stripes, work); this group uses work = 0.
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0) // baseline takes only (work); numThreads keeps its default
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500) // work = 500 group
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000) // work = 1000 group
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
+BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
+int main(int argc, char** argv) { // Benchmark-only entry point: parses flags and runs the registered benchmarks.
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ folly::runBenchmarks();
+ return 0;
#include <unordered_map>
#include <glog/logging.h>
#include <gtest/gtest.h>
-#include <folly/Benchmark.h>
using namespace folly::detail;
DECLARE_SPREADER_TAG(ManualTag, CacheLocality::uniform(16), testingGetcpu)
- ThreadLocalTag,
- CacheLocality::system<>(),
- folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
- CacheLocality::system<>(),
- folly::detail::FallbackGetcpu<HashingThreadId>::getcpu)
TEST(AccessSpreader, Wrapping) {
// this test won't pass unless locality.numCpus divides kMaxCpus
-BENCHMARK(AccessSpreaderUse, iters) {
- for (unsigned long i = 0; i < iters; ++i) {
- auto x = AccessSpreader<>::current(16);
- folly::doNotOptimizeAway(x);
- }
-// Benchmark scores here reflect the time for 32 threads to perform an
-// atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly,
-// if we don't separate the counters onto unique 128 byte stripes the
-// 1_stripe and 2_stripe results are identical, even though the L3 is
-// claimed to have 64 byte cache lines.
-// Getcpu refers to the vdso getcpu implementation. ThreadLocal refers
-// to execution using SequentialThreadId, the fallback if the vdso
-// getcpu isn't available. PthreadSelf hashes the value returned from
-// pthread_self() as a fallback-fallback for systems that don't have
-// thread-local support.
-// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
-// so since the stripe selection is 12 nanos the atomic increments in
-// the L1 is ~17 nanos. At width 8_stripe_0_work the line is expected
-// to ping-pong almost every operation, since the loops have the same
-// duration. Widths 4 and 2 have the same behavior, but each tour of the
-// cache line is 4 and 8 cores long, respectively. These all suggest a
-// lower bound of 60 nanos for intra-chip handoff and increment between
-// the L1s.
-// With 420 nanos of busywork per contended increment, the system can
-// hide all of the latency of a tour of length 4, but not quite one of
-// length 8. I was a bit surprised at how much worse the non-striped
-// version got. It seems that the inter-chip traffic also interferes
-// with the L1-only localWork.load(). When the local work is doubled
-// to about 1 microsecond we see that the inter-chip contention is still
-// very important, but subdivisions on the same chip don't matter.
-// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
-// --benchmark --bm_min_iters=1000000
-// ============================================================================
-// folly/test/CacheLocalityTest.cpp relative time/iter iters/s
-// ============================================================================
-// AccessSpreaderUse 11.94ns 83.79M
-// ----------------------------------------------------------------------------
-// contentionAtWidthGetcpu(1_stripe_0_work) 985.75ns 1.01M
-// contentionAtWidthGetcpu(2_stripe_0_work) 424.02ns 2.36M
-// contentionAtWidthGetcpu(4_stripe_0_work) 190.13ns 5.26M
-// contentionAtWidthGetcpu(8_stripe_0_work) 91.86ns 10.89M
-// contentionAtWidthGetcpu(16_stripe_0_work) 29.31ns 34.12M
-// contentionAtWidthGetcpu(32_stripe_0_work) 29.53ns 33.86M
-// contentionAtWidthGetcpu(64_stripe_0_work) 29.93ns 33.41M
-// contentionAtWidthThreadLocal(2_stripe_0_work) 609.21ns 1.64M
-// contentionAtWidthThreadLocal(4_stripe_0_work) 303.60ns 3.29M
-// contentionAtWidthThreadLocal(8_stripe_0_work) 246.57ns 4.06M
-// contentionAtWidthThreadLocal(16_stripe_0_work) 154.84ns 6.46M
-// contentionAtWidthThreadLocal(32_stripe_0_work) 24.14ns 41.43M
-// contentionAtWidthThreadLocal(64_stripe_0_work) 23.95ns 41.75M
-// contentionAtWidthPthreadSelf(2_stripe_0_work) 722.01ns 1.39M
-// contentionAtWidthPthreadSelf(4_stripe_0_work) 501.56ns 1.99M
-// contentionAtWidthPthreadSelf(8_stripe_0_work) 474.58ns 2.11M
-// contentionAtWidthPthreadSelf(16_stripe_0_work) 300.90ns 3.32M
-// contentionAtWidthPthreadSelf(32_stripe_0_work) 175.77ns 5.69M
-// contentionAtWidthPthreadSelf(64_stripe_0_work) 174.88ns 5.72M
-// atomicIncrBaseline(local_incr_0_work) 16.81ns 59.51M
-// ----------------------------------------------------------------------------
-// contentionAtWidthGetcpu(1_stripe_500_work) 1.82us 549.97K
-// contentionAtWidthGetcpu(2_stripe_500_work) 533.71ns 1.87M
-// contentionAtWidthGetcpu(4_stripe_500_work) 424.64ns 2.35M
-// contentionAtWidthGetcpu(8_stripe_500_work) 451.85ns 2.21M
-// contentionAtWidthGetcpu(16_stripe_500_work) 425.54ns 2.35M
-// contentionAtWidthGetcpu(32_stripe_500_work) 501.66ns 1.99M
-// atomicIncrBaseline(local_incr_500_work) 438.46ns 2.28M
-// ----------------------------------------------------------------------------
-// contentionAtWidthGetcpu(1_stripe_1000_work) 1.88us 532.20K
-// contentionAtWidthGetcpu(2_stripe_1000_work) 824.62ns 1.21M
-// contentionAtWidthGetcpu(4_stripe_1000_work) 803.56ns 1.24M
-// contentionAtWidthGetcpu(8_stripe_1000_work) 926.65ns 1.08M
-// contentionAtWidthGetcpu(16_stripe_1000_work) 900.10ns 1.11M
-// contentionAtWidthGetcpu(32_stripe_1000_work) 890.75ns 1.12M
-// atomicIncrBaseline(local_incr_1000_work) 774.47ns 1.29M
-// ============================================================================
-template <template <typename> class Tag>
-static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
- const size_t counterAlignment = 128;
- const size_t numThreads = 32;
- folly::BenchmarkSuspender braces;
- std::atomic<size_t> ready(0);
- std::atomic<bool> go(false);
- // while in theory the cache line size is 64 bytes, experiments show
- // that we get contention on 128 byte boundaries for Ivy Bridge. The
- // extra indirection adds 1 or 2 nanos
- assert(counterAlignment >= sizeof(std::atomic<size_t>));
- std::vector<char> raw(counterAlignment * stripes);
- // if we happen to be using the tlsRoundRobin, then sequentially
- // assigning the thread identifiers is the unlikely best-case scenario.
- // We don't want to unfairly benefit or penalize. Computing the exact
- // maximum likelihood of the probability distributions is annoying, so
- // I approximate as 2/5 of the ids that have no threads, 2/5 that have
- // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
- // wrapping back to slot 0 when we hit 1/15 and 1/5.
- std::vector<std::thread> threads;
- while (threads.size() < numThreads) {
- threads.push_back(std::thread([&, iters, stripes, work]() {
- std::atomic<size_t>* counters[stripes];
- for (size_t i = 0; i < stripes; ++i) {
- counters[i] =
- new (raw.data() + counterAlignment * i) std::atomic<size_t>();
- }
- ready++;
- while (!go.load()) {
- sched_yield();
- }
- std::atomic<int> localWork(0);
- for (size_t i = iters; i > 0; --i) {
- ++*(counters[AccessSpreader<Tag>::current(stripes)]);
- for (size_t j = work; j > 0; --j) {
- localWork.load();
- }
- }
- }));
- if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
- // create a few dummy threads to wrap back around to 0 mod numCpus
- for (size_t i = threads.size(); i != numThreads; ++i) {
- std::thread([&]() { AccessSpreader<Tag>::current(stripes); }).join();
- }
- }
- }
- while (ready < numThreads) {
- sched_yield();
- }
- braces.dismiss();
- go = true;
- for (auto& thr : threads) {
- thr.join();
- }
-static void atomicIncrBaseline(size_t iters,
- size_t work,
- size_t numThreads = 32) {
- folly::BenchmarkSuspender braces;
- std::atomic<bool> go(false);
- std::vector<std::thread> threads;
- while (threads.size() < numThreads) {
- threads.push_back(std::thread([&]() {
- while (!go.load()) {
- sched_yield();
- }
- std::atomic<size_t> localCounter(0);
- std::atomic<int> localWork(0);
- for (size_t i = iters; i > 0; --i) {
- localCounter++;
- for (size_t j = work; j > 0; --j) {
- localWork.load();
- }
- }
- }));
- }
- braces.dismiss();
- go = true;
- for (auto& thr : threads) {
- thr.join();
- }
-static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
- contentionAtWidth<std::atomic>(iters, stripes, work);
-static void contentionAtWidthThreadLocal(size_t iters,
- size_t stripes,
- size_t work) {
- contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
-static void contentionAtWidthPthreadSelf(size_t iters,
- size_t stripes,
- size_t work) {
- contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
-BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
-BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
-BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
-BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- gflags::ParseCommandLineFlags(&argc, &argv, true);
- auto ret = RUN_ALL_TESTS();
- if (!ret && FLAGS_benchmark) {
- folly::runBenchmarks();
- }
- return ret;
--- /dev/null
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/CallOnce.h>
+#include <deque>
+#include <mutex>
+#include <thread>
+#include <folly/Benchmark.h>
+#include <glog/logging.h>
+DEFINE_int32(threads, 16, "benchmark concurrency"); // Number of threads bm_impl starts per benchmark run (default 16).
+template <typename CallOnceFunc>
+void bm_impl(CallOnceFunc&& fn, int64_t iters) { // Starts FLAGS_threads threads that each call fn() iters times, then joins them.
+ std::deque<std::thread> threads;
+ for (int i = 0; i < FLAGS_threads; ++i) {
+ threads.emplace_back([&fn, iters] { // fn is captured by reference; all threads are joined before bm_impl returns
+ for (int64_t j = 0; j < iters; ++j) {
+ fn();
+ }
+ });
+ }
+ for (std::thread& t : threads) {
+ t.join();
+ }
+BENCHMARK(StdCallOnceBench, iters) { // Drives std::call_once on one shared flag from every bm_impl thread.
+ std::once_flag flag;
+ int out = 0;
+ bm_impl([&] { std::call_once(flag, [&] { ++out; }); }, iters);
+ CHECK_EQ(1, out); // the increment must have run exactly once across all threads and iterations
+BENCHMARK(FollyCallOnceBench, iters) { // Drives folly::call_once on one shared flag from every bm_impl thread.
+ folly::once_flag flag;
+ int out = 0;
+ bm_impl([&] { folly::call_once(flag, [&] { ++out; }); }, iters);
+ CHECK_EQ(1, out); // the increment must have run exactly once across all threads and iterations
+int main(int argc, char** argv) { // Benchmark-only entry point: parses flags and runs the registered benchmarks.
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ folly::runBenchmarks();
+ return 0;
#include <mutex>
#include <thread>
-#include <folly/Benchmark.h>
#include <folly/CallOnce.h>
-#include <gflags/gflags.h>
+#include <folly/portability/GFlags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
-BENCHMARK(StdCallOnceBench, iters) {
- std::once_flag flag;
- int out = 0;
- bm_impl([&] { std::call_once(flag, [&] { ++out; }); }, iters);
- ASSERT_EQ(1, out);
-BENCHMARK(FollyCallOnceBench, iters) {
- folly::once_flag flag;
- int out = 0;
- bm_impl([&] { folly::call_once(flag, [&] { ++out; }); }, iters);
- ASSERT_EQ(1, out);
TEST(FollyCallOnce, Simple) {
folly::once_flag flag;
auto fn = [&](int* outp) { ++*outp; };
ASSERT_EQ(1, out);
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- gflags::ParseCommandLineFlags(&argc, &argv, true);
- if (FLAGS_benchmark) {
- folly::runBenchmarksOnFlag();
- return 0;
- } else {
- return RUN_ALL_TESTS();
- }
--- /dev/null
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/Conv.h>
+#include <boost/lexical_cast.hpp>
+#include <folly/Benchmark.h>
+#include <folly/Foreach.h>
+#include <limits>
+#include <stdexcept>
+using namespace std;
+using namespace folly;
+// Benchmarks for ASCII to int conversion
+// @author: Rajat Goel (rajat)
+// Baseline string-to-int64 parser for the benchmarks below.
+//
+// Parses the half-open range [start, end): optional leading whitespace, an
+// optional '+' or '-' sign, then decimal digits running up to `end`.
+//
+// Throws std::runtime_error("empty string") when the range is empty or holds
+// only whitespace, std::runtime_error("overflow") when the magnitude does not
+// fit in int64_t, and std::runtime_error("extra chars at the end") when a
+// non-digit character precedes `end`.
+static int64_t handwrittenAtoi(const char* start, const char* end) {
+  bool positive = true;
+  int64_t retVal = 0;
+
+  // Cast to unsigned char: passing a negative char value to isspace() is
+  // undefined behaviour.
+  while (start < end && isspace(static_cast<unsigned char>(*start))) {
+    ++start;
+  }
+  // Checked after skipping whitespace so that *start below never reads at
+  // `end` (previously an all-whitespace input dereferenced one past the range).
+  if (start == end) {
+    throw std::runtime_error("empty string");
+  }
+
+  switch (*start) {
+    case '-':
+      positive = false;
+      // Intentional fall-through: the sign character is consumed below.
+    case '+':
+      ++start;
+      break;
+    default:
+      break;
+  }
+
+  const int64_t maxValue = std::numeric_limits<int64_t>::max();
+  while (start < end && *start >= '0' && *start <= '9') {
+    const int64_t digit = *start++ - '0';
+    // Test before multiplying: signed overflow is undefined, so comparing the
+    // wrapped result afterwards cannot be relied upon.
+    if (retVal > (maxValue - digit) / 10) {
+      throw std::runtime_error("overflow");
+    }
+    retVal = retVal * 10 + digit;
+  }
+
+  if (start != end) {
+    throw std::runtime_error("extra chars at the end");
+  }
+  return positive ? retVal : -retVal;
+}
+static StringPiece pc1 = "1234567890123456789";
+// Runs handwrittenAtoi `n` times over the last `digits` characters of pc1.
+void handwrittenAtoiMeasure(unsigned int n, unsigned int digits) {
+  const auto input = pc1.subpiece(pc1.size() - digits, digits);
+  for (unsigned int iter = 0; iter < n; ++iter) {
+    doNotOptimizeAway(handwrittenAtoi(input.begin(), input.end()));
+  }
+}
+// Runs folly::to<int64_t> `n` times over the last `digits` characters of pc1.
+void follyAtoiMeasure(unsigned int n, unsigned int digits) {
+  const auto input = pc1.subpiece(pc1.size() - digits, digits);
+  for (unsigned int iter = 0; iter < n; ++iter) {
+    doNotOptimizeAway(folly::to<int64_t>(input.begin(), input.end()));
+  }
+}
+// Runs atol() `n` times over the last `digits` characters of pc1.
+// atol() takes no length, so the piece must end at a NUL byte (asserted).
+void clibAtoiMeasure(unsigned int n, unsigned int digits) {
+  const auto input = pc1.subpiece(pc1.size() - digits, digits);
+  assert(*input.end() == 0);
+  static_assert(sizeof(long) == 8, "64-bit long assumed");
+  for (unsigned int iter = 0; iter < n; ++iter) {
+    doNotOptimizeAway(atol(input.begin()));
+  }
+}
+// Runs strtoul() (base 10) `n` times over the last `digits` characters of
+// pc1. The piece must end at a NUL byte (asserted).
+void clibStrtoulMeasure(unsigned int n, unsigned int digits) {
+  const auto input = pc1.subpiece(pc1.size() - digits, digits);
+  assert(*input.end() == 0);
+  char* parseEnd;
+  for (unsigned int iter = 0; iter < n; ++iter) {
+    doNotOptimizeAway(strtoul(input.begin(), &parseEnd, 10));
+  }
+}
+// Runs boost::lexical_cast<uint64_t> `n` times over the last `digits`
+// characters of pc1, passed as a C string; the piece must end at a NUL byte
+// (asserted).
+void lexicalCastMeasure(unsigned int n, unsigned int digits) {
+  const auto input = pc1.subpiece(pc1.size() - digits, digits);
+  assert(*input.end() == 0);
+  for (unsigned int iter = 0; iter < n; ++iter) {
+    doNotOptimizeAway(boost::lexical_cast<uint64_t>(input.begin()));
+  }
+}
+// Benchmarks for unsigned to string conversion, raw
+// Writes the decimal representation of `value` into `dst` and returns the
+// number of characters written, which is taken from digits10(value).
+// No terminating NUL is written; `dst` must have room for that many chars.
+unsigned u64ToAsciiTable(uint64_t value, char* dst) {
+ // Lookup table: entry i (at offset 2 * i) is the two-character decimal
+ // form of i for 0 <= i <= 99.
+ static const char digits[201] =
+ "00010203040506070809"
+ "10111213141516171819"
+ "20212223242526272829"
+ "30313233343536373839"
+ "40414243444546474849"
+ "50515253545556575859"
+ "60616263646566676869"
+ "70717273747576777879"
+ "80818283848586878889"
+ "90919293949596979899";
+ uint32_t const length = digits10(value);
+ // `next` indexes the rightmost unwritten position; output is filled from
+ // the end of the number towards the front.
+ uint32_t next = length - 1;
+ // Emit two digits per iteration while at least three digits remain.
+ while (value >= 100) {
+ auto const i = (value % 100) * 2;
+ value /= 100;
+ dst[next] = digits[i + 1];
+ dst[next - 1] = digits[i];
+ next -= 2;
+ }
+ // Handle last 1-2 digits
+ if (value < 10) {
+ dst[next] = '0' + uint32_t(value);
+ } else {
+ auto i = uint32_t(value) * 2;
+ dst[next] = digits[i + 1];
+ dst[next - 1] = digits[i];
+ }
+ return length;
+// Calls u64ToAsciiTable(value + n, ...) once per iteration, `n` times, into
+// a 20-byte stack buffer.
+void u64ToAsciiTableBM(unsigned int n, uint64_t value) {
+  char scratch[20];
+  for (unsigned int iter = 0; iter < n; ++iter) {
+    doNotOptimizeAway(u64ToAsciiTable(value + n, scratch));
+  }
+}
+// Writes the decimal representation of `value` into `dst` and returns the
+// number of characters written. No terminating NUL is written.
+unsigned u64ToAsciiClassic(uint64_t value, char* dst) {
+  // Emit digits least-significant first...
+  char* cursor = dst;
+  do {
+    *cursor++ = '0' + (value % 10);
+    value /= 10;
+  } while (value != 0);
+  unsigned length = cursor - dst;
+  // ...then swap the two ends inward to put them in reading order.
+  for (char *lo = dst, *hi = cursor - 1; lo < hi; ++lo, --hi) {
+    const char tmp = *hi;
+    *hi = *lo;
+    *lo = tmp;
+  }
+  return length;
+}
+// Calls u64ToAsciiClassic(value + n, ...) once per iteration, `n` times,
+// into a 20-byte stack buffer.
+void u64ToAsciiClassicBM(unsigned int n, uint64_t value) {
+  char scratch[20];
+  for (unsigned int iter = 0; iter < n; ++iter) {
+    doNotOptimizeAway(u64ToAsciiClassic(value + n, scratch));
+  }
+}
+// Calls uint64ToBufferUnsafe(value + n, ...) once per iteration, `n` times,
+// into a 20-byte stack buffer.
+void u64ToAsciiFollyBM(unsigned int n, uint64_t value) {
+  char scratch[20];
+  for (unsigned int iter = 0; iter < n; ++iter) {
+    doNotOptimizeAway(uint64ToBufferUnsafe(value + n, scratch));
+  }
+}
+// Benchmark unsigned to string conversion
+void u64ToStringClibMeasure(unsigned int n, uint64_t value) {
+ // FOLLY_RANGE_CHECK_TO_STRING expands to std::to_string, except on Android
+ // where std::to_string is not supported
+// Converts `value + n` to std::string with folly::to, `n` times.
+void u64ToStringFollyMeasure(unsigned int n, uint64_t value) {
+  FOR_EACH_RANGE(i, 0, n) {
+    // Feed the result to doNotOptimizeAway, as the other string benchmarks
+    // in this file do, so the conversion is not left as an unused value the
+    // optimizer may discard.
+    auto converted = to<std::string>(value + n);
+    doNotOptimizeAway(converted.size());
+  }
+}
+// Benchmark uitoa with string append
+// Formats `value` with u64ToAsciiClassic into a stack buffer and appends it
+// to one growing string, `n` times.
+void u2aAppendClassicBM(unsigned int n, uint64_t value) {
+  string out;
+  for (unsigned int iter = 0; iter < n; ++iter) {
+    char scratch[20];
+    const auto written = u64ToAsciiClassic(value, scratch);
+    out.append(scratch, written);
+    doNotOptimizeAway(out.size());
+  }
+}
+// Formats `value` with uint64ToBufferUnsafe into a stack buffer and appends
+// it to one growing string, `n` times.
+void u2aAppendFollyBM(unsigned int n, uint64_t value) {
+  string out;
+  for (unsigned int iter = 0; iter < n; ++iter) {
+    char scratch[20];
+    const auto written = uint64ToBufferUnsafe(value, scratch);
+    out.append(scratch, written);
+    doNotOptimizeAway(out.size());
+  }
+}
+// Benchmark functor: builds a String of `len` '0' characters (outside the
+// timed region) and converts it with to<String>(s), `n` times.
+template <class String>
+struct StringIdenticalToBM {
+  // User-provided default constructor; the file declares `static const`
+  // instances of this type without an initializer.
+  StringIdenticalToBM() {}
+  void operator()(unsigned int n, size_t len) const {
+    String input;
+    BENCHMARK_SUSPEND { input.append(len, '0'); }
+    for (unsigned int iter = 0; iter < n; ++iter) {
+      String converted = to<String>(input);
+      doNotOptimizeAway(converted.size());
+    }
+  }
+};
+// Benchmark functor: builds a String of `len` '0' characters (outside the
+// timed region) and converts it with the two-argument call
+// to<String>(s, nullptr), `n` times.
+template <class String>
+struct StringVariadicToBM {
+  // User-provided default constructor; the file declares `static const`
+  // instances of this type without an initializer.
+  StringVariadicToBM() {}
+  void operator()(unsigned int n, size_t len) const {
+    String input;
+    BENCHMARK_SUSPEND { input.append(len, '0'); }
+    for (unsigned int iter = 0; iter < n; ++iter) {
+      String converted = to<String>(input, nullptr);
+      doNotOptimizeAway(converted.size());
+    }
+  }
+};
+static size_t bigInt = 11424545345345;
+static size_t smallInt = 104;
+static char someString[] = "this is some nice string";
+static char otherString[] = "this is a long string, so it's not so nice";
+static char reallyShort[] = "meh";
+static std::string stdString = "std::strings are very nice";
+static float fValue = 1.2355;
+static double dValue = 345345345.435;
+// Builds several std::strings per iteration through variadic
+// to<std::string>() calls over the file-level integer and string inputs.
+// NOTE(review): none of the results is passed to doNotOptimizeAway.
+// NOTE(review): val4 includes dValue (a double) despite the "NoFloat" name;
+// confirm whether that is intended.
+BENCHMARK(preallocateTestNoFloat, n) {
+ for (size_t i = 0; i < n; ++i) {
+ auto val1 = to<std::string>(bigInt, someString, stdString, otherString);
+ auto val3 = to<std::string>(reallyShort, smallInt);
+ auto val2 = to<std::string>(bigInt, stdString);
+ auto val4 = to<std::string>(bigInt, stdString, dValue, otherString);
+ auto val5 = to<std::string>(bigInt, someString, reallyShort);
+ }
+// Builds two std::strings per iteration through variadic to<std::string>()
+// calls that mix a string, a char delimiter and float/double values.
+// NOTE(review): neither result is passed to doNotOptimizeAway.
+BENCHMARK(preallocateTestFloat, n) {
+ for (size_t i = 0; i < n; ++i) {
+ auto val1 = to<std::string>(stdString, ',', fValue, dValue);
+ auto val2 = to<std::string>(stdString, ',', dValue);
+ }
+static const StringIdenticalToBM<std::string> stringIdenticalToBM;
+static const StringVariadicToBM<std::string> stringVariadicToBM;
+static const StringIdenticalToBM<fbstring> fbstringIdenticalToBM;
+static const StringVariadicToBM<fbstring> fbstringVariadicToBM;
+ BENCHMARK_PARAM(u64ToAsciiClassicBM, n); \
+ BENCHMARK_PARAM(u64ToStringClibMeasure, n); \
+ BENCHMARK_RELATIVE_PARAM(u64ToStringFollyMeasure, n); \
+ BENCHMARK_PARAM(clibAtoiMeasure, n); \
+ BENCHMARK_RELATIVE_PARAM(lexicalCastMeasure, n); \
+ BENCHMARK_RELATIVE_PARAM(handwrittenAtoiMeasure, n); \
+ BENCHMARK_RELATIVE_PARAM(follyAtoiMeasure, n); \
+ BENCHMARK_PARAM(T##VariadicToBM, n); \
+DEFINE_BENCHMARK_GROUP(fbstring, 1024);
+DEFINE_BENCHMARK_GROUP(fbstring, 32768);
+// Benchmark-only entry point: parses the gflags command line, then calls
+// folly::runBenchmarks() unconditionally (no FLAGS_benchmark check).
+int main(int argc, char** argv) {
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ folly::runBenchmarks();
+ return 0;
* limitations under the License.
-#include <folly/Benchmark.h>
#include <folly/Conv.h>
#include <folly/Foreach.h>
#include <boost/lexical_cast.hpp>
using namespace folly;
-TEST(Conv, digits10Minimal) {
- // Not much of a test (and it's included in the test below anyway).
- // I just want to inspect the generated assembly for this function.
- folly::doNotOptimizeAway(digits10(random() * random()));
TEST(Conv, digits10) {
char buffer[100];
uint64_t power;
toAppendDelimFit(",", str1, str2, &res3);
EXPECT_EQ(res3, str1 + "," + str2);
-// Benchmarks for ASCII to int conversion
-// @author: Rajat Goel (rajat)
-static int64_t handwrittenAtoi(const char* start, const char* end) {
- bool positive = true;
- int64_t retVal = 0;
- if (start == end) {
- throw std::runtime_error("empty string");
- }
- while (start < end && isspace(*start)) {
- ++start;
- }
- switch (*start) {
- case '-':
- positive = false;
- case '+':
- ++start;
- default:;
- }
- while (start < end && *start >= '0' && *start <= '9') {
- auto const newRetVal = retVal * 10 + (*start++ - '0');
- if (newRetVal < retVal) {
- throw std::runtime_error("overflow");
- }
- retVal = newRetVal;
- }
- if (start != end) {
- throw std::runtime_error("extra chars at the end");
- }
- return positive ? retVal : -retVal;
-static StringPiece pc1 = "1234567890123456789";
-void handwrittenAtoiMeasure(unsigned int n, unsigned int digits) {
- auto p = pc1.subpiece(pc1.size() - digits, digits);
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(handwrittenAtoi(p.begin(), p.end()));
- }
-void follyAtoiMeasure(unsigned int n, unsigned int digits) {
- auto p = pc1.subpiece(pc1.size() - digits, digits);
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(folly::to<int64_t>(p.begin(), p.end()));
- }
-void clibAtoiMeasure(unsigned int n, unsigned int digits) {
- auto p = pc1.subpiece(pc1.size() - digits, digits);
- assert(*p.end() == 0);
- static_assert(sizeof(long) == 8, "64-bit long assumed");
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(atol(p.begin()));
- }
-void clibStrtoulMeasure(unsigned int n, unsigned int digits) {
- auto p = pc1.subpiece(pc1.size() - digits, digits);
- assert(*p.end() == 0);
- char * endptr;
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(strtoul(p.begin(), &endptr, 10));
- }
-void lexicalCastMeasure(unsigned int n, unsigned int digits) {
- auto p = pc1.subpiece(pc1.size() - digits, digits);
- assert(*p.end() == 0);
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(boost::lexical_cast<uint64_t>(p.begin()));
- }
-// Benchmarks for unsigned to string conversion, raw
-unsigned u64ToAsciiTable(uint64_t value, char* dst) {
- static const char digits[201] =
- "00010203040506070809"
- "10111213141516171819"
- "20212223242526272829"
- "30313233343536373839"
- "40414243444546474849"
- "50515253545556575859"
- "60616263646566676869"
- "70717273747576777879"
- "80818283848586878889"
- "90919293949596979899";
- uint32_t const length = digits10(value);
- uint32_t next = length - 1;
- while (value >= 100) {
- auto const i = (value % 100) * 2;
- value /= 100;
- dst[next] = digits[i + 1];
- dst[next - 1] = digits[i];
- next -= 2;
- }
- // Handle last 1-2 digits
- if (value < 10) {
- dst[next] = '0' + uint32_t(value);
- } else {
- auto i = uint32_t(value) * 2;
- dst[next] = digits[i + 1];
- dst[next - 1] = digits[i];
- }
- return length;
-void u64ToAsciiTableBM(unsigned int n, uint64_t value) {
- // This is too fast, need to do 10 times per iteration
- char buf[20];
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(u64ToAsciiTable(value + n, buf));
- }
-unsigned u64ToAsciiClassic(uint64_t value, char* dst) {
- // Write backwards.
- char* next = (char*)dst;
- char* start = next;
- do {
- *next++ = '0' + (value % 10);
- value /= 10;
- } while (value != 0);
- unsigned length = next - start;
- // Reverse in-place.
- next--;
- while (next > start) {
- char swap = *next;
- *next = *start;
- *start = swap;
- next--;
- start++;
- }
- return length;
-void u64ToAsciiClassicBM(unsigned int n, uint64_t value) {
- // This is too fast, need to do 10 times per iteration
- char buf[20];
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(u64ToAsciiClassic(value + n, buf));
- }
-void u64ToAsciiFollyBM(unsigned int n, uint64_t value) {
- // This is too fast, need to do 10 times per iteration
- char buf[20];
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(uint64ToBufferUnsafe(value + n, buf));
- }
-// Benchmark unsigned to string conversion
-void u64ToStringClibMeasure(unsigned int n, uint64_t value) {
- // FOLLY_RANGE_CHECK_TO_STRING expands to std::to_string, except on Android
- // where std::to_string is not supported
- FOR_EACH_RANGE (i, 0, n) {
- }
-void u64ToStringFollyMeasure(unsigned int n, uint64_t value) {
- FOR_EACH_RANGE (i, 0, n) {
- to<std::string>(value + n);
- }
-// Benchmark uitoa with string append
-void u2aAppendClassicBM(unsigned int n, uint64_t value) {
- string s;
- FOR_EACH_RANGE (i, 0, n) {
- // auto buf = &s.back() + 1;
- char buffer[20];
- s.append(buffer, u64ToAsciiClassic(value, buffer));
- doNotOptimizeAway(s.size());
- }
-void u2aAppendFollyBM(unsigned int n, uint64_t value) {
- string s;
- FOR_EACH_RANGE (i, 0, n) {
- // auto buf = &s.back() + 1;
- char buffer[20];
- s.append(buffer, uint64ToBufferUnsafe(value, buffer));
- doNotOptimizeAway(s.size());
- }
-template <class String>
-struct StringIdenticalToBM {
- StringIdenticalToBM() {}
- void operator()(unsigned int n, size_t len) const {
- String s;
- BENCHMARK_SUSPEND { s.append(len, '0'); }
- FOR_EACH_RANGE (i, 0, n) {
- String result = to<String>(s);
- doNotOptimizeAway(result.size());
- }
- }
-template <class String>
-struct StringVariadicToBM {
- StringVariadicToBM() {}
- void operator()(unsigned int n, size_t len) const {
- String s;
- BENCHMARK_SUSPEND { s.append(len, '0'); }
- FOR_EACH_RANGE (i, 0, n) {
- String result = to<String>(s, nullptr);
- doNotOptimizeAway(result.size());
- }
- }
-static size_t bigInt = 11424545345345;
-static size_t smallInt = 104;
-static char someString[] = "this is some nice string";
-static char otherString[] = "this is a long string, so it's not so nice";
-static char reallyShort[] = "meh";
-static std::string stdString = "std::strings are very nice";
-static float fValue = 1.2355;
-static double dValue = 345345345.435;
-BENCHMARK(preallocateTestNoFloat, n) {
- for (size_t i = 0; i < n; ++i) {
- auto val1 = to<std::string>(bigInt, someString, stdString, otherString);
- auto val3 = to<std::string>(reallyShort, smallInt);
- auto val2 = to<std::string>(bigInt, stdString);
- auto val4 = to<std::string>(bigInt, stdString, dValue, otherString);
- auto val5 = to<std::string>(bigInt, someString, reallyShort);
- }
-BENCHMARK(preallocateTestFloat, n) {
- for (size_t i = 0; i < n; ++i) {
- auto val1 = to<std::string>(stdString, ',', fValue, dValue);
- auto val2 = to<std::string>(stdString, ',', dValue);
- }
-static const StringIdenticalToBM<std::string> stringIdenticalToBM;
-static const StringVariadicToBM<std::string> stringVariadicToBM;
-static const StringIdenticalToBM<fbstring> fbstringIdenticalToBM;
-static const StringVariadicToBM<fbstring> fbstringVariadicToBM;
- BENCHMARK_PARAM(u64ToAsciiClassicBM, n); \
- BENCHMARK_PARAM(u64ToStringClibMeasure, n); \
- BENCHMARK_RELATIVE_PARAM(u64ToStringFollyMeasure, n); \
- BENCHMARK_PARAM(clibAtoiMeasure, n); \
- BENCHMARK_RELATIVE_PARAM(lexicalCastMeasure, n); \
- BENCHMARK_RELATIVE_PARAM(handwrittenAtoiMeasure, n); \
- BENCHMARK_RELATIVE_PARAM(follyAtoiMeasure, n); \
- BENCHMARK_PARAM(T ## VariadicToBM, n); \
-DEFINE_BENCHMARK_GROUP(fbstring, 1024);
-DEFINE_BENCHMARK_GROUP(fbstring, 32768);
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- gflags::ParseCommandLineFlags(&argc, &argv, true);
- auto ret = RUN_ALL_TESTS();
- if (!ret && FLAGS_benchmark) {
- folly::runBenchmarks();
- }
- return ret;
--- /dev/null
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/Logging.h>
+#include <folly/Benchmark.h>
+#include <vector>
+// Times FB_LOG_EVERY_MS(INFO, 1000) with FLAGS_minloglevel set to 2 for the
+// duration of the run; the previous level is restored afterwards.
+BENCHMARK(skip_overhead, iter) {
+  const auto savedMinLogLevel = FLAGS_minloglevel;
+  FLAGS_minloglevel = 2;
+  for (unsigned remaining = iter; remaining != 0; --remaining) {
+    FB_LOG_EVERY_MS(INFO, 1000) << "every 1s";
+  }
+  FLAGS_minloglevel = savedMinLogLevel;
+}
+// Times FB_LOG_EVERY_MS(INFO, -1) with FLAGS_minloglevel set to 2 for the
+// duration of the run; the previous level is restored afterwards.
+BENCHMARK(dev_null_log_overhead, iter) {
+  const auto savedMinLogLevel = FLAGS_minloglevel;
+  FLAGS_minloglevel = 2;
+  for (unsigned remaining = iter; remaining != 0; --remaining) {
+    FB_LOG_EVERY_MS(INFO, -1) << "every -1ms";
+  }
+  FLAGS_minloglevel = savedMinLogLevel;
+}
+// ============================================================================
+// folly/test/LoggingTest.cpp relative time/iter iters/s
+// ============================================================================
+// skip_overhead 36.37ns 27.49M
+// dev_null_log_overhead 2.61us 382.57K
+// ============================================================================
+// Benchmark-only entry point: parses the gflags command line, then calls
+// folly::runBenchmarks() unconditionally (no FLAGS_benchmark check).
+int main(int argc, char** argv) {
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ folly::runBenchmarks();
+ return 0;
#include <folly/Logging.h>
-#include <gflags/gflags.h>
#include <gtest/gtest.h>
-#include <folly/Benchmark.h>
#include <vector>
TEST(LogEveryMs, basic) {
EXPECT_EQ(10, count);
-BENCHMARK(skip_overhead, iter) {
- auto prev = FLAGS_minloglevel;
- FLAGS_minloglevel = 2;
- for (unsigned i = 0; i < iter; ++i) {
- FB_LOG_EVERY_MS(INFO, 1000) << "every 1s";
- }
- FLAGS_minloglevel = prev;
-BENCHMARK(dev_null_log_overhead, iter) {
- auto prev = FLAGS_minloglevel;
- FLAGS_minloglevel = 2;
- for (unsigned i = 0; i < iter; ++i) {
- FB_LOG_EVERY_MS(INFO, -1) << "every -1ms";
- }
- FLAGS_minloglevel = prev;
-// ============================================================================
-// folly/test/LoggingTest.cpp relative time/iter iters/s
-// ============================================================================
-// skip_overhead 36.37ns 27.49M
-// dev_null_log_overhead 2.61us 382.57K
-// ============================================================================
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- gflags::ParseCommandLineFlags(&argc, &argv, true);
- auto rv = RUN_ALL_TESTS();
- if (!rv && FLAGS_benchmark) {
- folly::runBenchmarks();
- }
- return rv;
--- /dev/null
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/detail/MemoryIdler.h>
+#include <folly/Benchmark.h>
+using namespace folly::detail;
+// Calls MemoryIdler::unmapUnusedStack() `iters` times.
+BENCHMARK(releaseStack, iters) {
+  for (size_t remaining = iters; remaining != 0; --remaining) {
+    MemoryIdler::unmapUnusedStack();
+  }
+}
+// Calls MemoryIdler::flushLocalMallocCaches() `iters` times.
+BENCHMARK(releaseMallocTLS, iters) {
+  for (size_t remaining = iters; remaining != 0; --remaining) {
+    MemoryIdler::flushLocalMallocCaches();
+  }
+}
+// Benchmark-only entry point: parses the gflags command line, then calls
+// folly::runBenchmarks() unconditionally (no FLAGS_benchmark check).
+int main(int argc, char** argv) {
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ folly::runBenchmarks();
+ return 0;
#include <folly/detail/MemoryIdler.h>
#include <folly/Baton.h>
#include <memory>
#include <thread>
#include <assert.h>
#include <semaphore.h>
-#include <gflags/gflags.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
-#include <folly/Benchmark.h>
using namespace folly;
using namespace folly::detail;
EXPECT_TRUE((MemoryIdler::futexWait<MockAtom, MockClock>(
fut, 1, -1, MockClock::duration::max())));
-BENCHMARK(releaseStack, iters) {
- for (size_t i = 0; i < iters; ++i) {
- MemoryIdler::unmapUnusedStack();
- }
-BENCHMARK(releaseMallocTLS, iters) {
- for (size_t i = 0; i < iters; ++i) {
- MemoryIdler::flushLocalMallocCaches();
- }
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- gflags::ParseCommandLineFlags(&argc, &argv, true);
- auto rv = RUN_ALL_TESTS();
- if (!rv && FLAGS_benchmark) {
- folly::runBenchmarks();
- }
- return rv;
--- /dev/null
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/Random.h>
+#include <folly/Benchmark.h>
+#include <folly/Foreach.h>
+#include <folly/Range.h>
+#include <glog/logging.h>
+#include <algorithm>
+#include <thread>
+#include <vector>
+#include <random>
+using namespace folly;
+// Times std::minstd_rand draws; seeding from std::random_device happens
+// while the BenchmarkSuspender is active, before it is dismissed.
+BENCHMARK(minstdrand, n) {
+  BenchmarkSuspender setup;
+  std::random_device seedSource;
+  std::minstd_rand generator(seedSource());
+  setup.dismiss();
+  for (decltype(n) i = 0; i < n; ++i) {
+    doNotOptimizeAway(generator());
+  }
+}
+// Times std::mt19937 draws; seeding from std::random_device happens while
+// the BenchmarkSuspender is active, before it is dismissed.
+BENCHMARK(mt19937, n) {
+  BenchmarkSuspender setup;
+  std::random_device seedSource;
+  std::mt19937 generator(seedSource());
+  setup.dismiss();
+  for (decltype(n) i = 0; i < n; ++i) {
+    doNotOptimizeAway(generator());
+  }
+}
+// Times ThreadLocalPRNG draws; one warm-up draw is made while the
+// BenchmarkSuspender is active, before it is dismissed.
+BENCHMARK(threadprng, n) {
+  BenchmarkSuspender setup;
+  ThreadLocalPRNG generator;
+  generator();
+  setup.dismiss();
+  for (decltype(n) i = 0; i < n; ++i) {
+    doNotOptimizeAway(generator());
+  }
+}
+// One call per benchmark iteration to each folly::Random entry point.
+BENCHMARK(RandomDouble) { doNotOptimizeAway(Random::randDouble01()); }
+BENCHMARK(Random32) { doNotOptimizeAway(Random::rand32()); }
+BENCHMARK(Random32Num) { doNotOptimizeAway(Random::rand32(100)); }
+BENCHMARK(Random64) { doNotOptimizeAway(Random::rand64()); }
+// The bound is built from a 64-bit operand: `100ul << 32` is undefined where
+// unsigned long is only 32 bits wide.
+BENCHMARK(Random64Num) {
+  doNotOptimizeAway(Random::rand64(uint64_t(100) << 32));
+}
+BENCHMARK(Random64OneIn) { doNotOptimizeAway(Random::oneIn(100)); }
+// Benchmark-only entry point: parses the gflags command line, then calls
+// folly::runBenchmarks() unconditionally (no FLAGS_benchmark check).
+int main(int argc, char** argv) {
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ folly::runBenchmarks();
+ return 0;
#include <folly/Random.h>
-#include <folly/Range.h>
-#include <folly/Benchmark.h>
-#include <folly/Foreach.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
EXPECT_LT(seeds[i], seeds[i+1]);
-BENCHMARK(minstdrand, n) {
- BenchmarkSuspender braces;
- std::random_device rd;
- std::minstd_rand rng(rd());
- braces.dismiss();
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(rng());
- }
-BENCHMARK(mt19937, n) {
- BenchmarkSuspender braces;
- std::random_device rd;
- std::mt19937 rng(rd());
- braces.dismiss();
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(rng());
- }
-BENCHMARK(threadprng, n) {
- BenchmarkSuspender braces;
- ThreadLocalPRNG tprng;
- tprng();
- braces.dismiss();
- FOR_EACH_RANGE (i, 0, n) {
- doNotOptimizeAway(tprng());
- }
-BENCHMARK(RandomDouble) { doNotOptimizeAway(Random::randDouble01()); }
-BENCHMARK(Random32) { doNotOptimizeAway(Random::rand32()); }
-BENCHMARK(Random32Num) { doNotOptimizeAway(Random::rand32(100)); }
-BENCHMARK(Random64) { doNotOptimizeAway(Random::rand64()); }
-BENCHMARK(Random64Num) { doNotOptimizeAway(Random::rand64(100ul << 32)); }
-BENCHMARK(Random64OneIn) { doNotOptimizeAway(Random::oneIn(100)); }
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- gflags::ParseCommandLineFlags(&argc, &argv, true);
- if (FLAGS_benchmark) {
- folly::runBenchmarks();
- }
- return RUN_ALL_TESTS();
--- /dev/null
+ * Copyright 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/ThreadLocal.h>
+#include <dlfcn.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <array>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <limits.h>
+#include <map>
+#include <mutex>
+#include <set>
+#include <thread>
+#include <unordered_map>
+#include <boost/thread/tss.hpp>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <folly/Benchmark.h>
+#include <folly/experimental/io/FsUtil.h>
+using namespace folly;
+// Simple reference implementation using pthread_get_specific
+// Minimal per-thread owning pointer built directly on pthread keys, used as
+// a comparison point in the benchmarks.
+template <typename T>
+class PThreadGetSpecific {
+ public:
+  // Creates the key and registers OnThreadExit as its destructor callback.
+  PThreadGetSpecific() : key_(0) {
+    pthread_key_create(&key_, OnThreadExit);
+  }
+
+  // Returns the calling thread's current pointer for this key.
+  T* get() const {
+    return static_cast<T*>(pthread_getspecific(key_));
+  }
+
+  // Deletes the calling thread's current object, then stores `t` in its place.
+  void reset(T* t) {
+    T* previous = get();
+    delete previous;
+    pthread_setspecific(key_, t);
+  }
+
+  // Key destructor callback: deletes the object passed to it.
+  static void OnThreadExit(void* obj) {
+    delete static_cast<T*>(obj);
+  }
+
+ private:
+  pthread_key_t key_;
+};
+DEFINE_int32(numThreads, 8, "Number simultaneous threads for benchmarks.");
+// REG(var) defines benchmark BM_mt_<var>. It starts FLAGS_numThreads threads;
+// each installs a fresh int through var.reset() and then increments it
+// through var.get() iters / FLAGS_numThreads times (integer division, so any
+// remainder iterations are not run). All threads are joined before returning.
+#define REG(var)                                         \
+  BENCHMARK(FB_CONCATENATE(BM_mt_, var), iters) {        \
+    const int perThread = iters / FLAGS_numThreads;      \
+    std::vector<std::thread> workers;                    \
+    for (int t = 0; t < FLAGS_numThreads; ++t) {         \
+      workers.emplace_back([&]() {                       \
+        var.reset(new int(0));                           \
+        for (int k = 0; k < perThread; ++k) {            \
+          ++(*var.get());                                \
+        }                                                \
+      });                                                \
+    }                                                    \
+    for (auto& worker : workers) {                       \
+      worker.join();                                     \
+    }                                                    \
+  }
+ThreadLocalPtr<int> tlp;
+PThreadGetSpecific<int> pthread_get_specific;
+boost::thread_specific_ptr<int> boost_tsp;
+// Benchmark-only entry point: parses the gflags command line, sets
+// bm_max_iters to 100000000 only if the flag still has its default value
+// (SET_FLAG_IF_DEFAULT), then calls folly::runBenchmarks() unconditionally.
+int main(int argc, char** argv) {
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ gflags::SetCommandLineOptionWithMode(
+ "bm_max_iters", "100000000", gflags::SET_FLAG_IF_DEFAULT);
+ folly::runBenchmarks();
+ return 0;
+Ran with 24 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches
+Benchmark Iters Total t t/iter iter/sec
+* BM_mt_tlp 100000000 39.88 ms 398.8 ps 2.335 G
+ +5.91% BM_mt_pthread_get_specific 100000000 42.23 ms 422.3 ps 2.205 G
+ + 295% BM_mt_boost_tsp 100000000 157.8 ms 1.578 ns 604.5 M
#include <thread>
#include <unordered_map>
-#include <boost/thread/tss.hpp>
-#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
-#include <folly/Benchmark.h>
#include <folly/Baton.h>
#include <folly/experimental/io/FsUtil.h>
// yep!
-// clang is unable to compile this code unless in c++14 mode.
-#if __cplusplus >= 201402L
-namespace {
-// This will fail to compile unless ThreadLocal{Ptr} has a constexpr
-// default constructor. This ensures that ThreadLocal is safe to use in
-// static constructors without worrying about initialization order
-class ConstexprThreadLocalCompile {
- ThreadLocal<int> a_;
- ThreadLocalPtr<int> b_;
- constexpr ConstexprThreadLocalCompile() {}
-// Simple reference implementation using pthread_get_specific
-template<typename T>
-class PThreadGetSpecific {
- public:
- PThreadGetSpecific() : key_(0) {
- pthread_key_create(&key_, OnThreadExit);
- }
- T* get() const {
- return static_cast<T*>(pthread_getspecific(key_));
- }
- void reset(T* t) {
- delete get();
- pthread_setspecific(key_, t);
- }
- static void OnThreadExit(void* obj) {
- delete static_cast<T*>(obj);
- }
- private:
- pthread_key_t key_;
-DEFINE_int32(numThreads, 8, "Number simultaneous threads for benchmarks.");
-#define REG(var) \
- BENCHMARK(FB_CONCATENATE(BM_mt_, var), iters) { \
- const int itersPerThread = iters / FLAGS_numThreads; \
- std::vector<std::thread> threads; \
- for (int i = 0; i < FLAGS_numThreads; ++i) { \
- threads.push_back(std::thread([&]() { \
- var.reset(new int(0)); \
- for (int i = 0; i < itersPerThread; ++i) { \
- ++(*var.get()); \
- } \
- })); \
- } \
- for (auto& t : threads) { \
- t.join(); \
- } \
- }
-ThreadLocalPtr<int> tlp;
-PThreadGetSpecific<int> pthread_get_specific;
-boost::thread_specific_ptr<int> boost_tsp;
-int main(int argc, char** argv) {
- testing::InitGoogleTest(&argc, argv);
- gflags::ParseCommandLineFlags(&argc, &argv, true);
- gflags::SetCommandLineOptionWithMode(
- "bm_max_iters", "100000000", gflags::SET_FLAG_IF_DEFAULT
- );
- if (FLAGS_benchmark) {
- folly::runBenchmarks();
- }
- return RUN_ALL_TESTS();
-Ran with 24 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches
-Benchmark Iters Total t t/iter iter/sec
-* BM_mt_tlp 100000000 39.88 ms 398.8 ps 2.335 G
- +5.91% BM_mt_pthread_get_specific 100000000 42.23 ms 422.3 ps 2.205 G
- + 295% BM_mt_boost_tsp 100000000 157.8 ms 1.578 ns 604.5 M