/// it doesn't have false sharing with anything at a smaller memory address.
#define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING FOLLY_ALIGNED(128)
-/// Holds a function pointer to the VDSO implementation of getcpu(2),
-/// if available
+/// Knows how to derive a function pointer to the VDSO implementation of
+/// getcpu(2), if available
struct Getcpu {
/// Function pointer to a function with the same signature as getcpu(2).
typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused);
/// Returns a pointer to the VDSO implementation of getcpu(2), if
- /// available, or nullptr otherwise
- static Func vdsoFunc();
+ /// available, or nullptr otherwise. This function may be quite
+ /// expensive, so be sure to cache the result.
+ static Func resolveVdsoFunc();
};
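A minimal caller-side sketch of the intended caching pattern, using only the
Getcpu interface above (the wrapper function is hypothetical, not part of this
change):

  // Resolve the VDSO pointer once (function-local static) and reuse it.
  unsigned currentCpuOrZero() {
    static const Getcpu::Func func = Getcpu::resolveVdsoFunc();
    unsigned cpu = 0;
    if (func != nullptr) {
      func(&cpu, nullptr, nullptr);  // node and the unused arg are ignored
    }
    return cpu;  // stays 0 when the VDSO getcpu isn't available
  }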
#ifdef FOLLY_TLS
typedef FallbackGetcpu<SequentialThreadId<std::atomic>> FallbackGetcpuType;
#else
typedef FallbackGetcpu<HashingThreadId> FallbackGetcpuType;
#endif
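FallbackGetcpu<> synthesizes a cpu-like value from a per-thread id when the
real getcpu(2) can't be used. A rough standalone sketch of that idea (made-up
name and simplified logic, not folly's actual FallbackGetcpu):

  #include <atomic>

  // Hand each thread a small sequential id the first time it asks, then
  // report that id as if it were the cpu; node is always 0.  When
  // thread_local isn't available, a hash of the thread id plays this role.
  static int fallbackGetcpuSketch(unsigned* cpu, unsigned* node, void*) {
    static std::atomic<unsigned> nextId{0};
    static thread_local unsigned myId = nextId.fetch_add(1);
    if (cpu != nullptr) {
      *cpu = myId;
    }
    if (node != nullptr) {
      *node = 0;
    }
    return 0;
  }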
-template <template <typename> class Atom, size_t kMaxCpus>
-struct AccessSpreaderArray;
-
/// AccessSpreader arranges access to a striped data structure in such a
/// way that concurrently executing threads are likely to be accessing
/// different stripes. It does NOT guarantee uncontended access.
/// Your underlying algorithm must be thread-safe without spreading, this
/// is merely an optimization. AccessSpreader::current(n) is typically
-/// much faster than a cache miss (22 nanos on my dev box, tested fast
+/// much faster than a cache miss (12 nanos on my dev box, tested fast
/// in both 2.6 and 3.2 kernels).
///
-/// You are free to create your own AccessSpreader-s or to cache the
-/// results of AccessSpreader<>::shared(n), but you will probably want
-/// to use one of the system-wide shared ones. Calling .current() on
-/// a particular AccessSpreader instance only saves about 1 nanosecond
-/// over calling AccessSpreader<>::shared(n).
-///
/// If available (and not using the deterministic testing implementation)
/// AccessSpreader uses the getcpu system call via VDSO and the
/// precise locality information retrieved from sysfs by CacheLocality.
/// If there are at least as many stripes as cpus, each cpu will get its
/// own stripe and there will be no cache sharing at all.
///
/// AccessSpreader has a fallback mechanism for when __vdso_getcpu can't be
-/// loaded, or for use during deterministic testing. Using sched_getcpu or
-/// the getcpu syscall would negate the performance advantages of access
-/// spreading, so we use a thread-local value and a shared atomic counter
-/// to spread access out.
+/// loaded, or for use during deterministic testing. Using sched_getcpu
+/// or the getcpu syscall would negate the performance advantages of
+/// access spreading, so we use a thread-local value and a shared atomic
+/// counter to spread access out. On systems lacking both a fast getcpu()
+/// and TLS, we hash the thread id to spread accesses.
///
/// AccessSpreader is templated on the template type that is used
/// to implement atomics, as a way to instantiate the underlying
/// heuristics differently for production use and deterministic unit
/// testing.
template <template <typename> class Atom = std::atomic>
struct AccessSpreader {
- /// Returns a never-destructed shared AccessSpreader instance.
- /// numStripes should be > 0.
- static const AccessSpreader& shared(size_t numStripes) {
- // sharedInstances[0] actually has numStripes == 1
- assert(numStripes > 0);
-
- // the last shared element handles all large sizes
- return AccessSpreaderArray<Atom, kMaxCpus>::sharedInstance[std::min(
- size_t(kMaxCpus), numStripes)];
- }
-
- /// Returns the stripe associated with the current CPU, assuming
- /// that there are numStripes (non-zero) stripes. Equivalent to
- /// AccessSpreader::shared(numStripes)->current.
+ /// Returns the stripe associated with the current CPU. The returned
+ /// value will be < numStripes.
static size_t current(size_t numStripes) {
- return shared(numStripes).current();
- }
-
- /// stripeByCore uses 1 stripe per L1 cache, according to
- /// CacheLocality::system<>(). Use stripeByCore.numStripes() to see
- /// its width, or stripeByCore.current() to get the current stripe
- static const AccessSpreader stripeByCore;
-
- /// stripeByChip uses 1 stripe per last-level cache, which is the fewest
- /// number of stripes for which off-chip communication can be avoided
- /// (assuming all caches are on-chip). Use stripeByChip.numStripes()
- /// to see its width, or stripeByChip.current() to get the current stripe
- static const AccessSpreader stripeByChip;
-
- /// Constructs an AccessSpreader that will return values from
- /// 0 to numStripes-1 (inclusive), precomputing the mapping
- /// from CPU to stripe. There is no use in having more than
- /// CacheLocality::system<Atom>().localityIndexByCpu.size() stripes or
- /// kMaxCpus stripes
- explicit AccessSpreader(
- size_t spreaderNumStripes,
- const CacheLocality& cacheLocality = CacheLocality::system<Atom>(),
- Getcpu::Func getcpuFunc = nullptr)
- : getcpuFunc_(getcpuFunc ? getcpuFunc
- : pickGetcpuFunc(spreaderNumStripes)),
- numStripes_(spreaderNumStripes) {
- auto n = cacheLocality.numCpus;
- for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
- auto index = cacheLocality.localityIndexByCpu[cpu];
- assert(index < n);
- // as index goes from 0..n, post-transform value goes from
- // 0..numStripes
- stripeByCpu[cpu] = (index * numStripes_) / n;
- assert(stripeByCpu[cpu] < numStripes_);
- }
- for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
- stripeByCpu[cpu] = stripeByCpu[cpu - n];
- }
- }
-
- /// Returns 1 more than the maximum value that can be returned from
- /// current()
- size_t numStripes() const { return numStripes_; }
+ // widthAndCpuToStripe[0] will actually work okay (all zeros), but
+ // something's wrong with the caller
+ assert(numStripes > 0);
- /// Returns the stripe associated with the current CPU
- size_t current() const {
unsigned cpu;
- getcpuFunc_(&cpu, nullptr, nullptr);
- return stripeByCpu[cpu % kMaxCpus];
+ getcpuFunc(&cpu, nullptr, nullptr);
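+    // Stripe counts above kMaxCpus clamp to the table's last row; its
+    // entries are all < kMaxCpus and therefore still < numStripes.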
+ return widthAndCpuToStripe[std::min(size_t(kMaxCpus),
+ numStripes)][cpu % kMaxCpus];
}
private:
/// Points to the getcpu-like function we are using to obtain the
/// current cpu. It should not be assumed that the returned cpu value
- /// is in range. We use a member for this instead of a static so that
- /// this fetch preloads a prefix the stripeByCpu array
- Getcpu::Func getcpuFunc_;
-
- /// A precomputed map from cpu to stripe. Rather than add a layer of
- /// indirection requiring a dynamic bounds check and another cache miss,
- /// we always precompute the whole array
- CompactStripe stripeByCpu[kMaxCpus];
-
- size_t numStripes_;
-
- /// Returns the best getcpu implementation for this type and width
- /// of AccessSpreader
- static Getcpu::Func pickGetcpuFunc(size_t numStripes);
-};
-
-template <>
-Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t);
-
-/// An array of kMaxCpus+1 AccessSpreader<Atom> instances constructed
-/// with default params, with the zero-th element having 1 stripe
-template <template <typename> class Atom, size_t kMaxStripe>
-struct AccessSpreaderArray {
-
- AccessSpreaderArray() {
- for (size_t i = 0; i <= kMaxStripe; ++i) {
- new (raw + i) AccessSpreader<Atom>(std::max(size_t(1), i));
+ /// is in range. We use a static for this so that we can prearrange a
+ /// valid value in the pre-constructed state and avoid the need for a
+ /// conditional on every subsequent invocation (not normally a big win,
+ /// but 20% on some inner loops here).
+ static Getcpu::Func getcpuFunc;
+
+ /// For each level of splitting up to kMaxCpus, maps the cpu (mod
+ /// kMaxCpus) to the stripe. Rather than performing any inequalities
+ /// or modulo on the actual number of cpus, we just fill in the entire
+ /// array.
+ static CompactStripe widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];
+
+ static bool initialized;
+
+ /// Returns the best getcpu implementation for Atom
+ static Getcpu::Func pickGetcpuFunc();
+
+ /// Always claims to be on CPU zero, node zero
+ static int degenerateGetcpu(unsigned* cpu, unsigned* node, void*) {
+ if (cpu != nullptr) {
+ *cpu = 0;
}
+ if (node != nullptr) {
+ *node = 0;
+ }
+ return 0;
}
- ~AccessSpreaderArray() {
- for (size_t i = 0; i <= kMaxStripe; ++i) {
- auto p = static_cast<AccessSpreader<Atom>*>(static_cast<void*>(raw + i));
- p->~AccessSpreader();
+ // The function to call for fast lookup of getcpu is a singleton, as
+ // is the precomputed table of locality information. AccessSpreader
+ // is used in very tight loops, however (we're trying to race an L1
+ // cache miss!), so the normal singleton mechanisms are noticeably
+ // expensive. Even a not-taken branch guarding access to getcpuFunc
+ // slows AccessSpreader::current from 12 nanos to 14. As a result, we
+ // populate the static members with simple (but valid) values that can
+ // be filled in by the linker, and then follow up with a normal static
+ // initializer call that puts in the proper version. This means that
+ // when there are initialization order issues we will just observe a
+ // zero stripe. Once a sanitizer gets smart enough to detect this as
+ // a race or undefined behavior, we can annotate it.
+
+ static bool initialize() {
+ getcpuFunc = pickGetcpuFunc();
+
+ auto& cacheLocality = CacheLocality::system<Atom>();
+ auto n = cacheLocality.numCpus;
+ for (size_t width = 0; width <= kMaxCpus; ++width) {
+ auto numStripes = std::max(size_t{1}, width);
+ for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
+ auto index = cacheLocality.localityIndexByCpu[cpu];
+ assert(index < n);
+ // as index goes from 0..n, post-transform value goes from
+ // 0..numStripes
+ widthAndCpuToStripe[width][cpu] = (index * numStripes) / n;
+ assert(widthAndCpuToStripe[width][cpu] < numStripes);
+ }
+ for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
+ widthAndCpuToStripe[width][cpu] = widthAndCpuToStripe[width][cpu - n];
+ }
}
+ return true;
}
+};
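To make the striping contract concrete, here is a caller-side sketch
(ShardedCounter is a made-up example type assuming <atomic>, <cstdint>,
and <cstddef>; it is not part of this change):

  // A 16-way striped counter: threads on nearby cpus land on the same
  // 128-byte-aligned slot, distant cpus land on different slots.  The
  // atomics stay thread-safe on their own; the spreader only reduces
  // contention, exactly as the class comment warns.
  struct ShardedCounter {
    static constexpr size_t kStripes = 16;
    struct alignas(128) PaddedCounter {
      std::atomic<uint64_t> value{0};
    };
    PaddedCounter slots[kStripes];

    void increment() {
      auto idx = folly::detail::AccessSpreader<>::current(kStripes);
      slots[idx].value.fetch_add(1, std::memory_order_relaxed);
    }

    uint64_t read() const {
      uint64_t sum = 0;
      for (const auto& slot : slots) {
        sum += slot.value.load(std::memory_order_relaxed);
      }
      return sum;
    }
  };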
- AccessSpreader<Atom> const& operator[](size_t index) const {
- return *static_cast<AccessSpreader<Atom> const*>(
- static_cast<void const*>(raw + index));
+template <>
+Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc();
+
+#define DECLARE_ACCESS_SPREADER_TYPE(Atom) \
+ namespace folly { \
+ namespace detail { \
+ template <> \
+ Getcpu::Func AccessSpreader<Atom>::getcpuFunc = \
+ AccessSpreader<Atom>::degenerateGetcpu; \
+ template <> \
+ typename AccessSpreader<Atom>::CompactStripe \
+ AccessSpreader<Atom>::widthAndCpuToStripe[129][128] = {}; \
+ template <> \
+ bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize(); \
+ } \
}
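The macro is meant to be invoked once per Atom in a source file; the
production instantiation is not shown in this excerpt, but it would
presumably be a single line such as:

  DECLARE_ACCESS_SPREADER_TYPE(std::atomic)

The test below does the analogous thing for its stub tags through
DECLARE_SPREADER_TAG.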
- private:
- // AccessSpreader uses sharedInstance
- friend AccessSpreader<Atom>;
-
- static AccessSpreaderArray<Atom, kMaxStripe> sharedInstance;
-
- /// aligned_storage is uninitialized, we use placement new since there
- /// is no AccessSpreader default constructor
- typename std::aligned_storage<sizeof(AccessSpreader<Atom>),
- CacheLocality::kFalseSharingRange>::type
- raw[kMaxStripe + 1];
-};
-}
-}
+} // namespace detail
+} // namespace folly
#endif /* FOLLY_DETAIL_CacheLocality_H_ */
TEST(Getcpu, VdsoGetcpu) {
unsigned cpu;
- Getcpu::vdsoFunc()(&cpu, nullptr, nullptr);
+ Getcpu::resolveVdsoFunc()(&cpu, nullptr, nullptr);
EXPECT_TRUE(cpu < CPU_SETSIZE);
}
return 0;
}
-TEST(AccessSpreader, Stubbed) {
- std::vector<std::unique_ptr<AccessSpreader<>>> spreaders(100);
- for (size_t s = 1; s < spreaders.size(); ++s) {
- spreaders[s].reset(
- new AccessSpreader<>(s, nonUniformExampleLocality, &testingGetcpu));
- }
- std::vector<size_t> cpusInLocalityOrder = {
- 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 6, 7, 22, 8, 23,
- 9, 24, 10, 25, 11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31};
- for (size_t i = 0; i < 32; ++i) {
- // extra i * 32 is to check wrapping behavior of impl
- testingCpu = cpusInLocalityOrder[i] + i * 64;
- for (size_t s = 1; s < spreaders.size(); ++s) {
- EXPECT_EQ((i * s) / 32, spreaders[s]->current())
- << "i=" << i << ", cpu=" << testingCpu << ", s=" << s;
- }
- }
-}
-
-TEST(AccessSpreader, Default) {
- AccessSpreader<> spreader(16);
- EXPECT_LT(spreader.current(), 16);
-}
-
-TEST(AccessSpreader, Shared) {
+TEST(AccessSpreader, Simple) {
for (size_t s = 1; s < 200; ++s) {
- EXPECT_LT(AccessSpreader<>::shared(s).current(), s);
+ EXPECT_LT(AccessSpreader<>::current(s), s);
}
}
-TEST(AccessSpreader, Statics) {
- LOG(INFO) << "stripeByCore.numStripes() = "
- << AccessSpreader<>::stripeByCore.numStripes();
- LOG(INFO) << "stripeByChip.numStripes() = "
- << AccessSpreader<>::stripeByChip.numStripes();
- for (size_t s = 1; s < 200; ++s) {
- EXPECT_LT(AccessSpreader<>::current(s), s);
+#define DECLARE_SPREADER_TAG(tag, locality, func) \
+ namespace { \
+ template <typename dummy> \
+ struct tag {}; \
+ } \
+ DECLARE_ACCESS_SPREADER_TYPE(tag) \
+ namespace folly { \
+ namespace detail { \
+ template <> \
+ const CacheLocality& CacheLocality::system<tag>() { \
+ static auto* inst = new CacheLocality(locality); \
+ return *inst; \
+ } \
+ template <> \
+ Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
+ return func; \
+ } \
+ } \
}
-}
+
+DECLARE_SPREADER_TAG(ManualTag, CacheLocality::uniform(16), testingGetcpu)
+DECLARE_SPREADER_TAG(
+ ThreadLocalTag,
+ CacheLocality::system<>(),
+ folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
+DECLARE_SPREADER_TAG(PthreadSelfTag,
+ CacheLocality::system<>(),
+ folly::detail::FallbackGetcpu<HashingThreadId>::getcpu)
TEST(AccessSpreader, Wrapping) {
// this test won't pass unless locality.numCpus divides kMaxCpus
- auto numCpus = 16;
- auto locality = CacheLocality::uniform(numCpus);
+ auto numCpus = CacheLocality::system<ManualTag>().numCpus;
+ EXPECT_EQ(0, 128 % numCpus);
for (size_t s = 1; s < 200; ++s) {
- AccessSpreader<> spreader(s, locality, &testingGetcpu);
for (size_t c = 0; c < 400; ++c) {
testingCpu = c;
- auto observed = spreader.current();
+ auto observed = AccessSpreader<ManualTag>::current(s);
testingCpu = c % numCpus;
- auto expected = spreader.current();
+ auto expected = AccessSpreader<ManualTag>::current(s);
EXPECT_EQ(expected, observed) << "numCpus=" << numCpus << ", s=" << s
<< ", c=" << c;
}
}
}
-// Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with
-// a 2.2Ghz Xeon
-// ============================================================================
-// folly/test/CacheLocalityTest.cpp relative time/iter iters/s
-// ============================================================================
-// LocalAccessSpreaderUse 20.77ns 48.16M
-// SharedAccessSpreaderUse 21.95ns 45.55M
-// AccessSpreaderConstruction 466.56ns 2.14M
-// ============================================================================
-
-BENCHMARK(LocalAccessSpreaderUse, iters) {
- folly::BenchmarkSuspender braces;
- AccessSpreader<> spreader(16);
- braces.dismiss();
-
- for (unsigned long i = 0; i < iters; ++i) {
- auto x = spreader.current();
- folly::doNotOptimizeAway(x);
- }
-}
-
-BENCHMARK(SharedAccessSpreaderUse, iters) {
+BENCHMARK(AccessSpreaderUse, iters) {
for (unsigned long i = 0; i < iters; ++i) {
auto x = AccessSpreader<>::current(16);
folly::doNotOptimizeAway(x);
}
}
-BENCHMARK(AccessSpreaderConstruction, iters) {
- std::aligned_storage<sizeof(AccessSpreader<>),
- std::alignment_of<AccessSpreader<>>::value>::type raw;
- for (unsigned long i = 0; i < iters; ++i) {
- auto x = new (&raw) AccessSpreader<>(16);
- folly::doNotOptimizeAway(x);
- x->~AccessSpreader();
- }
-}
-
-enum class SpreaderType { GETCPU, SHARED, TLS_RR, PTHREAD_SELF };
-
// Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly,
// if we don't separate the counters onto unique 128 byte stripes the
// 1_stripe and 2_stripe results are identical, even though the L3 is
// claimed to have 64 byte cache lines.
//
-// _stub means there was no call to getcpu or the tls round-robin
-// implementation, because for a single stripe the cpu doesn't matter.
-// _getcpu refers to the vdso getcpu implementation with a locally
-// constructed AccessSpreader. _tls_rr refers to execution using
-// SequentialThreadId, the fallback if the vdso getcpu isn't available.
-// _shared refers to calling AccessSpreader<>::current(numStripes)
-// inside the hot loop.
+// Getcpu refers to the vdso getcpu implementation. ThreadLocal refers
+// to execution using SequentialThreadId, the fallback if the vdso
+// getcpu isn't available. PthreadSelf hashes the value returned from
+// pthread_self() as a fallback-fallback for systems that don't have
+// thread-local support.
//
// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
-// so since the stripe selection is 21 nanos the atomic increments in
-// the L1 is ~15 nanos. At width 8_stripe_0_work the line is expected
+// so since the stripe selection is 12 nanos the atomic increments in
+// the L1 is ~17 nanos. At width 8_stripe_0_work the line is expected
// to ping-pong almost every operation, since the loops have the same
// duration. Widths 4 and 2 have the same behavior, but each tour of the
// cache line is 4 and 8 cores long, respectively. These all suggest a
// lower bound of 60 nanos for intra-chip handoff and increment between
// the L1s.
//
-// With 455 nanos (1K cycles) of busywork per contended increment, the
-// system can hide all of the latency of a tour of length 4, but not
-// quite one of length 8. I was a bit surprised at how much worse the
-// non-striped version got. It seems that the inter-chip traffic also
-// interferes with the L1-only localWork.load(). When the local work is
-// doubled to about 1 microsecond we see that the inter-chip contention
-// is still very important, but subdivisions on the same chip don't matter.
+// With 420 nanos of busywork per contended increment, the system can
+// hide all of the latency of a tour of length 4, but not quite one of
+// length 8. I was a bit surprised at how much worse the non-striped
+// version got. It seems that the inter-chip traffic also interferes
+// with the L1-only localWork.load(). When the local work is doubled
+// to about 1 microsecond we see that the inter-chip contention is still
+// very important, but subdivisions on the same chip don't matter.
//
-// sudo nice -n -20
-// _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
+// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
+// --benchmark --bm_min_iters=1000000
// ============================================================================
// folly/test/CacheLocalityTest.cpp relative time/iter iters/s
// ============================================================================
-// LocalAccessSpreaderUse 13.00ns 76.94M
-// SharedAccessSpreaderUse 13.04ns 76.66M
-// AccessSpreaderConstruction 366.00ns 2.73M
+// AccessSpreaderUse 11.94ns 83.79M
// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_0_work_stub) 891.04ns 1.12M
-// contentionAtWidth(2_stripe_0_work_getcpu) 403.45ns 2.48M
-// contentionAtWidth(4_stripe_0_work_getcpu) 198.02ns 5.05M
-// contentionAtWidth(8_stripe_0_work_getcpu) 90.54ns 11.04M
-// contentionAtWidth(16_stripe_0_work_getcpu) 31.21ns 32.04M
-// contentionAtWidth(32_stripe_0_work_getcpu) 29.15ns 34.31M
-// contentionAtWidth(64_stripe_0_work_getcpu) 32.41ns 30.86M
-// contentionAtWidth(2_stripe_0_work_tls_rr) 958.06ns 1.04M
-// contentionAtWidth(4_stripe_0_work_tls_rr) 494.31ns 2.02M
-// contentionAtWidth(8_stripe_0_work_tls_rr) 362.34ns 2.76M
-// contentionAtWidth(16_stripe_0_work_tls_rr) 231.37ns 4.32M
-// contentionAtWidth(32_stripe_0_work_tls_rr) 128.26ns 7.80M
-// contentionAtWidth(64_stripe_0_work_tls_rr) 115.08ns 8.69M
-// contentionAtWidth(2_stripe_0_work_pthread_self) 856.63ns 1.17M
-// contentionAtWidth(4_stripe_0_work_pthread_self) 623.43ns 1.60M
-// contentionAtWidth(8_stripe_0_work_pthread_self) 419.69ns 2.38M
-// contentionAtWidth(16_stripe_0_work_pthread_self 217.32ns 4.60M
-// contentionAtWidth(32_stripe_0_work_pthread_self 157.69ns 6.34M
-// contentionAtWidth(64_stripe_0_work_pthread_self 140.94ns 7.10M
-// contentionAtWidth(2_stripe_0_work_shared) 406.55ns 2.46M
-// contentionAtWidth(4_stripe_0_work_shared) 198.28ns 5.04M
-// contentionAtWidth(8_stripe_0_work_shared) 90.11ns 11.10M
-// contentionAtWidth(16_stripe_0_work_shared) 34.53ns 28.96M
-// contentionAtWidth(32_stripe_0_work_shared) 30.08ns 33.25M
-// contentionAtWidth(64_stripe_0_work_shared) 34.60ns 28.90M
-// atomicIncrBaseline(local_incr_0_work) 17.51ns 57.12M
+// contentionAtWidthGetcpu(1_stripe_0_work) 985.75ns 1.01M
+// contentionAtWidthGetcpu(2_stripe_0_work) 424.02ns 2.36M
+// contentionAtWidthGetcpu(4_stripe_0_work) 190.13ns 5.26M
+// contentionAtWidthGetcpu(8_stripe_0_work) 91.86ns 10.89M
+// contentionAtWidthGetcpu(16_stripe_0_work) 29.31ns 34.12M
+// contentionAtWidthGetcpu(32_stripe_0_work) 29.53ns 33.86M
+// contentionAtWidthGetcpu(64_stripe_0_work) 29.93ns 33.41M
+// contentionAtWidthThreadLocal(2_stripe_0_work) 609.21ns 1.64M
+// contentionAtWidthThreadLocal(4_stripe_0_work) 303.60ns 3.29M
+// contentionAtWidthThreadLocal(8_stripe_0_work) 246.57ns 4.06M
+// contentionAtWidthThreadLocal(16_stripe_0_work) 154.84ns 6.46M
+// contentionAtWidthThreadLocal(32_stripe_0_work) 24.14ns 41.43M
+// contentionAtWidthThreadLocal(64_stripe_0_work) 23.95ns 41.75M
+// contentionAtWidthPthreadSelf(2_stripe_0_work) 722.01ns 1.39M
+// contentionAtWidthPthreadSelf(4_stripe_0_work) 501.56ns 1.99M
+// contentionAtWidthPthreadSelf(8_stripe_0_work) 474.58ns 2.11M
+// contentionAtWidthPthreadSelf(16_stripe_0_work) 300.90ns 3.32M
+// contentionAtWidthPthreadSelf(32_stripe_0_work) 175.77ns 5.69M
+// contentionAtWidthPthreadSelf(64_stripe_0_work) 174.88ns 5.72M
+// atomicIncrBaseline(local_incr_0_work) 16.81ns 59.51M
// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_500_work_stub) 1.87us 534.36K
-// contentionAtWidth(2_stripe_500_work_getcpu) 542.31ns 1.84M
-// contentionAtWidth(4_stripe_500_work_getcpu) 409.18ns 2.44M
-// contentionAtWidth(8_stripe_500_work_getcpu) 511.05ns 1.96M
-// contentionAtWidth(16_stripe_500_work_getcpu) 399.14ns 2.51M
-// contentionAtWidth(32_stripe_500_work_getcpu) 399.05ns 2.51M
-// atomicIncrBaseline(local_incr_500_work) 399.41ns 2.50M
+// contentionAtWidthGetcpu(1_stripe_500_work) 1.82us 549.97K
+// contentionAtWidthGetcpu(2_stripe_500_work) 533.71ns 1.87M
+// contentionAtWidthGetcpu(4_stripe_500_work) 424.64ns 2.35M
+// contentionAtWidthGetcpu(8_stripe_500_work) 451.85ns 2.21M
+// contentionAtWidthGetcpu(16_stripe_500_work) 425.54ns 2.35M
+// contentionAtWidthGetcpu(32_stripe_500_work) 501.66ns 1.99M
+// atomicIncrBaseline(local_incr_500_work) 438.46ns 2.28M
// ----------------------------------------------------------------------------
-// contentionAtWidth(1_stripe_1000_work_stub) 1.90us 525.73K
-// contentionAtWidth(2_stripe_1000_work_getcpu) 792.91ns 1.26M
-// contentionAtWidth(4_stripe_1000_work_getcpu) 788.14ns 1.27M
-// contentionAtWidth(8_stripe_1000_work_getcpu) 794.16ns 1.26M
-// contentionAtWidth(16_stripe_1000_work_getcpu) 785.33ns 1.27M
-// contentionAtWidth(32_stripe_1000_work_getcpu) 786.56ns 1.27M
-// atomicIncrBaseline(local_incr_1000_work) 784.69ns 1.27M
+// contentionAtWidthGetcpu(1_stripe_1000_work) 1.88us 532.20K
+// contentionAtWidthGetcpu(2_stripe_1000_work) 824.62ns 1.21M
+// contentionAtWidthGetcpu(4_stripe_1000_work) 803.56ns 1.24M
+// contentionAtWidthGetcpu(8_stripe_1000_work) 926.65ns 1.08M
+// contentionAtWidthGetcpu(16_stripe_1000_work) 900.10ns 1.11M
+// contentionAtWidthGetcpu(32_stripe_1000_work) 890.75ns 1.12M
+// atomicIncrBaseline(local_incr_1000_work) 774.47ns 1.29M
// ============================================================================
-static void contentionAtWidth(size_t iters,
- size_t stripes,
- size_t work,
- SpreaderType spreaderType,
- size_t counterAlignment = 128,
- size_t numThreads = 32) {
- folly::BenchmarkSuspender braces;
-
- folly::detail::Getcpu::Func getcpuFunc = nullptr;
-
- if (spreaderType == SpreaderType::TLS_RR) {
- getcpuFunc =
- folly::detail::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu;
- }
- if (spreaderType == SpreaderType::PTHREAD_SELF) {
- getcpuFunc = folly::detail::FallbackGetcpu<HashingThreadId>::getcpu;
- }
+template <template <typename> class Tag>
+static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
+ const size_t counterAlignment = 128;
+ const size_t numThreads = 32;
- AccessSpreader<> spreader(
- stripes, CacheLocality::system<std::atomic>(), getcpuFunc);
+ folly::BenchmarkSuspender braces;
std::atomic<size_t> ready(0);
std::atomic<bool> go(false);
new (raw.data() + counterAlignment * i) std::atomic<size_t>();
}
- spreader.current();
ready++;
while (!go.load()) {
sched_yield();
}
std::atomic<int> localWork(0);
- if (spreaderType == SpreaderType::SHARED) {
- for (size_t i = iters; i > 0; --i) {
- ++*(counters[AccessSpreader<>::current(stripes)]);
- for (size_t j = work; j > 0; --j) {
- localWork.load();
- }
- }
- } else {
- for (size_t i = iters; i > 0; --i) {
- ++*(counters[spreader.current()]);
- for (size_t j = work; j > 0; --j) {
- localWork.load();
- }
+ for (size_t i = iters; i > 0; --i) {
+ ++*(counters[AccessSpreader<Tag>::current(stripes)]);
+ for (size_t j = work; j > 0; --j) {
+ localWork.load();
}
}
}));
if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
// create a few dummy threads to wrap back around to 0 mod numCpus
for (size_t i = threads.size(); i != numThreads; ++i) {
- std::thread([&]() { spreader.current(); }).join();
+ std::thread([&]() { AccessSpreader<Tag>::current(stripes); }).join();
}
}
}
}
}
-BENCHMARK_DRAW_LINE()
+static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
+ contentionAtWidth<std::atomic>(iters, stripes, work);
+}
+
+static void contentionAtWidthThreadLocal(size_t iters,
+ size_t stripes,
+ size_t work) {
+ contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
+}
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 1_stripe_0_work_stub, 1, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 2_stripe_0_work_getcpu, 2, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 4_stripe_0_work_getcpu, 4, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 8_stripe_0_work_getcpu, 8, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 16_stripe_0_work_getcpu, 16, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 32_stripe_0_work_getcpu, 32, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 64_stripe_0_work_getcpu, 64, 0, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 2_stripe_0_work_tls_rr, 2, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 4_stripe_0_work_tls_rr, 4, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 8_stripe_0_work_tls_rr, 8, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 16_stripe_0_work_tls_rr, 16, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 32_stripe_0_work_tls_rr, 32, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 64_stripe_0_work_tls_rr, 64, 0, SpreaderType::TLS_RR)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
- 2_stripe_0_work_pthread_self,
- 2,
- 0,
- SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
- 4_stripe_0_work_pthread_self,
- 4,
- 0,
- SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
- 8_stripe_0_work_pthread_self,
- 8,
- 0,
- SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
- 16_stripe_0_work_pthread_self,
- 16,
- 0,
- SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
- 32_stripe_0_work_pthread_self,
- 32,
- 0,
- SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
- 64_stripe_0_work_pthread_self,
- 64,
- 0,
- SpreaderType::PTHREAD_SELF)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 2_stripe_0_work_shared, 2, 0, SpreaderType::SHARED)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 4_stripe_0_work_shared, 4, 0, SpreaderType::SHARED)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 8_stripe_0_work_shared, 8, 0, SpreaderType::SHARED)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 16_stripe_0_work_shared, 16, 0, SpreaderType::SHARED)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 32_stripe_0_work_shared, 32, 0, SpreaderType::SHARED)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 64_stripe_0_work_shared, 64, 0, SpreaderType::SHARED)
+static void contentionAtWidthPthreadSelf(size_t iters,
+ size_t stripes,
+ size_t work) {
+ contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
+}
+
+BENCHMARK_DRAW_LINE()
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
+BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
BENCHMARK_DRAW_LINE()
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 1_stripe_500_work_stub, 1, 500, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 2_stripe_500_work_getcpu, 2, 500, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 4_stripe_500_work_getcpu, 4, 500, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 8_stripe_500_work_getcpu, 8, 500, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 16_stripe_500_work_getcpu, 16, 500, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 32_stripe_500_work_getcpu, 32, 500, SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
BENCHMARK_DRAW_LINE()
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 1_stripe_1000_work_stub, 1, 1000, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 2_stripe_1000_work_getcpu, 2, 1000, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 4_stripe_1000_work_getcpu, 4, 1000, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(
- contentionAtWidth, 8_stripe_1000_work_getcpu, 8, 1000, SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
- 16_stripe_1000_work_getcpu,
- 16,
- 1000,
- SpreaderType::GETCPU)
-BENCHMARK_NAMED_PARAM(contentionAtWidth,
- 32_stripe_1000_work_getcpu,
- 32,
- 1000,
- SpreaderType::GETCPU)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
+BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
int main(int argc, char** argv) {