throw std::runtime_error("unable to load cache sharing info");
}
- std::sort(cpus.begin(),
- cpus.end(),
- [&](size_t lhs, size_t rhs) -> bool {
- // sort first by equiv class of cache with highest index,
- // direction doesn't matter. If different cpus have
- // different numbers of caches then this code might produce
- // a sub-optimal ordering, but it won't crash
- auto& lhsEquiv = equivClassesByCpu[lhs];
- auto& rhsEquiv = equivClassesByCpu[rhs];
- for (ssize_t i = ssize_t(std::min(lhsEquiv.size(), rhsEquiv.size())) - 1;
- i >= 0;
- --i) {
- auto idx = size_t(i);
- if (lhsEquiv[idx] != rhsEquiv[idx]) {
- return lhsEquiv[idx] < rhsEquiv[idx];
- }
- }
-
- // break ties deterministically by cpu
- return lhs < rhs;
- });
+ std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
+ // sort first by equiv class of cache with highest index,
+ // direction doesn't matter. If different cpus have
+ // different numbers of caches then this code might produce
+ // a sub-optimal ordering, but it won't crash
+ auto& lhsEquiv = equivClassesByCpu[lhs];
+ auto& rhsEquiv = equivClassesByCpu[rhs];
+ for (ssize_t i = ssize_t(std::min(lhsEquiv.size(), rhsEquiv.size())) - 1;
+ i >= 0;
+ --i) {
+ auto idx = size_t(i);
+ if (lhsEquiv[idx] != rhsEquiv[idx]) {
+ return lhsEquiv[idx] < rhsEquiv[idx];
+ }
+ }
+
+ // break ties deterministically by cpu
+ return lhs < rhs;
+ });
// the cpus are now sorted by locality: entries that are adjacent in the
// vector share more levels of cache than entries that are far apart. For
// striping we want perfect L1 spreading in a system with hyperthreading
// enabled.
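// A minimal standalone sketch of the ordering the comparator above produces,
// using hypothetical equivalence-class data for a 4-cpu, 2-core box with
// hyperthreading (every name below is illustrative, not folly's):
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // equivClasses[cpu][cacheIndex]: index 0 is the L1 class, the last index
  // is the outermost shared cache. Hyperthread siblings {0,2} and {1,3}
  // share an L1; every cpu shares the LLC.
  std::vector<std::vector<size_t>> equivClasses = {
      {0, 0, 0}, // cpu 0
      {1, 1, 0}, // cpu 1
      {0, 0, 0}, // cpu 2, sibling of cpu 0
      {1, 1, 0}, // cpu 3, sibling of cpu 1
  };
  std::vector<size_t> cpus = {0, 1, 2, 3};
  std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) {
    auto& a = equivClasses[lhs];
    auto& b = equivClasses[rhs];
    // compare from the outermost cache inward, as the lambda above does
    for (size_t i = std::min(a.size(), b.size()); i-- > 0;) {
      if (a[i] != b[i]) {
        return a[i] < b[i];
      }
    }
    return lhs < rhs; // deterministic tie-break by cpu
  });
  // prints "0 2 1 3": L1 siblings end up adjacent, so striping across the
  // sorted list touches distinct L1 caches before it reuses one
  for (auto c : cpus) {
    std::printf("%zu ", c);
  }
  std::printf("\n");
  return 0;
}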
struct CacheLocality {
-
/// 1 more than the maximum value that can be returned from sched_getcpu
/// or getcpu. This is the number of hardware thread contexts provided
/// by the processors
#ifdef FOLLY_TLS
template <template <typename> class Atom>
struct SequentialThreadId {
-
/// Returns the thread id assigned to the current thread
static unsigned get() {
auto rv = currentId;
/// all of the time.
template <template <typename> class Atom = std::atomic>
struct AccessSpreader {
-
/// Returns the stripe associated with the current CPU. The returned
/// value will be < numStripes.
static size_t current(size_t numStripes) {
unsigned cpu;
getcpuFunc(&cpu, nullptr, nullptr);
- return widthAndCpuToStripe[std::min(size_t(kMaxCpus),
- numStripes)][cpu % kMaxCpus];
+ return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
+ [cpu % kMaxCpus];
}
private:
typedef uint8_t CompactStripe;
- static_assert((kMaxCpus & (kMaxCpus - 1)) == 0,
- "kMaxCpus should be a power of two so modulo is fast");
- static_assert(kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
- "stripeByCpu element type isn't wide enough");
+ static_assert(
+ (kMaxCpus & (kMaxCpus - 1)) == 0,
+ "kMaxCpus should be a power of two so modulo is fast");
+ static_assert(
+ kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
+ "stripeByCpu element type isn't wide enough");
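// Why the power-of-two requirement matters: "cpu % kMaxCpus" can be lowered
// to a single bitwise mask. A minimal compile-time check, assuming the usual
// kMaxCpus value of 128 purely for illustration:
static_assert((259 % 128) == (259 & 127), "modulo equals mask for powers of two");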
/// Points to the getcpu-like function we are using to obtain the
/// current cpu. It should not be assumed that the returned cpu value
}
}
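// A sketch of how a caller might use current() to reduce contention, given
// the AccessSpreader declared above. ShardedCounter and kShards are
// illustrative names, and the include path / namespace vary across folly
// versions, so treat this as an assumption rather than folly API.
#include <folly/detail/CacheLocality.h>

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>

struct ShardedCounter {
  static constexpr size_t kShards = 16;

  void add(uint64_t n) {
    // threads on nearby cpus map to the same stripe, so the hot cache line
    // stays local; threads on distant cpus get different stripes
    auto stripe = folly::detail::AccessSpreader<>::current(kShards);
    shards_[stripe].value.fetch_add(n, std::memory_order_relaxed);
  }

  uint64_t read() const {
    uint64_t sum = 0;
    for (auto& s : shards_) {
      sum += s.value.load(std::memory_order_relaxed);
    }
    return sum;
  }

 private:
  // pad each shard to its own cache line so the stripes don't false-share
  struct alignas(128) Shard {
    std::atomic<uint64_t> value{0};
  };
  std::array<Shard, kShards> shards_;
};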
-static void atomicIncrBaseline(size_t iters,
- size_t work,
- size_t numThreads = 32) {
+static void
+atomicIncrBaseline(size_t iters, size_t work, size_t numThreads = 32) {
folly::BenchmarkSuspender braces;
std::atomic<bool> go(false);
contentionAtWidth<std::atomic>(iters, stripes, work);
}
-static void contentionAtWidthThreadLocal(size_t iters,
- size_t stripes,
- size_t work) {
+static void
+contentionAtWidthThreadLocal(size_t iters, size_t stripes, size_t work) {
contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
}
-static void contentionAtWidthPthreadSelf(size_t iters,
- size_t stripes,
- size_t work) {
+static void
+contentionAtWidthPthreadSelf(size_t iters, size_t stripes, size_t work) {
contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
}
#include <folly/portability/GTest.h>
+#include <glog/logging.h>
#include <memory>
#include <thread>
#include <type_traits>
#include <unordered_map>
-#include <glog/logging.h>
using namespace folly;
{"/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified"}};
/// This is the expected CacheLocality structure for fakeSysfsTree
-static const CacheLocality nonUniformExampleLocality = {32,
- {16, 16, 2},
- {0,
- 2,
- 4,
- 6,
- 8,
- 10,
- 11,
- 12,
- 14,
- 16,
- 18,
- 20,
- 22,
- 24,
- 26,
- 28,
- 30,
- 1,
- 3,
- 5,
- 7,
- 9,
- 13,
- 15,
- 17,
- 19,
- 21,
- 23,
- 25,
- 27,
- 29,
- 31}};
+static const CacheLocality nonUniformExampleLocality = {
+ 32,
+ {16, 16, 2},
+ {0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28,
+ 30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}};
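// The three initializers above follow CacheLocality's field order (read off
// the aggregate init, so treat as an assumption): 32 cpus; cache counts per
// level from L1 outward (16 L1s, 16 L2s, 2 shared L3s); and
// localityIndexByCpu, giving each cpu's position in the locality-sorted
// order that the striping code uses.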
TEST(CacheLocality, FakeSysfs) {
auto parsed = CacheLocality::readFromSysfsTree([](std::string name) {
auto observed = AccessSpreader<ManualTag>::current(s);
testingCpu = c % numCpus;
auto expected = AccessSpreader<ManualTag>::current(s);
- EXPECT_EQ(expected, observed) << "numCpus=" << numCpus << ", s=" << s
- << ", c=" << c;
+ EXPECT_EQ(expected, observed)
+ << "numCpus=" << numCpus << ", s=" << s << ", c=" << c;
}
}
}