From: Giuseppe Ottaviano
Date: Wed, 28 Jun 2017 18:09:42 +0000 (-0700)
Subject: Move CacheLocality out of detail/ and into concurrency/
X-Git-Tag: v2017.07.03.00~22
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=05ce52289b0ec8f525a92d6d1955301d0b77c0a7;p=folly.git

Move CacheLocality out of detail/ and into concurrency/

Summary: There's no reason these utilities should only be used by folly.

Reviewed By: mzlee

Differential Revision: D5317894

fbshipit-source-id: 5a9bdf4c5efaa5bcbe78e6723a03a468f2fe5e32
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4bfa9974..7d476c5c 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -292,6 +292,8 @@ if (BUILD_TESTS)
   apply_folly_compile_options_to_target(folly_test_support)

   folly_define_tests(
+    DIRECTORY concurrency/
+      TEST cache_locality_test SOURCES CacheLocalityTest.cpp
     DIRECTORY experimental/test/
       TEST autotimer_test SOURCES AutoTimerTest.cpp
       TEST bits_test_2 SOURCES BitsTest.cpp
@@ -467,7 +469,6 @@ if (BUILD_TESTS)
       TEST baton_test SOURCES BatonTest.cpp
       TEST bit_iterator_test SOURCES BitIteratorTest.cpp
       TEST bits_test SOURCES BitsTest.cpp
-      TEST cache_locality_test SOURCES CacheLocalityTest.cpp
       TEST cacheline_padded_test SOURCES CachelinePaddedTest.cpp
       TEST call_once_test SOURCES CallOnceTest.cpp
       TEST checksum_test SOURCES ChecksumTest.cpp
diff --git a/folly/IndexedMemPool.h b/folly/IndexedMemPool.h
index 275d441c..6b3fa53a 100644
--- a/folly/IndexedMemPool.h
+++ b/folly/IndexedMemPool.h
@@ -16,14 +16,16 @@
 #pragma once

-#include
 #include
 #include
 #include
+
+#include
+
 #include
 #include
 #include
-#include
+#include
 #include
 #include
@@ -497,7 +499,7 @@ struct IndexedMemPool : boost::noncopyable {
   }

   AtomicStruct& localHead() {
-    auto stripe = detail::AccessSpreader::current(NumLocalLists);
+    auto stripe = AccessSpreader::current(NumLocalLists);
     return local_[stripe].head;
   }
diff --git a/folly/LifoSem.h b/folly/LifoSem.h
index f6b7bf02..9c0404fe 100644
--- a/folly/LifoSem.h
+++ b/folly/LifoSem.h
@@ -27,7 +27,7 @@
 #include
 #include
 #include
-#include
+#include

 namespace folly {
@@ -515,9 +515,7 @@ struct LifoSemBase {
   FOLLY_ALIGN_TO_AVOID_FALSE_SHARING folly::AtomicStruct head_;

-  char padding_[folly::detail::CacheLocality::kFalseSharingRange -
-                sizeof(LifoSemHead)];
-
+  char padding_[folly::CacheLocality::kFalseSharingRange - sizeof(LifoSemHead)];

   static LifoSemNode& idxToNode(uint32_t idx) {
     auto raw = &LifoSemRawNode::pool()[idx];
diff --git a/folly/MPMCQueue.h b/folly/MPMCQueue.h
index b0cfc46f..0e921060 100644
--- a/folly/MPMCQueue.h
+++ b/folly/MPMCQueue.h
@@ -25,7 +25,7 @@
 #include
 #include
-#include
+#include

 #include
 #include
@@ -647,11 +647,11 @@ class MPMCQueueBase> : boost::noncopyable {
     }

     // ideally this would be a static assert, but g++ doesn't allow it
-    assert(alignof(MPMCQueue)
-           >= detail::CacheLocality::kFalseSharingRange);
-    assert(static_cast(static_cast(&popTicket_)) -
-           static_cast(static_cast(&pushTicket_))
-           >= detail::CacheLocality::kFalseSharingRange);
+    assert(alignof(MPMCQueue) >= CacheLocality::kFalseSharingRange);
+    assert(
+        static_cast(static_cast(&popTicket_)) -
+        static_cast(static_cast(&pushTicket_)) >=
+        CacheLocality::kFalseSharingRange);
   }

   /// A default-constructed queue is useful because a usable (non-zero
@@ -971,8 +971,7 @@ class MPMCQueueBase> : boost::noncopyable {
     /// To avoid false sharing in slots_ with neighboring memory
     /// allocations, we pad it with this many SingleElementQueue-s at
     /// each end
-    kSlotPadding = (detail::CacheLocality::kFalseSharingRange - 1)
-        / sizeof(Slot) + 1
+    kSlotPadding = (CacheLocality::kFalseSharingRange - 1) / sizeof(Slot) + 1
   };

   /// The maximum number of items in the queue at once
@@ -1024,8 +1023,7 @@ class MPMCQueueBase> : boost::noncopyable {

   /// Alignment doesn't prevent false sharing at the end of the struct,
   /// so fill out the last cache line
-  char padding_[detail::CacheLocality::kFalseSharingRange -
-                sizeof(Atom)];
+  char padding_[CacheLocality::kFalseSharingRange - sizeof(Atom)];

   /// We assign tickets in increasing order, but we don't want to
   /// access neighboring elements of slots_ because that will lead to
diff --git a/folly/Makefile.am b/folly/Makefile.am
index 74977332..acc1c0bd 100644
--- a/folly/Makefile.am
+++ b/folly/Makefile.am
@@ -56,12 +56,12 @@ nobase_follyinclude_HEADERS = \
 	CppAttributes.h \
 	CpuId.h \
 	CPortability.h \
+	concurrency/CacheLocality.h \
 	concurrency/CoreCachedSharedPtr.h \
 	detail/AtomicHashUtils.h \
 	detail/AtomicUnorderedMapUtils.h \
 	detail/AtomicUtils.h \
 	detail/BitIteratorDetail.h \
-	detail/CacheLocality.h \
 	detail/CachelinePaddedImpl.h \
 	detail/ChecksumDetail.h \
 	detail/DiscriminatedPtrDetail.h \
@@ -459,7 +459,7 @@ libfolly_la_SOURCES = \
 	Assume.cpp \
 	Checksum.cpp \
 	ClockGettimeWrappers.cpp \
-	detail/CacheLocality.cpp \
+	concurrency/CacheLocality.cpp \
 	detail/IPAddress.cpp \
 	dynamic.cpp \
 	ExceptionWrapper.cpp \
diff --git a/folly/ProducerConsumerQueue.h b/folly/ProducerConsumerQueue.h
index d0bf3ec8..12f2bf42 100644
--- a/folly/ProducerConsumerQueue.h
+++ b/folly/ProducerConsumerQueue.h
@@ -27,7 +27,7 @@
 #include
 #include
-#include
+#include

 namespace folly {
@@ -168,14 +168,14 @@ struct ProducerConsumerQueue {
   }

  private:
-  char pad0_[detail::CacheLocality::kFalseSharingRange];
-  const uint32_t size_;
-  T* const records_;
+  char pad0_[CacheLocality::kFalseSharingRange];
+  const uint32_t size_;
+  T* const records_;

-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic readIndex_;
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic writeIndex_;
+  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic readIndex_;
+  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic writeIndex_;

-  char pad1_[detail::CacheLocality::kFalseSharingRange - sizeof(writeIndex_)];
+  char pad1_[CacheLocality::kFalseSharingRange - sizeof(writeIndex_)];
 };

 }
diff --git a/folly/SharedMutex.h b/folly/SharedMutex.h
index c13a6d6f..24d8051c 100644
--- a/folly/SharedMutex.h
+++ b/folly/SharedMutex.h
@@ -19,11 +19,13 @@
 #pragma once

 #include
+
 #include
 #include
 #include
+
 #include
-#include
+#include
 #include
 #include
 #include
@@ -1417,8 +1419,7 @@ bool SharedMutexImpl::
     // starting point for our empty-slot search, can change after
     // calling waitForZeroBits
     uint32_t bestSlot =
-        (uint32_t)folly::detail::AccessSpreader::current(
-            kMaxDeferredReaders);
+        (uint32_t)folly::AccessSpreader::current(kMaxDeferredReaders);

     // deferred readers are already enabled, or it is time to
     // enable them if we can find a slot
diff --git a/folly/TokenBucket.h b/folly/TokenBucket.h
index d88bcd86..905b0f9f 100644
--- a/folly/TokenBucket.h
+++ b/folly/TokenBucket.h
@@ -21,7 +21,7 @@
 #include
 #include
-#include
+#include

 namespace folly {
diff --git a/folly/concurrency/CacheLocality.cpp b/folly/concurrency/CacheLocality.cpp
new file mode 100644
index 00000000..36b77b83
--- /dev/null
+++ b/folly/concurrency/CacheLocality.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/concurrency/CacheLocality.h>
+
+#ifndef _MSC_VER
+#define _GNU_SOURCE 1 // for RTLD_NOLOAD
+#include <dlfcn.h>
+#endif
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace folly {
+
+///////////// CacheLocality
+
+/// Returns the best real CacheLocality information available
+static CacheLocality getSystemLocalityInfo() {
+#ifdef __linux__
+  try {
+    return CacheLocality::readFromSysfs();
+  } catch (...) {
+    // keep trying
+  }
+#endif
+
+  long numCpus = sysconf(_SC_NPROCESSORS_CONF);
+  if (numCpus <= 0) {
+    // This shouldn't happen, but if it does we should try to keep
+    // going.  We are probably not going to be able to parse /sys on
+    // this box either (although we will try), which means we are going
+    // to fall back to the SequentialThreadId splitter.  On my 16 core
+    // (x hyperthreading) dev box 16 stripes is enough to get pretty good
+    // contention avoidance with SequentialThreadId, and there is little
+    // improvement from going from 32 to 64.  This default gives us some
+    // wiggle room
+    numCpus = 32;
+  }
+  return CacheLocality::uniform(size_t(numCpus));
+}
+
+template <>
+const CacheLocality& CacheLocality::system<std::atomic>() {
+  static auto* cache = new CacheLocality(getSystemLocalityInfo());
+  return *cache;
+}
+
+// Each level of cache has sharing sets, which are the set of cpus
+// that share a common cache at that level.  These are available in a
+// hex bitset form (/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_map,
+// for example).  They are also available in a human-readable list form,
+// as in /sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list.  The
+// list is a comma-separated list of numbers and ranges, where the ranges
+// are a pair of decimal numbers separated by a '-'.
+//
+// To sort the cpus for optimum locality we don't really need to parse
+// the sharing sets; we just need a unique representative from each
+// equivalence class.  The smallest value works fine, and happens to be
+// the first decimal number in the file.  We load all of the equivalence
+// class information from all of the cpu*/cache/index* directories, order
+// the cpus first by increasing last-level cache equivalence class, then
+// by the smaller caches.  Finally, we break ties with the cpu number itself.
+
+/// Returns the first decimal number in the string, or throws an exception
+/// if the string does not start with a number terminated by ',', '-',
+/// '\n', or eos.
+static size_t parseLeadingNumber(const std::string& line) {
+  auto raw = line.c_str();
+  char* end;
+  unsigned long val = strtoul(raw, &end, 10);
+  if (end == raw || (*end != ',' && *end != '-' && *end != '\n' && *end != 0)) {
+    throw std::runtime_error(
+        to<std::string>("error parsing list '", line, "'").c_str());
+  }
+  return val;
+}
+
+CacheLocality CacheLocality::readFromSysfsTree(
+    const std::function<std::string(std::string)>& mapping) {
+  // number of equivalence classes per level
+  std::vector<size_t> numCachesByLevel;
+
+  // the list of cache equivalence classes, where equivalence classes
+  // are named by the smallest cpu in the class
+  std::vector<std::vector<size_t>> equivClassesByCpu;
+
+  std::vector<size_t> cpus;
+
+  while (true) {
+    auto cpu = cpus.size();
+    std::vector<size_t> levels;
+    for (size_t index = 0;; ++index) {
+      auto dir =
+          sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
+      auto cacheType = mapping(dir + "type");
+      auto equivStr = mapping(dir + "shared_cpu_list");
+      if (cacheType.size() == 0 || equivStr.size() == 0) {
+        // no more caches
+        break;
+      }
+      if (cacheType[0] == 'I') {
+        // cacheType in { "Data", "Instruction", "Unified" }.  skip icache
+        continue;
+      }
+      auto equiv = parseLeadingNumber(equivStr);
+      auto level = levels.size();
+      levels.push_back(equiv);
+
+      if (equiv == cpu) {
+        // we only want to count the equiv classes once, so we do it when
+        // we first encounter them
+        while (numCachesByLevel.size() <= level) {
+          numCachesByLevel.push_back(0);
+        }
+        numCachesByLevel[level]++;
+      }
+    }
+
+    if (levels.size() == 0) {
+      // no levels at all for this cpu, we must be done
+      break;
+    }
+    equivClassesByCpu.emplace_back(std::move(levels));
+    cpus.push_back(cpu);
+  }
+
+  if (cpus.size() == 0) {
+    throw std::runtime_error("unable to load cache sharing info");
+  }
+
+  std::sort(cpus.begin(),
+            cpus.end(),
+            [&](size_t lhs, size_t rhs) -> bool {
+              // sort first by equiv class of cache with highest index,
+              // direction doesn't matter.  If different cpus have
+              // different numbers of caches then this code might produce
+              // a sub-optimal ordering, but it won't crash
+              auto& lhsEquiv = equivClassesByCpu[lhs];
+              auto& rhsEquiv = equivClassesByCpu[rhs];
+              for (ssize_t i =
+                       ssize_t(std::min(lhsEquiv.size(), rhsEquiv.size())) - 1;
+                   i >= 0;
+                   --i) {
+                auto idx = size_t(i);
+                if (lhsEquiv[idx] != rhsEquiv[idx]) {
+                  return lhsEquiv[idx] < rhsEquiv[idx];
+                }
+              }
+
+              // break ties deterministically by cpu
+              return lhs < rhs;
+            });
+
+  // the cpus are now sorted by locality, with neighboring entries closer
+  // to each other than entries that are far away.
+  // For striping we want the inverse map, since we are starting with the cpu
+  std::vector<size_t> indexes(cpus.size());
+  for (size_t i = 0; i < cpus.size(); ++i) {
+    indexes[cpus[i]] = i;
+  }
+
+  return CacheLocality{
+      cpus.size(), std::move(numCachesByLevel), std::move(indexes)};
+}
+
+CacheLocality CacheLocality::readFromSysfs() {
+  return readFromSysfsTree([](std::string name) {
+    std::ifstream xi(name.c_str());
+    std::string rv;
+    std::getline(xi, rv);
+    return rv;
+  });
+}
+
+CacheLocality CacheLocality::uniform(size_t numCpus) {
+  CacheLocality rv;
+
+  rv.numCpus = numCpus;
+
+  // one cache shared by all cpus
+  rv.numCachesByLevel.push_back(numCpus);
+
+  // no permutations in locality index mapping
+  for (size_t cpu = 0; cpu < numCpus; ++cpu) {
+    rv.localityIndexByCpu.push_back(cpu);
+  }
+
+  return rv;
+}
+
+////////////// Getcpu
+
+Getcpu::Func Getcpu::resolveVdsoFunc() {
+#if !FOLLY_HAVE_LINUX_VDSO
+  return nullptr;
+#else
+  void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+  if (h == nullptr) {
+    return nullptr;
+  }
+
+  auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
+  if (func == nullptr) {
+    // technically a null result could either be a failure or a successful
+    // lookup of a symbol with the null value, but the second can't actually
+    // happen for this symbol.  No point holding the handle forever if
+    // we don't need the code
+    dlclose(h);
+  }
+
+  return func;
+#endif
+}
+
+#ifdef FOLLY_TLS
+/////////////// SequentialThreadId
+template struct SequentialThreadId<std::atomic>;
+#endif
+
+/////////////// AccessSpreader
+template struct AccessSpreader<std::atomic>;
+
+SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz)
+    : allocSize_{allocSize}, sz_(sz) {}
+
+SimpleAllocator::~SimpleAllocator() {
+  std::lock_guard<std::mutex> g(m_);
+  for (auto& block : blocks_) {
+    detail::aligned_free(block);
+  }
+}
+
+void* SimpleAllocator::allocateHard() {
+  // Allocate a new slab.
+  mem_ = static_cast<uint8_t*>(detail::aligned_malloc(allocSize_, allocSize_));
+  if (!mem_) {
+    std::__throw_bad_alloc();
+  }
+  end_ = mem_ + allocSize_;
+  blocks_.push_back(mem_);
+
+  // Install a pointer to ourselves as the allocator.
+  *reinterpret_cast<SimpleAllocator**>(mem_) = this;
+  static_assert(
+      alignof(std::max_align_t) >= sizeof(SimpleAllocator*),
+      "alignment too small");
+  mem_ += std::min(sz_, alignof(std::max_align_t));
+
+  // New allocation.
+  auto mem = mem_;
+  mem_ += sz_;
+  assert(intptr_t(mem) % 128 != 0);
+  return mem;
+}
+
+} // namespace folly
diff --git a/folly/concurrency/CacheLocality.h b/folly/concurrency/CacheLocality.h
new file mode 100644
index 00000000..be9d4410
--- /dev/null
+++ b/folly/concurrency/CacheLocality.h
@@ -0,0 +1,510 @@
+/*
+ * Copyright 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace folly {
+
+// This file contains several classes that might be useful if you are
+// trying to dynamically optimize cache locality: CacheLocality reads
+// cache sharing information from sysfs to determine how CPUs should be
+// grouped to minimize contention, Getcpu provides fast access to the
+// current CPU via __vdso_getcpu, and AccessSpreader uses these two to
+// optimally spread accesses among a predetermined number of stripes.
+//
+// AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is
+// substantially less than the cost of a cache miss.  This means that we
+// can effectively use it to reduce cache line ping-pong on striped data
+// structures such as IndexedMemPool or statistics counters.
+//
+// Because CacheLocality looks at all of the cache levels, it can be
+// used for different levels of optimization.  AccessSpreader(2) does
+// per-chip spreading on a dual socket system.  AccessSpreader(numCpus)
+// does perfect per-cpu spreading.  AccessSpreader(numCpus / 2) does
+// perfect L1 spreading in a system with hyperthreading enabled.
+
+struct CacheLocality {
+
+  /// 1 more than the maximum value that can be returned from sched_getcpu
+  /// or getcpu.  This is the number of hardware thread contexts provided
+  /// by the processors
+  size_t numCpus;
+
+  /// Holds the number of caches present at each cache level (0 is
+  /// the closest to the cpu).  This is the number of AccessSpreader
+  /// stripes needed to avoid cross-cache communication at the specified
+  /// layer.  numCachesByLevel.front() is the number of L1 caches and
+  /// numCachesByLevel.back() is the number of last-level caches.
+  std::vector<size_t> numCachesByLevel;
+
+  /// A map from cpu (from sched_getcpu or getcpu) to an index in the
+  /// range 0..numCpus-1, where neighboring locality indices are more
+  /// likely to share caches than indices far away.  All of the members
+  /// of a particular cache level will be contiguous in their locality
+  /// index.  For example, if numCpus is 32 and numCachesByLevel.back()
+  /// is 2, then cpus with a locality index < 16 will share one last-level
+  /// cache and cpus with a locality index >= 16 will share the other.
+  std::vector<size_t> localityIndexByCpu;
+
+  /// Returns the best CacheLocality information available for the current
+  /// system, cached for fast access.  This will be loaded from sysfs if
+  /// possible, otherwise it will be correct in the number of CPUs but
+  /// not in their sharing structure.
+  ///
+  /// If you are into yo dawgs, this is a shared cache of the local
+  /// locality of the shared caches.
+  ///
+  /// The template parameter here is used to allow injection of a
+  /// repeatable CacheLocality structure during testing.  Rather than
+  /// inject the type of the CacheLocality provider into every data type
+  /// that transitively uses it, all components select between the default
+  /// sysfs implementation and a deterministic implementation by keying
+  /// off the type of the underlying atomic.  See DeterministicScheduler.
+  template
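
As a usage illustration only (not part of this commit), the sketch below shows the pattern the header comment describes: a statistics counter striped across cache-line-padded slots, with AccessSpreader<>::current() picking a stripe close to the calling thread. The StripedCounter class name and the kStripes value are assumptions made for the example, not folly APIs.

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>

#include <folly/concurrency/CacheLocality.h>

class StripedCounter {
 public:
  void add(uint64_t n) {
    // current(kStripes) maps the executing cpu to one of kStripes slots;
    // threads that share a cache tend to land on the same slot, so the
    // hot cache line stays local instead of ping-ponging between cores.
    auto stripe = folly::AccessSpreader<>::current(kStripes);
    slots_[stripe].value.fetch_add(n, std::memory_order_relaxed);
  }

  uint64_t read() const {
    // A full read walks every stripe; it is expected to be the rare path.
    uint64_t sum = 0;
    for (const auto& slot : slots_) {
      sum += slot.value.load(std::memory_order_relaxed);
    }
    return sum;
  }

 private:
  static constexpr size_t kStripes = 16; // illustrative stripe count
  // Pad each slot out to kFalseSharingRange so neighboring stripes do not
  // share a cache line.
  struct alignas(folly::CacheLocality::kFalseSharingRange) Slot {
    std::atomic<uint64_t> value{0};
  };
  std::array<Slot, kStripes> slots_;
};

A fixed stripe count is used here only for brevity; numCachesByLevel from the system CacheLocality could instead be consulted to match the number of stripes to the number of L1 or last-level caches on the host, as the comment block above suggests.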