/*
 * Copyright 2014 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "folly/detail/CacheLocality.h"

#define _GNU_SOURCE 1 // for RTLD_NOLOAD
#include <dlfcn.h>

#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <stdexcept>
#include <string>
#include <unistd.h>

#include "folly/Conv.h"
#include "folly/Exception.h"
#include "folly/FileUtil.h"
#include "folly/Format.h"
#include "folly/ScopeGuard.h"
29 namespace folly { namespace detail {
///////////// CacheLocality
33 /// Returns the best real CacheLocality information available
34 static CacheLocality getSystemLocalityInfo() {
36 return CacheLocality::readFromSysfs();
41 long numCpus = sysconf(_SC_NPROCESSORS_CONF);
43 // This shouldn't happen, but if it does we should try to keep
44 // going. We are probably not going to be able to parse /sys on
45 // this box either (although we will try), which means we are going
46 // to fall back to the SequentialThreadId splitter. On my 16 core
47 // (x hyperthreading) dev box 16 stripes is enough to get pretty good
48 // contention avoidance with SequentialThreadId, and there is little
49 // improvement from going from 32 to 64. This default gives us some
53 return CacheLocality::uniform(numCpus);
57 const CacheLocality& CacheLocality::system<std::atomic>() {
58 static CacheLocality cache(getSystemLocalityInfo());
// Each level of cache has sharing sets, which are the set of cpus
// that share a common cache at that level.  These are available in a
// hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map,
// for example).  They are also available in a human-readable list form,
// as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list.  The list
// is a comma-separated list of numbers and ranges, where the ranges are
// a pair of decimal numbers separated by a '-'.
//
// To sort the cpus for optimum locality we don't really need to parse
// the sharing sets, we just need a unique representative from the
// equivalence class.  The smallest value works fine, and happens to be
// the first decimal number in the file.  We load all of the equivalence
// class information from all of the cpu*/index* directories, order the
// cpus first by increasing last-level cache equivalence class, then by
// the smaller caches.  Finally, we break ties with the cpu number itself.
/// Returns the first decimal number in the string, or throws a
/// std::runtime_error if the string does not start with a number
/// terminated by ',', '-', or '\n'.  (Note: end-of-string is NOT an
/// accepted terminator.)
static ssize_t parseLeadingNumber(const std::string& line) {
  auto raw = line.c_str();
  char* end;
  unsigned long val = strtoul(raw, &end, 10);
  if (end == raw || (*end != ',' && *end != '-' && *end != '\n')) {
    // same message as before, built with the standard library instead of
    // folly::to so the parse path has no formatting dependency
    throw std::runtime_error("error parsing list '" + line + "'");
  }
  return val;
}
92 CacheLocality CacheLocality::readFromSysfsTree(
93 const std::function<std::string(std::string)>& mapping) {
94 // number of equivalence classes per level
95 std::vector<size_t> numCachesByLevel;
97 // the list of cache equivalence classes, where equivalance classes
98 // are named by the smallest cpu in the class
99 std::vector<std::vector<size_t>> equivClassesByCpu;
101 std::vector<size_t> cpus;
104 auto cpu = cpus.size();
105 std::vector<size_t> levels;
106 for (size_t index = 0; ; ++index) {
107 auto dir = format("/sys/devices/system/cpu/cpu{}/cache/index{}/",
109 auto cacheType = mapping(dir + "type");
110 auto equivStr = mapping(dir + "shared_cpu_list");
111 if (cacheType.size() == 0 || equivStr.size() == 0) {
115 if (cacheType[0] == 'I') {
116 // cacheType in { "Data", "Instruction", "Unified" }. skip icache
119 auto equiv = parseLeadingNumber(equivStr);
120 auto level = levels.size();
121 levels.push_back(equiv);
124 // we only want to count the equiv classes once, so we do it when
125 // we first encounter them
126 while (numCachesByLevel.size() <= level) {
127 numCachesByLevel.push_back(0);
129 numCachesByLevel[level]++;
133 if (levels.size() == 0) {
134 // no levels at all for this cpu, we must be done
137 equivClassesByCpu.emplace_back(std::move(levels));
141 if (cpus.size() == 0) {
142 throw std::runtime_error("unable to load cache sharing info");
145 std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
146 // sort first by equiv class of cache with highest index, direction
147 // doesn't matter. If different cpus have different numbers of
148 // caches then this code might produce a sub-optimal ordering, but
150 auto& lhsEquiv = equivClassesByCpu[lhs];
151 auto& rhsEquiv = equivClassesByCpu[rhs];
152 for (int i = std::min(lhsEquiv.size(), rhsEquiv.size()) - 1; i >= 0; --i) {
153 if (lhsEquiv[i] != rhsEquiv[i]) {
154 return lhsEquiv[i] < rhsEquiv[i];
158 // break ties deterministically by cpu
162 // the cpus are now sorted by locality, with neighboring entries closer
163 // to each other than entries that are far away. For striping we want
164 // the inverse map, since we are starting with the cpu
165 std::vector<size_t> indexes(cpus.size());
166 for (int i = 0; i < cpus.size(); ++i) {
167 indexes[cpus[i]] = i;
170 return CacheLocality{
171 cpus.size(), std::move(numCachesByLevel), std::move(indexes) };
174 CacheLocality CacheLocality::readFromSysfs() {
175 return readFromSysfsTree([](std::string name) {
176 std::ifstream xi(name.c_str());
178 std::getline(xi, rv);
184 CacheLocality CacheLocality::uniform(size_t numCpus) {
187 rv.numCpus = numCpus;
189 // one cache shared by all cpus
190 rv.numCachesByLevel.push_back(numCpus);
192 // no permutations in locality index mapping
193 for (size_t cpu = 0; cpu < numCpus; ++cpu) {
194 rv.localityIndexByCpu.push_back(cpu);
////////////// Getcpu
202 /// Resolves the dynamically loaded symbol __vdso_getcpu, returning null
204 static Getcpu::Func loadVdsoGetcpu() {
205 void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
210 auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
211 if (func == nullptr) {
212 // technically a null result could either be a failure or a successful
213 // lookup of a symbol with the null value, but the second can't actually
214 // happen for this symbol. No point holding the handle forever if
215 // we don't need the code
222 Getcpu::Func Getcpu::vdsoFunc() {
223 static Func func = loadVdsoGetcpu();
/////////////// SequentialThreadId
230 std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0);
233 __thread size_t SequentialThreadId<std::atomic>::currentId(0);
/////////////// AccessSpreader
238 const AccessSpreader<std::atomic>
239 AccessSpreader<std::atomic>::stripeByCore(
240 CacheLocality::system<>().numCachesByLevel.front());
243 const AccessSpreader<std::atomic>
244 AccessSpreader<std::atomic>::stripeByChip(
245 CacheLocality::system<>().numCachesByLevel.back());
248 AccessSpreaderArray<std::atomic,128>
249 AccessSpreaderArray<std::atomic,128>::sharedInstance = {};
/// Always claims to be on CPU zero, node zero.  Matches the getcpu
/// signature so it can stand in where a real getcpu isn't needed;
/// either output pointer may be null.  Always returns 0 (success).
static int degenerateGetcpu(unsigned* cpu, unsigned* node, void* unused) {
  if (cpu != nullptr) {
    *cpu = 0;
  }
  if (node != nullptr) {
    *node = 0;
  }
  return 0;
}
263 Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) {
264 if (numStripes == 1) {
265 // there's no need to call getcpu if there is only one stripe.
266 // This should not be common, so we don't want to waste a test and
267 // branch in the main code path, but we might as well use a faster
269 return °enerateGetcpu;
271 auto best = Getcpu::vdsoFunc();
272 return best ? best : &SequentialThreadId<std::atomic>::getcpu;
276 } } // namespace folly::detail