2 * Copyright 2014 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/detail/CacheLocality.h>
19 #define _GNU_SOURCE 1 // for RTLD_NOLOAD
24 #include <folly/Conv.h>
25 #include <folly/Exception.h>
26 #include <folly/FileUtil.h>
27 #include <folly/Format.h>
28 #include <folly/ScopeGuard.h>
30 namespace folly { namespace detail {
32 ///////////// CacheLocality
34 /// Returns the best real CacheLocality information available
// Visible behavior: the first candidate is CacheLocality::readFromSysfs();
// the sysconf()/uniform() code further down is the fallback path.
// NOTE(review): the statements that route control from the first return to
// the fallback are not visible in this view, so the exact failure handling
// is unconfirmed.
35 static CacheLocality getSystemLocalityInfo() {
38 return CacheLocality::readFromSysfs();
40 // fall through to below if something goes wrong
// Fallback: ask the OS how many processors are configured.
44 long numCpus = sysconf(_SC_NPROCESSORS_CONF);
46 // This shouldn't happen, but if it does we should try to keep
47 // going. We are probably not going to be able to parse /sys on
48 // this box either (although we will try), which means we are going
49 // to fall back to the SequentialThreadId splitter. On my 16 core
50 // (x hyperthreading) dev box 16 stripes is enough to get pretty good
51 // contention avoidance with SequentialThreadId, and there is little
52 // improvement from going from 32 to 64. This default gives us some
// NOTE(review): sysconf() reports errors as -1 and numCpus is a signed long,
// while uniform() takes a size_t. The comment above appears to describe a
// default for that case -- confirm a guard is applied before this call.
56 return CacheLocality::uniform(numCpus);
// Accessor for the system CacheLocality, std::atomic specialization.
// The result of getSystemLocalityInfo() is stored in a function-local static,
// so it is computed when that static is first initialized.
// NOTE(review): the return statement is not visible in this view; presumably
// a reference to `cache` is returned -- confirm.
60 const CacheLocality& CacheLocality::system<std::atomic>() {
61 static CacheLocality cache(getSystemLocalityInfo());
65 // Each level of cache has sharing sets, which are the set of cpus
66 // that share a common cache at that level. These are available in a
67 // hex bitset form (/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_map,
68 // for example). They are also available in a human-readable list form,
69 // as in /sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list. The list
70 // is a comma-separated list of numbers and ranges, where the ranges are
71 // a pair of decimal numbers separated by a '-'.
73 // To sort the cpus for optimum locality we don't really need to parse
74 // the sharing sets, we just need a unique representative from the
75 // equivalence class. The smallest value works fine, and happens to be
76 // the first decimal number in the file. We load all of the equivalence
77 // class information from all of the cpu*/index* directories, order the
78 // cpus first by increasing last-level cache equivalence class, then by
79 // the smaller caches. Finally, we break ties with the cpu number itself.
/// Returns the first decimal number in the string.
///
/// The number must be followed by ',', '-', '\n', or the end of the string.
/// End-of-string has to be accepted because the sysfs reader in this file
/// fetches the line with std::getline, which drops the trailing newline, so
/// a sharing set that contains a single cpu arrives here as just "0".
///
/// @param line  contents of a shared_cpu_list style entry, e.g. "0-3,8-11"
/// @return the leading number
/// @throws std::runtime_error if the string does not start with a number, or
///         the number is followed by any character other than those above
static size_t parseLeadingNumber(const std::string& line) {
  const char* raw = line.c_str();
  char* end = nullptr;
  unsigned long val = strtoul(raw, &end, 10);
  // c_str() is NUL terminated, so *end is always readable, even when the
  // digits run to the very end of the string
  const bool terminated =
      *end == ',' || *end == '-' || *end == '\n' || *end == '\0';
  if (end == raw || !terminated) {
    // end == raw means strtoul consumed nothing, i.e. there was no number
    throw std::runtime_error("error parsing list '" + line + "'");
  }
  return val;
}
// Builds a CacheLocality from per-cpu cache descriptions obtained through
// `mapping`. `mapping` is called with paths of the form
//   /sys/devices/system/cpu/cpu<N>/cache/index<M>/type
//   /sys/devices/system/cpu/cpu<N>/cache/index<M>/shared_cpu_list
// and returns the text found there; empty results are tested for explicitly.
// Taking the reader as a parameter means the source of the data is supplied
// by the caller (readFromSysfs passes a real file reader).
//
// Returns a CacheLocality constructed from the cpu count, the per-level
// cache counts, and the cpu -> locality index permutation.
// Throws std::runtime_error when no cpu could be loaded; parseLeadingNumber
// can also throw std::runtime_error on malformed list text.
//
// NOTE(review): several statements of this function (loop headers, breaks,
// closing braces) are not visible in this view; the comments below describe
// only what the visible lines do.
95 CacheLocality CacheLocality::readFromSysfsTree(
96 const std::function<std::string(std::string)>& mapping) {
97 // number of equivalence classes per level
98 std::vector<size_t> numCachesByLevel;
100 // the list of cache equivalence classes, where equivalence classes
101 // are named by the smallest cpu in the class
102 std::vector<std::vector<size_t>> equivClassesByCpu;
// cpu numbers; this vector is what gets sorted by locality further down
104 std::vector<size_t> cpus;
// the cpu to probe is the number of cpus recorded so far
107 auto cpu = cpus.size();
// equivalence-class representative of each cache level kept for this cpu
108 std::vector<size_t> levels;
// walk index0, index1, ... ; the loop has no bound of its own
109 for (size_t index = 0; ; ++index) {
110 auto dir = format("/sys/devices/system/cpu/cpu{}/cache/index{}/",
112 auto cacheType = mapping(dir + "type");
113 auto equivStr = mapping(dir + "shared_cpu_list");
// either file yielding no text is checked for here
114 if (cacheType.size() == 0 || equivStr.size() == 0) {
118 if (cacheType[0] == 'I') {
119 // cacheType in { "Data", "Instruction", "Unified" }. skip icache
// the smallest cpu in the sharing set names the equivalence class
122 auto equiv = parseLeadingNumber(equivStr);
// position of this cache among the levels kept so far for this cpu
123 auto level = levels.size();
124 levels.push_back(equiv);
127 // we only want to count the equiv classes once, so we do it when
128 // we first encounter them
// grow the per-level counters so that index `level` exists
129 while (numCachesByLevel.size() <= level) {
130 numCachesByLevel.push_back(0);
132 numCachesByLevel[level]++;
136 if (levels.size() == 0) {
137 // no levels at all for this cpu, we must be done
140 equivClassesByCpu.emplace_back(std::move(levels));
144 if (cpus.size() == 0) {
145 throw std::runtime_error("unable to load cache sharing info");
148 std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
149 // sort first by equiv class of cache with highest index, direction
150 // doesn't matter. If different cpus have different numbers of
151 // caches then this code might produce a sub-optimal ordering, but
153 auto& lhsEquiv = equivClassesByCpu[lhs];
154 auto& rhsEquiv = equivClassesByCpu[rhs];
// compare from the highest common level downwards
// NOTE(review): the size_t result of std::min(...) - 1 is narrowed to int
// here; the signed index is what lets the loop stop at i < 0.
155 for (int i = std::min(lhsEquiv.size(), rhsEquiv.size()) - 1; i >= 0; --i) {
156 if (lhsEquiv[i] != rhsEquiv[i]) {
157 return lhsEquiv[i] < rhsEquiv[i];
161 // break ties deterministically by cpu
165 // the cpus are now sorted by locality, with neighboring entries closer
166 // to each other than entries that are far away. For striping we want
167 // the inverse map, since we are starting with the cpu
168 std::vector<size_t> indexes(cpus.size());
169 for (size_t i = 0; i < cpus.size(); ++i) {
// cpus[i] is the cpu placed at sorted position i, so store i under that cpu
170 indexes[cpus[i]] = i;
173 return CacheLocality{
174 cpus.size(), std::move(numCachesByLevel), std::move(indexes) };
// Loads locality information by handing readFromSysfsTree a reader that
// opens the named file with std::ifstream and reads its first line with
// std::getline. getline does not store the line terminator, so the text read
// into rv has no trailing '\n'.
// NOTE(review): the declaration of rv and the lambda's return statement are
// not visible in this view.
177 CacheLocality CacheLocality::readFromSysfs() {
178 return readFromSysfsTree([](std::string name) {
179 std::ifstream xi(name.c_str());
181 std::getline(xi, rv);
// Builds a synthetic CacheLocality for numCpus cpus without consulting the
// system: numCpus is recorded, a single cache level is added, and the
// cpu -> locality index mapping is the identity.
// NOTE(review): the declaration of rv and the return statement are not
// visible in this view.
187 CacheLocality CacheLocality::uniform(size_t numCpus) {
190 rv.numCpus = numCpus;
192 // one cache shared by all cpus
// NOTE(review): the value recorded for this single level is numCpus, i.e.
// numCpus caches at that level, which does not read as "one cache shared by
// all cpus" -- confirm which of the comment or the value is intended.
193 rv.numCachesByLevel.push_back(numCpus);
195 // no permutations in locality index mapping
196 for (size_t cpu = 0; cpu < numCpus; ++cpu) {
197 rv.localityIndexByCpu.push_back(cpu);
203 ////////////// Getcpu
205 #ifdef CLOCK_REALTIME_COARSE
// File-local state shared by Getcpu::vdsoFunc() and cachingVdsoGetcpu().
// gVdsoInitOnce is the std::call_once flag for the one-time symbol lookup
// done in Getcpu::vdsoFunc().
207 static std::once_flag gVdsoInitOnce;
// getcpu entry point resolved from the vdso; assigned in Getcpu::vdsoFunc()
// and tested against nullptr there
208 static Getcpu::Func gVdsoGetcpuFunc;
// clock function resolved from the vdso ("__vdso_clock_gettime_ns"); takes a
// clockid_t and returns an int64_t
209 static int64_t (*gVdsoGettimeNsFunc)(clockid_t);
// getcpu-compatible function that keeps the vdso getcpu result in a
// per-thread cache. The cached cpu in tls_cpu is refreshed through
// gVdsoGetcpuFunc only when gVdsoGettimeNsFunc(CLOCK_REALTIME_COARSE) returns
// a value different from the one remembered in tls_lastContextSwitchNanos.
// The variable names suggest that a change of the coarse clock is used as a
// stand-in for "a context switch may have happened".
// NOTE(review): how *cpu is filled in, how rv is checked, and what is
// returned are not visible in this view.
211 static int cachingVdsoGetcpu(unsigned* cpu, unsigned* unused_node,
212 void* unused_tcache) {
// per-thread cache: the last cpu reported by the vdso, and the coarse clock
// value that was current when it was stored
213 static __thread unsigned tls_cpu;
214 static __thread int64_t tls_lastContextSwitchNanos;
216 auto lastContextSwitchNanos = gVdsoGettimeNsFunc(CLOCK_REALTIME_COARSE);
217 if (tls_lastContextSwitchNanos != lastContextSwitchNanos) {
// clock value changed: query the real getcpu again; node and tcache are not
// requested (both passed as nullptr)
218 int rv = gVdsoGetcpuFunc(&tls_cpu, nullptr, nullptr);
222 tls_lastContextSwitchNanos = lastContextSwitchNanos;
229 /// Resolves the dynamically loaded symbol __vdso_getcpu and
230 /// __vdso_clock_gettime_ns, returning a pair of nulls on failure. Does a
231 /// little bit of probing to make sure that the __vdso_clock_gettime_ns
232 /// function isn't using the slow fallback path.
// NOTE(review): the declared return type is a single Getcpu::Func, not a
// pair. The visible code returns cachingVdsoGetcpu when gVdsoGetcpuFunc ends
// up non-null; the value returned otherwise is not visible in this view
// (pickGetcpuFunc treats a false-y result as "not available").
233 Getcpu::Func Getcpu::vdsoFunc() {
234 #ifdef CLOCK_REALTIME_COARSE
// the lookup and the probing below run once per process via std::call_once
235 std::call_once(gVdsoInitOnce, []{
// RTLD_NOLOAD: do not load anything, only obtain a handle if the object is
// already resident
236 void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
238 typedef int64_t (*GettimeNsFunc)(clockid_t);
// without a handle both lookups are skipped and the pointers stay null
240 auto getcpuFunc = Getcpu::Func(
241 !h ? nullptr : dlsym(h, "__vdso_getcpu"));
242 auto gettimeNsFunc = GettimeNsFunc(
243 !h ? nullptr : dlsym(h, "__vdso_clock_gettime_ns"));
245 bool coarseGettimeDetected = false;
246 if (gettimeNsFunc != nullptr) {
247 // The TLS cache of getcpu results only is an optimization if the
248 // __vdso_clock_gettime_ns implementation is fast and actually
249 // coarse. The slow fallback implementation is not coarse, so if
250 // we detect a coarse clock we are set. If CLOCK_REALTIME_COARSE
251 // has the right properties, then so long as there is no context
252 // switch between two calls the returned time will be identical.
253 // Dynamically verify this. An unlikely context switch while we're
254 // testing can lead to a false negative, but not a false positive,
255 // so we just run the test multiple times. This ensures that we
256 // will get two calls to gettimeNsFunc in a row with no intervening
258 auto prev = gettimeNsFunc(CLOCK_REALTIME_COARSE);
// at most 10 attempts; the loop stops as soon as a reading equals prev
259 for (int i = 0; i < 10 && !coarseGettimeDetected; ++i) {
260 auto next = gettimeNsFunc(CLOCK_REALTIME_COARSE);
261 coarseGettimeDetected = next == prev;
// the caching scheme needs both a getcpu symbol and a detected coarse clock
266 if (getcpuFunc == nullptr || !coarseGettimeDetected) {
267 // technically a null getcpuFunc could either be a failure or
268 // a successful lookup of a symbol with the null value, but the
269 // second can't actually happen for this symbol. No point holding
270 // the handle forever if we don't need the code
// publish the resolved entry points to the file-level globals
275 gVdsoGetcpuFunc = getcpuFunc;
276 gVdsoGettimeNsFunc = gettimeNsFunc;
// callers get the caching wrapper, never the raw vdso pointer
280 if (gVdsoGetcpuFunc != nullptr) {
281 return cachingVdsoGetcpu;
288 /////////////// SequentialThreadId
// Definitions of the static data members of SequentialThreadId<std::atomic>.
// prevId is an atomic counter that starts at 0.
291 std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0);
// currentId also starts at 0 and is declared with FOLLY_TLS (presumably
// thread-local storage -- confirm against the macro's definition).
294 FOLLY_TLS size_t SequentialThreadId<std::atomic>::currentId(0);
296 /////////////// AccessSpreader
// Definitions of the static data members of the std::atomic specializations.
// stripeByCore is constructed with the first entry of the system
// CacheLocality's numCachesByLevel.
299 const AccessSpreader<std::atomic>
300 AccessSpreader<std::atomic>::stripeByCore(
301 CacheLocality::system<>().numCachesByLevel.front());
// stripeByChip is constructed with the last entry of numCachesByLevel.
304 const AccessSpreader<std::atomic>
305 AccessSpreader<std::atomic>::stripeByChip(
306 CacheLocality::system<>().numCachesByLevel.back());
// sharedInstance for the 128-entry array specialization, initialized with
// empty braces.
309 AccessSpreaderArray<std::atomic,128>
310 AccessSpreaderArray<std::atomic,128>::sharedInstance = {};
312 /// Always claims to be on CPU zero, node zero
// Both output pointers are optional: each is tested against nullptr before
// use. The third parameter is named `unused`.
// NOTE(review): the stores through cpu/node and the return statement are not
// visible in this view.
313 static int degenerateGetcpu(unsigned* cpu, unsigned* node, void* unused) {
314 if (cpu != nullptr) {
317 if (node != nullptr) {
324 Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) {
325 if (numStripes == 1) {
326 // there's no need to call getcpu if there is only one stripe.
327 // This should not be common, so we don't want to waste a test and
328 // branch in the main code path, but we might as well use a faster
330 return °enerateGetcpu;
332 auto best = Getcpu::vdsoFunc();
333 return best ? best : &SequentialThreadId<std::atomic>::getcpu;
337 } } // namespace folly::detail