/*
 * Copyright 2014 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "folly/detail/CacheLocality.h"

#define _GNU_SOURCE 1 // for RTLD_NOLOAD
#include <dlfcn.h>

#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <stdexcept>
#include <string>
#include <unistd.h>

#include "folly/Conv.h"
#include "folly/Exception.h"
#include "folly/FileUtil.h"
#include "folly/Format.h"
#include "folly/ScopeGuard.h"
29 namespace folly { namespace detail {
///////////// CacheLocality
33 /// Returns the best real CacheLocality information available
34 static CacheLocality getSystemLocalityInfo() {
36 return CacheLocality::readFromSysfs();
41 long numCpus = sysconf(_SC_NPROCESSORS_CONF);
43 // This shouldn't happen, but if it does we should try to keep
44 // going. We are probably not going to be able to parse /sys on
45 // this box either (although we will try), which means we are going
46 // to fall back to the SequentialThreadId splitter. On my 16 core
47 // (x hyperthreading) dev box 16 stripes is enough to get pretty good
48 // contention avoidance with SequentialThreadId, and there is little
49 // improvement from going from 32 to 64. This default gives us some
53 return CacheLocality::uniform(numCpus);
57 const CacheLocality& CacheLocality::system<std::atomic>() {
58 static CacheLocality cache(getSystemLocalityInfo());
// Each level of cache has sharing sets, which are the set of cpus
// that share a common cache at that level.  These are available in a
// hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map,
// for example).  They are also available in a human-readable list form,
// as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list.  The list
// is a comma-separated list of numbers and ranges, where the ranges are
// a pair of decimal numbers separated by a '-'.
//
// To sort the cpus for optimum locality we don't really need to parse
// the sharing sets, we just need a unique representative from the
// equivalence class.  The smallest value works fine, and happens to be
// the first decimal number in the file.  We load all of the equivalence
// class information from all of the cpu*/index* directories, order the
// cpus first by increasing last-level cache equivalence class, then by
// the smaller caches.  Finally, we break ties with the cpu number itself.
/// Returns the first decimal number in the string, or throws a
/// std::runtime_error if the string does not start with a number
/// terminated by ',', '-', or '\n'.  (Note: end-of-string is NOT an
/// accepted terminator.)
static ssize_t parseLeadingNumber(const std::string& line) {
  auto raw = line.c_str();
  char* end;
  unsigned long val = strtoul(raw, &end, 10);
  if (end == raw || (*end != ',' && *end != '-' && *end != '\n')) {
    // same message as before, built with the standard library instead of
    // folly::to so the parse path has no formatting dependency
    throw std::runtime_error("error parsing list '" + line + "'");
  }
  return val;
}
92 CacheLocality CacheLocality::readFromSysfsTree(
93 const std::function<std::string(std::string)>& mapping) {
94 // number of equivalence classes per level
95 std::vector<size_t> numCachesByLevel;
97 // the list of cache equivalence classes, where equivalance classes
98 // are named by the smallest cpu in the class
99 std::vector<std::vector<size_t>> equivClassesByCpu;
101 std::vector<size_t> cpus;
104 auto cpu = cpus.size();
105 std::vector<size_t> levels;
106 for (size_t index = 0; ; ++index) {
107 auto dir = format("/sys/devices/system/cpu/cpu{}/cache/index{}/",
109 auto cacheType = mapping(dir + "type");
110 auto equivStr = mapping(dir + "shared_cpu_list");
111 if (cacheType.size() == 0 || equivStr.size() == 0) {
115 if (cacheType[0] == 'I') {
116 // cacheType in { "Data", "Instruction", "Unified" }. skip icache
119 auto equiv = parseLeadingNumber(equivStr);
120 auto level = levels.size();
121 levels.push_back(equiv);
124 // we only want to count the equiv classes once, so we do it when
125 // we first encounter them
126 while (numCachesByLevel.size() <= level) {
127 numCachesByLevel.push_back(0);
129 numCachesByLevel[level]++;
133 if (levels.size() == 0) {
134 // no levels at all for this cpu, we must be done
137 equivClassesByCpu.emplace_back(std::move(levels));
141 if (cpus.size() == 0) {
142 throw std::runtime_error("unable to load cache sharing info");
145 std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
146 // sort first by equiv class of cache with highest index, direction
147 // doesn't matter. If different cpus have different numbers of
148 // caches then this code might produce a sub-optimal ordering, but
150 auto& lhsEquiv = equivClassesByCpu[lhs];
151 auto& rhsEquiv = equivClassesByCpu[rhs];
152 for (int i = std::min(lhsEquiv.size(), rhsEquiv.size()) - 1; i >= 0; --i) {
153 if (lhsEquiv[i] != rhsEquiv[i]) {
154 return lhsEquiv[i] < rhsEquiv[i];
158 // break ties deterministically by cpu
162 // the cpus are now sorted by locality, with neighboring entries closer
163 // to each other than entries that are far away. For striping we want
164 // the inverse map, since we are starting with the cpu
165 std::vector<size_t> indexes(cpus.size());
166 for (int i = 0; i < cpus.size(); ++i) {
167 indexes[cpus[i]] = i;
170 return CacheLocality{
171 cpus.size(), std::move(numCachesByLevel), std::move(indexes) };
174 CacheLocality CacheLocality::readFromSysfs() {
175 return readFromSysfsTree([](std::string name) {
176 std::ifstream xi(name.c_str());
178 std::getline(xi, rv);
184 CacheLocality CacheLocality::uniform(size_t numCpus) {
187 rv.numCpus = numCpus;
189 // one cache shared by all cpus
190 rv.numCachesByLevel.push_back(numCpus);
192 // no permutations in locality index mapping
193 for (size_t cpu = 0; cpu < numCpus; ++cpu) {
194 rv.localityIndexByCpu.push_back(cpu);
////////////// Getcpu
202 /// Resolves the dynamically loaded symbol __vdso_getcpu, returning null
204 static Getcpu::Func loadVdsoGetcpu() {
205 void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
210 auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
211 if (func == nullptr) {
212 // technically a null result could either be a failure or a successful
213 // lookup of a symbol with the null value, but the second can't actually
214 // happen for this symbol. No point holding the handle forever if
215 // we don't need the code
222 Getcpu::Func Getcpu::vdsoFunc() {
223 static Func func = loadVdsoGetcpu();
/////////////// SequentialThreadId
230 std::atomic<size_t> SequentialThreadId<std::atomic>::prevId(0);
233 __thread size_t SequentialThreadId<std::atomic>::currentId(0);
/////////////// AccessSpreader
238 const AccessSpreader<std::atomic>
239 AccessSpreader<std::atomic>::stripeByCore(
240 CacheLocality::system<>().numCachesByLevel.front());
243 const AccessSpreader<std::atomic>
244 AccessSpreader<std::atomic>::stripeByChip(
245 CacheLocality::system<>().numCachesByLevel.back());
248 AccessSpreaderArray<std::atomic,128>
249 AccessSpreaderArray<std::atomic,128>::sharedInstance = {};
/// Always claims to be on CPU zero, node zero.  Matches the getcpu
/// signature so it can stand in where a real getcpu isn't needed;
/// either output pointer may be null.  Always returns 0 (success).
static int degenerateGetcpu(unsigned* cpu, unsigned* node, void* unused) {
  if (cpu != nullptr) {
    *cpu = 0;
  }
  if (node != nullptr) {
    *node = 0;
  }
  return 0;
}
263 Getcpu::Func AccessSpreader<std::atomic>::pickGetcpuFunc(size_t numStripes) {
264 if (numStripes == 1) {
265 // there's no need to call getcpu if there is only one stripe.
266 // This should not be common, so we don't want to waste a test and
267 // branch in the main code path, but we might as well use a faster
269 return °enerateGetcpu;
271 auto best = Getcpu::vdsoFunc();
272 return best ? best : &SequentialThreadId<std::atomic>::getcpu;
276 } } // namespace folly::detail