folly/Benchmark.cpp

   1 /*
   2  * Copyright 2015 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
  18
  19 #include <folly/Benchmark.h>
  20 #include <folly/Foreach.h>
  21 #include <folly/json.h>
  22 #include <folly/String.h>
  23
  24 #include <algorithm>
  25 #include <boost/regex.hpp>
  26 #include <cmath>
  27 #include <iostream>
  28 #include <limits>
  29 #include <utility>
  30 #include <vector>
  31 #include <cstring>
  32
  33 using namespace std;
  34
  35 DEFINE_bool(benchmark, false, "Run benchmarks.");
  36 DEFINE_bool(json, false, "Output in JSON format.");
  37
  38 DEFINE_string(bm_regex, "",
  39               "Only benchmarks whose names match this regex will be run.");
  40
  41 DEFINE_int64(bm_min_usec, 100,
  42              "Minimum # of microseconds we'll accept for each benchmark.");
  43
  44 DEFINE_int64(bm_min_iters, 1,
  45              "Minimum # of iterations we'll try for each benchmark.");
  46
  47 DEFINE_int32(bm_max_secs, 1,
  48              "Maximum # of seconds we'll spend on each benchmark.");
  49
  50
  51 namespace folly {
  52
  53 BenchmarkSuspender::NanosecondsSpent BenchmarkSuspender::nsSpent;
  54
  55 typedef function<detail::TimeIterPair(unsigned int)> BenchmarkFun;
  56
  57
  58 vector<tuple<string, string, BenchmarkFun>>& benchmarks() {
  59   static vector<tuple<string, string, BenchmarkFun>> _benchmarks;
  60   return _benchmarks;
  61 }
  62
  63 #define FB_FOLLY_GLOBAL_BENCHMARK_BASELINE fbFollyGlobalBenchmarkBaseline
  64 #define FB_STRINGIZE_X2(x) FB_STRINGIZE(x)
  65
  66 // Add the global baseline
  67 BENCHMARK(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE) {
  68 #ifdef _MSC_VER
  69   _ReadWriteBarrier();
  70 #else
  71   asm volatile("");
  72 #endif
  73 }
  74
  75 int getGlobalBenchmarkBaselineIndex() {
  76   const char *global = FB_STRINGIZE_X2(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE);
  77   auto it = std::find_if(
  78     benchmarks().begin(),
  79     benchmarks().end(),
  80     [global](const tuple<string, string, BenchmarkFun> &v) {
  81       return get<1>(v) == global;
  82     }
  83   );
  84   CHECK(it != benchmarks().end());
  85   return it - benchmarks().begin();
  86 }
  87
  88 #undef FB_STRINGIZE_X2
  89 #undef FB_FOLLY_GLOBAL_BENCHMARK_BASELINE
  90
  91 void detail::addBenchmarkImpl(const char* file, const char* name,
  92                               BenchmarkFun fun) {
  93   benchmarks().emplace_back(file, name, std::move(fun));
  94 }
  95
  96 /**
  97  * Given a point, gives density at that point as a number 0.0 < x <=
  98  * 1.0. The result is 1.0 if all samples are equal to where, and
  99  * decreases near 0 if all points are far away from it. The density is
 100  * computed with the help of a radial basis function.
 101  */
 102 static double density(const double * begin, const double *const end,
 103                       const double where, const double bandwidth) {
 104   assert(begin < end);
 105   assert(bandwidth > 0.0);
 106   double sum = 0.0;
 107   FOR_EACH_RANGE (i, begin, end) {
 108     auto d = (*i - where) / bandwidth;
 109     sum += exp(- d * d);
 110   }
 111   return sum / (end - begin);
 112 }
 113
 114 /**
 115  * Computes mean and variance for a bunch of data points. Note that
 116  * mean is currently not being used.
 117  */
 118 static pair<double, double>
 119 meanVariance(const double * begin, const double *const end) {
 120   assert(begin < end);
 121   double sum = 0.0, sum2 = 0.0;
 122   FOR_EACH_RANGE (i, begin, end) {
 123     sum += *i;
 124     sum2 += *i * *i;
 125   }
 126   auto const n = end - begin;
 127   return make_pair(sum / n, sqrt((sum2 - sum * sum / n) / n));
 128 }
 129
 130 /**
 131  * Computes the mode of a sample set through brute force. Assumes
 132  * input is sorted.
 133  */
 134 static double mode(const double * begin, const double *const end) {
 135   assert(begin < end);
 136   // Lower bound and upper bound for result and their respective
 137   // densities.
 138   auto
 139     result = 0.0,
 140     bestDensity = 0.0;
 141
 142   // Get the variance so we pass it down to density()
 143   auto const sigma = meanVariance(begin, end).second;
 144   if (!sigma) {
 145     // No variance means constant signal
 146     return *begin;
 147   }
 148
 149   FOR_EACH_RANGE (i, begin, end) {
 150     assert(i == begin || *i >= i[-1]);
 151     auto candidate = density(begin, end, *i, sigma * sqrt(2.0));
 152     if (candidate > bestDensity) {
 153       // Found a new best
 154       bestDensity = candidate;
 155       result = *i;
 156     } else {
 157       // Density is decreasing... we could break here if we definitely
 158       // knew this is unimodal.
 159     }
 160   }
 161
 162   return result;
 163 }
 164
 165 /**
 166  * Given a bunch of benchmark samples, estimate the actual run time.
 167  */
 168 static double estimateTime(double * begin, double * end) {
 169   assert(begin < end);
 170
 171   // Current state of the art: get the minimum. After some
 172   // experimentation, it seems taking the minimum is the best.
 173
 174   return *min_element(begin, end);
 175
 176   // What follows after estimates the time as the mode of the
 177   // distribution.
 178
 179   // Select the awesomest (i.e. most frequent) result. We do this by
 180   // sorting and then computing the longest run length.
 181   sort(begin, end);
 182
 183   // Eliminate outliers. A time much larger than the minimum time is
 184   // considered an outlier.
 185   while (end[-1] > 2.0 * *begin) {
 186     --end;
 187     if (begin == end) {
 188       LOG(INFO) << *begin;
 189     }
 190     assert(begin < end);
 191   }
 192
 193   double result = 0;
 194
 195   /* Code used just for comparison purposes */ {
 196     unsigned bestFrequency = 0;
 197     unsigned candidateFrequency = 1;
 198     double candidateValue = *begin;
 199     for (auto current = begin + 1; ; ++current) {
 200       if (current == end || *current != candidateValue) {
 201         // Done with the current run, see if it was best
 202         if (candidateFrequency > bestFrequency) {
 203           bestFrequency = candidateFrequency;
 204           result = candidateValue;
 205         }
 206         if (current == end) {
 207           break;
 208         }
 209         // Start a new run
 210         candidateValue = *current;
 211         candidateFrequency = 1;
 212       } else {
 213         // Cool, inside a run, increase the frequency
 214         ++candidateFrequency;
 215       }
 216     }
 217   }
 218
 219   result = mode(begin, end);
 220
 221   return result;
 222 }
 223
 224 static double runBenchmarkGetNSPerIteration(const BenchmarkFun& fun,
 225                                             const double globalBaseline) {
 226   // They key here is accuracy; too low numbers means the accuracy was
 227   // coarse. We up the ante until we get to at least minNanoseconds
 228   // timings.
 229   static uint64_t resolutionInNs = 0;
 230   if (!resolutionInNs) {
 231     timespec ts;
 232     CHECK_EQ(0, clock_getres(detail::DEFAULT_CLOCK_ID, &ts));
 233     CHECK_EQ(0, ts.tv_sec) << "Clock sucks.";
 234     CHECK_LT(0, ts.tv_nsec) << "Clock too fast for its own good.";
 235     CHECK_EQ(1, ts.tv_nsec) << "Clock too coarse, upgrade your kernel.";
 236     resolutionInNs = ts.tv_nsec;
 237   }
 238   // We choose a minimum minimum (sic) of 100,000 nanoseconds, but if
 239   // the clock resolution is worse than that, it will be larger. In
 240   // essence we're aiming at making the quantization noise 0.01%.
 241   static const auto minNanoseconds =
 242     max<uint64_t>(FLAGS_bm_min_usec * 1000UL,
 243         min<uint64_t>(resolutionInNs * 100000, 1000000000ULL));
 244
 245   // We do measurements in several epochs and take the minimum, to
 246   // account for jitter.
 247   static const unsigned int epochs = 1000;
 248   // We establish a total time budget as we don't want a measurement
 249   // to take too long. This will curtail the number of actual epochs.
 250   const uint64_t timeBudgetInNs = FLAGS_bm_max_secs * 1000000000ULL;
 251   timespec global;
 252   CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &global));
 253
 254   double epochResults[epochs] = { 0 };
 255   size_t actualEpochs = 0;
 256
 257   for (; actualEpochs < epochs; ++actualEpochs) {
 258     for (unsigned int n = FLAGS_bm_min_iters; n < (1UL << 30); n *= 2) {
 259       auto const nsecsAndIter = fun(n);
 260       if (nsecsAndIter.first < minNanoseconds) {
 261         continue;
 262       }
 263       // We got an accurate enough timing, done. But only save if
 264       // smaller than the current result.
 265       epochResults[actualEpochs] = max(0.0, double(nsecsAndIter.first) /
 266                                        nsecsAndIter.second - globalBaseline);
 267       // Done with the current epoch, we got a meaningful timing.
 268       break;
 269     }
 270     timespec now;
 271     CHECK_EQ(0, clock_gettime(CLOCK_REALTIME, &now));
 272     if (detail::timespecDiff(now, global) >= timeBudgetInNs) {
 273       // No more time budget available.
 274       ++actualEpochs;
 275       break;
 276     }
 277   }
 278
 279   // If the benchmark was basically drowned in baseline noise, it's
 280   // possible it became negative.
 281   return max(0.0, estimateTime(epochResults, epochResults + actualEpochs));
 282 }
 283
 284 struct ScaleInfo {
 285   double boundary;
 286   const char* suffix;
 287 };
 288
 289 static const ScaleInfo kTimeSuffixes[] {
 290   { 365.25 * 24 * 3600, "years" },
 291   { 24 * 3600, "days" },
 292   { 3600, "hr" },
 293   { 60, "min" },
 294   { 1, "s" },
 295   { 1E-3, "ms" },
 296   { 1E-6, "us" },
 297   { 1E-9, "ns" },
 298   { 1E-12, "ps" },
 299   { 1E-15, "fs" },
 300   { 0, nullptr },
 301 };
 302
 303 static const ScaleInfo kMetricSuffixes[] {
 304   { 1E24, "Y" },  // yotta
 305   { 1E21, "Z" },  // zetta
 306   { 1E18, "X" },  // "exa" written with suffix 'X' so as to not create
 307                   //   confusion with scientific notation
 308   { 1E15, "P" },  // peta
 309   { 1E12, "T" },  // terra
 310   { 1E9, "G" },   // giga
 311   { 1E6, "M" },   // mega
 312   { 1E3, "K" },   // kilo
 313   { 1, "" },
 314   { 1E-3, "m" },  // milli
 315   { 1E-6, "u" },  // micro
 316   { 1E-9, "n" },  // nano
 317   { 1E-12, "p" }, // pico
 318   { 1E-15, "f" }, // femto
 319   { 1E-18, "a" }, // atto
 320   { 1E-21, "z" }, // zepto
 321   { 1E-24, "y" }, // yocto
 322   { 0, nullptr },
 323 };
 324
 325 static string humanReadable(double n, unsigned int decimals,
 326                             const ScaleInfo* scales) {
 327   if (std::isinf(n) || std::isnan(n)) {
 328     return folly::to<string>(n);
 329   }
 330
 331   const double absValue = fabs(n);
 332   const ScaleInfo* scale = scales;
 333   while (absValue < scale[0].boundary && scale[1].suffix != nullptr) {
 334     ++scale;
 335   }
 336
 337   const double scaledValue = n / scale->boundary;
 338   return stringPrintf("%.*f%s", decimals, scaledValue, scale->suffix);
 339 }
 340
 341 static string readableTime(double n, unsigned int decimals) {
 342   return humanReadable(n, decimals, kTimeSuffixes);
 343 }
 344
 345 static string metricReadable(double n, unsigned int decimals) {
 346   return humanReadable(n, decimals, kMetricSuffixes);
 347 }
 348
 349 static void printBenchmarkResultsAsTable(
 350   const vector<tuple<string, string, double> >& data) {
 351   // Width available
 352   static const unsigned int columns = 76;
 353
 354   // Compute the longest benchmark name
 355   size_t longestName = 0;
 356   FOR_EACH_RANGE (i, 1, benchmarks().size()) {
 357     longestName = max(longestName, get<1>(benchmarks()[i]).size());
 358   }
 359
 360   // Print a horizontal rule
 361   auto separator = [&](char pad) {
 362     puts(string(columns, pad).c_str());
 363   };
 364
 365   // Print header for a file
 366   auto header = [&](const string& file) {
 367     separator('=');
 368     printf("%-*srelative  time/iter  iters/s\n",
 369            columns - 28, file.c_str());
 370     separator('=');
 371   };
 372
 373   double baselineNsPerIter = numeric_limits<double>::max();
 374   string lastFile;
 375
 376   for (auto& datum : data) {
 377     auto file = get<0>(datum);
 378     if (file != lastFile) {
 379       // New file starting
 380       header(file);
 381       lastFile = file;
 382     }
 383
 384     string s = get<1>(datum);
 385     if (s == "-") {
 386       separator('-');
 387       continue;
 388     }
 389     bool useBaseline /* = void */;
 390     if (s[0] == '%') {
 391       s.erase(0, 1);
 392       useBaseline = true;
 393     } else {
 394       baselineNsPerIter = get<2>(datum);
 395       useBaseline = false;
 396     }
 397     s.resize(columns - 29, ' ');
 398     auto nsPerIter = get<2>(datum);
 399     auto secPerIter = nsPerIter / 1E9;
 400     auto itersPerSec = 1 / secPerIter;
 401     if (!useBaseline) {
 402       // Print without baseline
 403       printf("%*s           %9s  %7s\n",
 404              static_cast<int>(s.size()), s.c_str(),
 405              readableTime(secPerIter, 2).c_str(),
 406              metricReadable(itersPerSec, 2).c_str());
 407     } else {
 408       // Print with baseline
 409       auto rel = baselineNsPerIter / nsPerIter * 100.0;
 410       printf("%*s %7.2f%%  %9s  %7s\n",
 411              static_cast<int>(s.size()), s.c_str(),
 412              rel,
 413              readableTime(secPerIter, 2).c_str(),
 414              metricReadable(itersPerSec, 2).c_str());
 415     }
 416   }
 417   separator('=');
 418 }
 419
 420 static void printBenchmarkResultsAsJson(
 421   const vector<tuple<string, string, double> >& data) {
 422   dynamic d = dynamic::object;
 423   for (auto& datum: data) {
 424     d[std::get<1>(datum)] = std::get<2>(datum) * 1000.;
 425   }
 426
 427   printf("%s\n", toPrettyJson(d).c_str());
 428 }
 429
 430 static void printBenchmarkResults(
 431   const vector<tuple<string, string, double> >& data) {
 432
 433   if (FLAGS_json) {
 434     printBenchmarkResultsAsJson(data);
 435   } else {
 436     printBenchmarkResultsAsTable(data);
 437   }
 438 }
 439
 440 void runBenchmarks() {
 441   CHECK(!benchmarks().empty());
 442
 443   vector<tuple<string, string, double>> results;
 444   results.reserve(benchmarks().size() - 1);
 445
 446   std::unique_ptr<boost::regex> bmRegex;
 447   if (!FLAGS_bm_regex.empty()) {
 448     bmRegex.reset(new boost::regex(FLAGS_bm_regex));
 449   }
 450
 451   // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS.
 452
 453   unsigned int baselineIndex = getGlobalBenchmarkBaselineIndex();
 454
 455   auto const globalBaseline =
 456       runBenchmarkGetNSPerIteration(get<2>(benchmarks()[baselineIndex]), 0);
 457   FOR_EACH_RANGE (i, 0, benchmarks().size()) {
 458     if (i == baselineIndex) {
 459       continue;
 460     }
 461     double elapsed = 0.0;
 462     if (get<1>(benchmarks()[i]) != "-") { // skip separators
 463       if (bmRegex && !boost::regex_search(get<1>(benchmarks()[i]), *bmRegex)) {
 464         continue;
 465       }
 466       elapsed = runBenchmarkGetNSPerIteration(get<2>(benchmarks()[i]),
 467                                               globalBaseline);
 468     }
 469     results.emplace_back(get<0>(benchmarks()[i]),
 470                          get<1>(benchmarks()[i]), elapsed);
 471   }
 472
 473   // PLEASE MAKE NOISE. MEASUREMENTS DONE.
 474
 475   printBenchmarkResults(results);
 476 }
 477
 478 } // namespace folly