folly/Benchmark.cpp

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
  18
  19 #include <folly/Benchmark.h>
  20
  21 #include <algorithm>
  22 #include <cmath>
  23 #include <cstring>
  24 #include <iostream>
  25 #include <limits>
  26 #include <map>
  27 #include <utility>
  28 #include <vector>
  29
  30 #include <boost/regex.hpp>
  31
  32 #include <folly/Foreach.h>
  33 #include <folly/MapUtil.h>
  34 #include <folly/String.h>
  35 #include <folly/json.h>
  36
  37 using namespace std;
  38
  39 DEFINE_bool(benchmark, false, "Run benchmarks.");
  40 DEFINE_bool(json, false, "Output in JSON format.");
  41 DEFINE_bool(json_verbose, false, "Output in verbose JSON format.");
  42
  43 DEFINE_string(
  44     bm_regex,
  45     "",
  46     "Only benchmarks whose names match this regex will be run.");
  47
  48 DEFINE_int64(
  49     bm_min_usec,
  50     100,
  51     "Minimum # of microseconds we'll accept for each benchmark.");
  52
  53 DEFINE_int32(
  54     bm_min_iters,
  55     1,
  56     "Minimum # of iterations we'll try for each benchmark.");
  57
  58 DEFINE_int64(
  59     bm_max_iters,
  60     1 << 30,
  61     "Maximum # of iterations we'll try for each benchmark.");
  62
  63 DEFINE_int32(
  64     bm_max_secs,
  65     1,
  66     "Maximum # of seconds we'll spend on each benchmark.");
  67
  68 namespace folly {
  69
  70 std::chrono::high_resolution_clock::duration BenchmarkSuspender::timeSpent;
  71
  72 typedef function<detail::TimeIterPair(unsigned int)> BenchmarkFun;
  73
  74 vector<detail::BenchmarkRegistration>& benchmarks() {
  75   static vector<detail::BenchmarkRegistration> _benchmarks;
  76   return _benchmarks;
  77 }
  78
  79 #define FB_FOLLY_GLOBAL_BENCHMARK_BASELINE fbFollyGlobalBenchmarkBaseline
  80 #define FB_STRINGIZE_X2(x) FB_STRINGIZE(x)
  81
  82 // Add the global baseline
  83 BENCHMARK(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE) {
  84 #ifdef _MSC_VER
  85   _ReadWriteBarrier();
  86 #else
  87   asm volatile("");
  88 #endif
  89 }
  90
  91 size_t getGlobalBenchmarkBaselineIndex() {
  92   const char *global = FB_STRINGIZE_X2(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE);
  93   auto it = std::find_if(
  94       benchmarks().begin(),
  95       benchmarks().end(),
  96       [global](const detail::BenchmarkRegistration& v) {
  97         return v.name == global;
  98       });
  99   CHECK(it != benchmarks().end());
 100   return size_t(std::distance(benchmarks().begin(), it));
 101 }
 102
 103 #undef FB_STRINGIZE_X2
 104 #undef FB_FOLLY_GLOBAL_BENCHMARK_BASELINE
 105
 106 void detail::addBenchmarkImpl(const char* file, const char* name,
 107                               BenchmarkFun fun) {
 108   benchmarks().push_back({file, name, std::move(fun)});
 109 }
 110
 111 /**
 112  * Given a bunch of benchmark samples, estimate the actual run time.
 113  */
 114 static double estimateTime(double * begin, double * end) {
 115   assert(begin < end);
 116
 117   // Current state of the art: get the minimum. After some
 118   // experimentation, it seems taking the minimum is the best.
 119   return *min_element(begin, end);
 120 }
 121
 122 static double runBenchmarkGetNSPerIteration(const BenchmarkFun& fun,
 123                                             const double globalBaseline) {
 124   using std::chrono::duration_cast;
 125   using std::chrono::high_resolution_clock;
 126   using std::chrono::microseconds;
 127   using std::chrono::nanoseconds;
 128   using std::chrono::seconds;
 129
 130   // They key here is accuracy; too low numbers means the accuracy was
 131   // coarse. We up the ante until we get to at least minNanoseconds
 132   // timings.
 133   static_assert(
 134       std::is_same<high_resolution_clock::duration, nanoseconds>::value,
 135       "High resolution clock must be nanosecond resolution.");
 136   // We choose a minimum minimum (sic) of 100,000 nanoseconds, but if
 137   // the clock resolution is worse than that, it will be larger. In
 138   // essence we're aiming at making the quantization noise 0.01%.
 139   static const auto minNanoseconds = std::max<nanoseconds>(
 140       nanoseconds(100000), microseconds(FLAGS_bm_min_usec));
 141
 142   // We do measurements in several epochs and take the minimum, to
 143   // account for jitter.
 144   static const unsigned int epochs = 1000;
 145   // We establish a total time budget as we don't want a measurement
 146   // to take too long. This will curtail the number of actual epochs.
 147   const auto timeBudget = seconds(FLAGS_bm_max_secs);
 148   auto global = high_resolution_clock::now();
 149
 150   double epochResults[epochs] = { 0 };
 151   size_t actualEpochs = 0;
 152
 153   for (; actualEpochs < epochs; ++actualEpochs) {
 154     const auto maxIters = uint32_t(FLAGS_bm_max_iters);
 155     for (auto n = uint32_t(FLAGS_bm_min_iters); n < maxIters; n *= 2) {
 156       auto const nsecsAndIter = fun(static_cast<unsigned int>(n));
 157       if (nsecsAndIter.first < minNanoseconds) {
 158         continue;
 159       }
 160       // We got an accurate enough timing, done. But only save if
 161       // smaller than the current result.
 162       auto nsecs = duration_cast<nanoseconds>(nsecsAndIter.first).count();
 163       epochResults[actualEpochs] =
 164           max(0.0, double(nsecs) / nsecsAndIter.second - globalBaseline);
 165       // Done with the current epoch, we got a meaningful timing.
 166       break;
 167     }
 168     auto now = high_resolution_clock::now();
 169     if (now - global >= timeBudget) {
 170       // No more time budget available.
 171       ++actualEpochs;
 172       break;
 173     }
 174   }
 175
 176   // If the benchmark was basically drowned in baseline noise, it's
 177   // possible it became negative.
 178   return max(0.0, estimateTime(epochResults, epochResults + actualEpochs));
 179 }
 180
 181 struct ScaleInfo {
 182   double boundary;
 183   const char* suffix;
 184 };
 185
 186 static const ScaleInfo kTimeSuffixes[] {
 187   { 365.25 * 24 * 3600, "years" },
 188   { 24 * 3600, "days" },
 189   { 3600, "hr" },
 190   { 60, "min" },
 191   { 1, "s" },
 192   { 1E-3, "ms" },
 193   { 1E-6, "us" },
 194   { 1E-9, "ns" },
 195   { 1E-12, "ps" },
 196   { 1E-15, "fs" },
 197   { 0, nullptr },
 198 };
 199
 200 static const ScaleInfo kMetricSuffixes[] {
 201   { 1E24, "Y" },  // yotta
 202   { 1E21, "Z" },  // zetta
 203   { 1E18, "X" },  // "exa" written with suffix 'X' so as to not create
 204                   //   confusion with scientific notation
 205   { 1E15, "P" },  // peta
 206   { 1E12, "T" },  // terra
 207   { 1E9, "G" },   // giga
 208   { 1E6, "M" },   // mega
 209   { 1E3, "K" },   // kilo
 210   { 1, "" },
 211   { 1E-3, "m" },  // milli
 212   { 1E-6, "u" },  // micro
 213   { 1E-9, "n" },  // nano
 214   { 1E-12, "p" }, // pico
 215   { 1E-15, "f" }, // femto
 216   { 1E-18, "a" }, // atto
 217   { 1E-21, "z" }, // zepto
 218   { 1E-24, "y" }, // yocto
 219   { 0, nullptr },
 220 };
 221
 222 static string humanReadable(double n, unsigned int decimals,
 223                             const ScaleInfo* scales) {
 224   if (std::isinf(n) || std::isnan(n)) {
 225     return folly::to<string>(n);
 226   }
 227
 228   const double absValue = fabs(n);
 229   const ScaleInfo* scale = scales;
 230   while (absValue < scale[0].boundary && scale[1].suffix != nullptr) {
 231     ++scale;
 232   }
 233
 234   const double scaledValue = n / scale->boundary;
 235   return stringPrintf("%.*f%s", decimals, scaledValue, scale->suffix);
 236 }
 237
 238 static string readableTime(double n, unsigned int decimals) {
 239   return humanReadable(n, decimals, kTimeSuffixes);
 240 }
 241
 242 static string metricReadable(double n, unsigned int decimals) {
 243   return humanReadable(n, decimals, kMetricSuffixes);
 244 }
 245
 246 static void printBenchmarkResultsAsTable(
 247     const vector<detail::BenchmarkResult>& data) {
 248   // Width available
 249   static const unsigned int columns = 76;
 250
 251   // Compute the longest benchmark name
 252   size_t longestName = 0;
 253   for (auto& bm : benchmarks()) {
 254     longestName = max(longestName, bm.name.size());
 255   }
 256
 257   // Print a horizontal rule
 258   auto separator = [&](char pad) {
 259     puts(string(columns, pad).c_str());
 260   };
 261
 262   // Print header for a file
 263   auto header = [&](const string& file) {
 264     separator('=');
 265     printf("%-*srelative  time/iter  iters/s\n",
 266            columns - 28, file.c_str());
 267     separator('=');
 268   };
 269
 270   double baselineNsPerIter = numeric_limits<double>::max();
 271   string lastFile;
 272
 273   for (auto& datum : data) {
 274     auto file = datum.file;
 275     if (file != lastFile) {
 276       // New file starting
 277       header(file);
 278       lastFile = file;
 279     }
 280
 281     string s = datum.name;
 282     if (s == "-") {
 283       separator('-');
 284       continue;
 285     }
 286     bool useBaseline /* = void */;
 287     if (s[0] == '%') {
 288       s.erase(0, 1);
 289       useBaseline = true;
 290     } else {
 291       baselineNsPerIter = datum.timeInNs;
 292       useBaseline = false;
 293     }
 294     s.resize(columns - 29, ' ');
 295     auto nsPerIter = datum.timeInNs;
 296     auto secPerIter = nsPerIter / 1E9;
 297     auto itersPerSec = (secPerIter == 0)
 298                            ? std::numeric_limits<double>::infinity()
 299                            : (1 / secPerIter);
 300     if (!useBaseline) {
 301       // Print without baseline
 302       printf("%*s           %9s  %7s\n",
 303              static_cast<int>(s.size()), s.c_str(),
 304              readableTime(secPerIter, 2).c_str(),
 305              metricReadable(itersPerSec, 2).c_str());
 306     } else {
 307       // Print with baseline
 308       auto rel = baselineNsPerIter / nsPerIter * 100.0;
 309       printf("%*s %7.2f%%  %9s  %7s\n",
 310              static_cast<int>(s.size()), s.c_str(),
 311              rel,
 312              readableTime(secPerIter, 2).c_str(),
 313              metricReadable(itersPerSec, 2).c_str());
 314     }
 315   }
 316   separator('=');
 317 }
 318
 319 static void printBenchmarkResultsAsJson(
 320     const vector<detail::BenchmarkResult>& data) {
 321   dynamic d = dynamic::object;
 322   for (auto& datum: data) {
 323     d[datum.name] = datum.timeInNs * 1000.;
 324   }
 325
 326   printf("%s\n", toPrettyJson(d).c_str());
 327 }
 328
 329 static void printBenchmarkResultsAsVerboseJson(
 330     const vector<detail::BenchmarkResult>& data) {
 331   dynamic d;
 332   benchmarkResultsToDynamic(data, d);
 333   printf("%s\n", toPrettyJson(d).c_str());
 334 }
 335
 336 static void printBenchmarkResults(const vector<detail::BenchmarkResult>& data) {
 337   if (FLAGS_json_verbose) {
 338     printBenchmarkResultsAsVerboseJson(data);
 339   } else if (FLAGS_json) {
 340     printBenchmarkResultsAsJson(data);
 341   } else {
 342     printBenchmarkResultsAsTable(data);
 343   }
 344 }
 345
 346 void benchmarkResultsToDynamic(
 347     const vector<detail::BenchmarkResult>& data,
 348     dynamic& out) {
 349   out = dynamic::array;
 350   for (auto& datum : data) {
 351     out.push_back(dynamic::array(datum.file, datum.name, datum.timeInNs));
 352   }
 353 }
 354
 355 void benchmarkResultsFromDynamic(
 356     const dynamic& d,
 357     vector<detail::BenchmarkResult>& results) {
 358   for (auto& datum : d) {
 359     results.push_back(
 360         {datum[0].asString(), datum[1].asString(), datum[2].asDouble()});
 361   }
 362 }
 363
 364 static pair<StringPiece, StringPiece> resultKey(
 365     const detail::BenchmarkResult& result) {
 366   return pair<StringPiece, StringPiece>(result.file, result.name);
 367 }
 368
 369 void printResultComparison(
 370     const vector<detail::BenchmarkResult>& base,
 371     const vector<detail::BenchmarkResult>& test) {
 372   map<pair<StringPiece, StringPiece>, double> baselines;
 373
 374   for (auto& baseResult : base) {
 375     baselines[resultKey(baseResult)] = baseResult.timeInNs;
 376   }
 377   //
 378   // Width available
 379   static const unsigned int columns = 76;
 380
 381   // Compute the longest benchmark name
 382   size_t longestName = 0;
 383   for (auto& datum : test) {
 384     longestName = max(longestName, datum.name.size());
 385   }
 386
 387   // Print a horizontal rule
 388   auto separator = [&](char pad) { puts(string(columns, pad).c_str()); };
 389
 390   // Print header for a file
 391   auto header = [&](const string& file) {
 392     separator('=');
 393     printf("%-*srelative  time/iter  iters/s\n", columns - 28, file.c_str());
 394     separator('=');
 395   };
 396
 397   string lastFile;
 398
 399   for (auto& datum : test) {
 400     folly::Optional<double> baseline =
 401         folly::get_optional(baselines, resultKey(datum));
 402     auto file = datum.file;
 403     if (file != lastFile) {
 404       // New file starting
 405       header(file);
 406       lastFile = file;
 407     }
 408
 409     string s = datum.name;
 410     if (s == "-") {
 411       separator('-');
 412       continue;
 413     }
 414     if (s[0] == '%') {
 415       s.erase(0, 1);
 416     }
 417     s.resize(columns - 29, ' ');
 418     auto nsPerIter = datum.timeInNs;
 419     auto secPerIter = nsPerIter / 1E9;
 420     auto itersPerSec = (secPerIter == 0)
 421         ? std::numeric_limits<double>::infinity()
 422         : (1 / secPerIter);
 423     if (!baseline) {
 424       // Print without baseline
 425       printf(
 426           "%*s           %9s  %7s\n",
 427           static_cast<int>(s.size()),
 428           s.c_str(),
 429           readableTime(secPerIter, 2).c_str(),
 430           metricReadable(itersPerSec, 2).c_str());
 431     } else {
 432       // Print with baseline
 433       auto rel = *baseline / nsPerIter * 100.0;
 434       printf(
 435           "%*s %7.2f%%  %9s  %7s\n",
 436           static_cast<int>(s.size()),
 437           s.c_str(),
 438           rel,
 439           readableTime(secPerIter, 2).c_str(),
 440           metricReadable(itersPerSec, 2).c_str());
 441     }
 442   }
 443   separator('=');
 444 }
 445
 446 void runBenchmarks() {
 447   CHECK(!benchmarks().empty());
 448
 449   vector<detail::BenchmarkResult> results;
 450   results.reserve(benchmarks().size() - 1);
 451
 452   std::unique_ptr<boost::regex> bmRegex;
 453   if (!FLAGS_bm_regex.empty()) {
 454     bmRegex.reset(new boost::regex(FLAGS_bm_regex));
 455   }
 456
 457   // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS.
 458
 459   size_t baselineIndex = getGlobalBenchmarkBaselineIndex();
 460
 461   auto const globalBaseline =
 462       runBenchmarkGetNSPerIteration(benchmarks()[baselineIndex].func, 0);
 463   FOR_EACH_RANGE (i, 0, benchmarks().size()) {
 464     if (i == baselineIndex) {
 465       continue;
 466     }
 467     double elapsed = 0.0;
 468     auto& bm = benchmarks()[i];
 469     if (bm.name != "-") { // skip separators
 470       if (bmRegex && !boost::regex_search(bm.name, *bmRegex)) {
 471         continue;
 472       }
 473       elapsed = runBenchmarkGetNSPerIteration(bm.func, globalBaseline);
 474     }
 475     results.push_back({bm.file, bm.name, elapsed});
 476   }
 477
 478   // PLEASE MAKE NOISE. MEASUREMENTS DONE.
 479
 480   printBenchmarkResults(results);
 481 }
 482
 483 } // namespace folly