folly/Benchmark.cpp

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
  18
  19 #include <folly/Benchmark.h>
  20
  21 #include <algorithm>
  22 #include <cmath>
  23 #include <cstring>
  24 #include <iostream>
  25 #include <limits>
  26 #include <map>
  27 #include <memory>
  28 #include <utility>
  29 #include <vector>
  30
  31 #include <boost/regex.hpp>
  32
  33 #include <folly/MapUtil.h>
  34 #include <folly/String.h>
  35 #include <folly/container/Foreach.h>
  36 #include <folly/json.h>
  37
  38 using namespace std;
  39
  40 DEFINE_bool(benchmark, false, "Run benchmarks.");
  41 DEFINE_bool(json, false, "Output in JSON format.");
  42 DEFINE_bool(json_verbose, false, "Output in verbose JSON format.");
  43
  44 DEFINE_string(
  45     bm_regex,
  46     "",
  47     "Only benchmarks whose names match this regex will be run.");
  48
  49 DEFINE_int64(
  50     bm_min_usec,
  51     100,
  52     "Minimum # of microseconds we'll accept for each benchmark.");
  53
  54 DEFINE_int32(
  55     bm_min_iters,
  56     1,
  57     "Minimum # of iterations we'll try for each benchmark.");
  58
  59 DEFINE_int64(
  60     bm_max_iters,
  61     1 << 30,
  62     "Maximum # of iterations we'll try for each benchmark.");
  63
  64 DEFINE_int32(
  65     bm_max_secs,
  66     1,
  67     "Maximum # of seconds we'll spend on each benchmark.");
  68
  69 namespace folly {
  70
  71 std::chrono::high_resolution_clock::duration BenchmarkSuspender::timeSpent;
  72
  73 typedef function<detail::TimeIterPair(unsigned int)> BenchmarkFun;
  74
  75 vector<detail::BenchmarkRegistration>& benchmarks() {
  76   static vector<detail::BenchmarkRegistration> _benchmarks;
  77   return _benchmarks;
  78 }
  79
  80 #define FB_FOLLY_GLOBAL_BENCHMARK_BASELINE fbFollyGlobalBenchmarkBaseline
  81 #define FB_STRINGIZE_X2(x) FB_STRINGIZE(x)
  82
  83 // Add the global baseline
  84 BENCHMARK(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE) {
  85 #ifdef _MSC_VER
  86   _ReadWriteBarrier();
  87 #else
  88   asm volatile("");
  89 #endif
  90 }
  91
  92 size_t getGlobalBenchmarkBaselineIndex() {
  93   const char *global = FB_STRINGIZE_X2(FB_FOLLY_GLOBAL_BENCHMARK_BASELINE);
  94   auto it = std::find_if(
  95       benchmarks().begin(),
  96       benchmarks().end(),
  97       [global](const detail::BenchmarkRegistration& v) {
  98         return v.name == global;
  99       });
 100   CHECK(it != benchmarks().end());
 101   return size_t(std::distance(benchmarks().begin(), it));
 102 }
 103
 104 #undef FB_STRINGIZE_X2
 105 #undef FB_FOLLY_GLOBAL_BENCHMARK_BASELINE
 106
 107 void detail::addBenchmarkImpl(const char* file, const char* name,
 108                               BenchmarkFun fun) {
 109   benchmarks().push_back({file, name, std::move(fun)});
 110 }
 111
 112 /**
 113  * Given a bunch of benchmark samples, estimate the actual run time.
 114  */
 115 static double estimateTime(double * begin, double * end) {
 116   assert(begin < end);
 117
 118   // Current state of the art: get the minimum. After some
 119   // experimentation, it seems taking the minimum is the best.
 120   return *min_element(begin, end);
 121 }
 122
 123 static double runBenchmarkGetNSPerIteration(const BenchmarkFun& fun,
 124                                             const double globalBaseline) {
 125   using std::chrono::duration_cast;
 126   using std::chrono::high_resolution_clock;
 127   using std::chrono::microseconds;
 128   using std::chrono::nanoseconds;
 129   using std::chrono::seconds;
 130
 131   // They key here is accuracy; too low numbers means the accuracy was
 132   // coarse. We up the ante until we get to at least minNanoseconds
 133   // timings.
 134   static_assert(
 135       std::is_same<high_resolution_clock::duration, nanoseconds>::value,
 136       "High resolution clock must be nanosecond resolution.");
 137   // We choose a minimum minimum (sic) of 100,000 nanoseconds, but if
 138   // the clock resolution is worse than that, it will be larger. In
 139   // essence we're aiming at making the quantization noise 0.01%.
 140   static const auto minNanoseconds = std::max<nanoseconds>(
 141       nanoseconds(100000), microseconds(FLAGS_bm_min_usec));
 142
 143   // We do measurements in several epochs and take the minimum, to
 144   // account for jitter.
 145   static const unsigned int epochs = 1000;
 146   // We establish a total time budget as we don't want a measurement
 147   // to take too long. This will curtail the number of actual epochs.
 148   const auto timeBudget = seconds(FLAGS_bm_max_secs);
 149   auto global = high_resolution_clock::now();
 150
 151   double epochResults[epochs] = { 0 };
 152   size_t actualEpochs = 0;
 153
 154   for (; actualEpochs < epochs; ++actualEpochs) {
 155     const auto maxIters = uint32_t(FLAGS_bm_max_iters);
 156     for (auto n = uint32_t(FLAGS_bm_min_iters); n < maxIters; n *= 2) {
 157       auto const nsecsAndIter = fun(static_cast<unsigned int>(n));
 158       if (nsecsAndIter.first < minNanoseconds) {
 159         continue;
 160       }
 161       // We got an accurate enough timing, done. But only save if
 162       // smaller than the current result.
 163       auto nsecs = duration_cast<nanoseconds>(nsecsAndIter.first).count();
 164       epochResults[actualEpochs] =
 165           max(0.0, double(nsecs) / nsecsAndIter.second - globalBaseline);
 166       // Done with the current epoch, we got a meaningful timing.
 167       break;
 168     }
 169     auto now = high_resolution_clock::now();
 170     if (now - global >= timeBudget) {
 171       // No more time budget available.
 172       ++actualEpochs;
 173       break;
 174     }
 175   }
 176
 177   // If the benchmark was basically drowned in baseline noise, it's
 178   // possible it became negative.
 179   return max(0.0, estimateTime(epochResults, epochResults + actualEpochs));
 180 }
 181
 182 struct ScaleInfo {
 183   double boundary;
 184   const char* suffix;
 185 };
 186
 187 static const ScaleInfo kTimeSuffixes[] {
 188   { 365.25 * 24 * 3600, "years" },
 189   { 24 * 3600, "days" },
 190   { 3600, "hr" },
 191   { 60, "min" },
 192   { 1, "s" },
 193   { 1E-3, "ms" },
 194   { 1E-6, "us" },
 195   { 1E-9, "ns" },
 196   { 1E-12, "ps" },
 197   { 1E-15, "fs" },
 198   { 0, nullptr },
 199 };
 200
 201 static const ScaleInfo kMetricSuffixes[] {
 202   { 1E24, "Y" },  // yotta
 203   { 1E21, "Z" },  // zetta
 204   { 1E18, "X" },  // "exa" written with suffix 'X' so as to not create
 205                   //   confusion with scientific notation
 206   { 1E15, "P" },  // peta
 207   { 1E12, "T" },  // terra
 208   { 1E9, "G" },   // giga
 209   { 1E6, "M" },   // mega
 210   { 1E3, "K" },   // kilo
 211   { 1, "" },
 212   { 1E-3, "m" },  // milli
 213   { 1E-6, "u" },  // micro
 214   { 1E-9, "n" },  // nano
 215   { 1E-12, "p" }, // pico
 216   { 1E-15, "f" }, // femto
 217   { 1E-18, "a" }, // atto
 218   { 1E-21, "z" }, // zepto
 219   { 1E-24, "y" }, // yocto
 220   { 0, nullptr },
 221 };
 222
 223 static string humanReadable(double n, unsigned int decimals,
 224                             const ScaleInfo* scales) {
 225   if (std::isinf(n) || std::isnan(n)) {
 226     return folly::to<string>(n);
 227   }
 228
 229   const double absValue = fabs(n);
 230   const ScaleInfo* scale = scales;
 231   while (absValue < scale[0].boundary && scale[1].suffix != nullptr) {
 232     ++scale;
 233   }
 234
 235   const double scaledValue = n / scale->boundary;
 236   return stringPrintf("%.*f%s", decimals, scaledValue, scale->suffix);
 237 }
 238
 239 static string readableTime(double n, unsigned int decimals) {
 240   return humanReadable(n, decimals, kTimeSuffixes);
 241 }
 242
 243 static string metricReadable(double n, unsigned int decimals) {
 244   return humanReadable(n, decimals, kMetricSuffixes);
 245 }
 246
 247 static void printBenchmarkResultsAsTable(
 248     const vector<detail::BenchmarkResult>& data) {
 249   // Width available
 250   static const unsigned int columns = 76;
 251
 252   // Compute the longest benchmark name
 253   size_t longestName = 0;
 254   for (auto& bm : benchmarks()) {
 255     longestName = max(longestName, bm.name.size());
 256   }
 257
 258   // Print a horizontal rule
 259   auto separator = [&](char pad) {
 260     puts(string(columns, pad).c_str());
 261   };
 262
 263   // Print header for a file
 264   auto header = [&](const string& file) {
 265     separator('=');
 266     printf("%-*srelative  time/iter  iters/s\n",
 267            columns - 28, file.c_str());
 268     separator('=');
 269   };
 270
 271   double baselineNsPerIter = numeric_limits<double>::max();
 272   string lastFile;
 273
 274   for (auto& datum : data) {
 275     auto file = datum.file;
 276     if (file != lastFile) {
 277       // New file starting
 278       header(file);
 279       lastFile = file;
 280     }
 281
 282     string s = datum.name;
 283     if (s == "-") {
 284       separator('-');
 285       continue;
 286     }
 287     bool useBaseline /* = void */;
 288     if (s[0] == '%') {
 289       s.erase(0, 1);
 290       useBaseline = true;
 291     } else {
 292       baselineNsPerIter = datum.timeInNs;
 293       useBaseline = false;
 294     }
 295     s.resize(columns - 29, ' ');
 296     auto nsPerIter = datum.timeInNs;
 297     auto secPerIter = nsPerIter / 1E9;
 298     auto itersPerSec = (secPerIter == 0)
 299                            ? std::numeric_limits<double>::infinity()
 300                            : (1 / secPerIter);
 301     if (!useBaseline) {
 302       // Print without baseline
 303       printf("%*s           %9s  %7s\n",
 304              static_cast<int>(s.size()), s.c_str(),
 305              readableTime(secPerIter, 2).c_str(),
 306              metricReadable(itersPerSec, 2).c_str());
 307     } else {
 308       // Print with baseline
 309       auto rel = baselineNsPerIter / nsPerIter * 100.0;
 310       printf("%*s %7.2f%%  %9s  %7s\n",
 311              static_cast<int>(s.size()), s.c_str(),
 312              rel,
 313              readableTime(secPerIter, 2).c_str(),
 314              metricReadable(itersPerSec, 2).c_str());
 315     }
 316   }
 317   separator('=');
 318 }
 319
 320 static void printBenchmarkResultsAsJson(
 321     const vector<detail::BenchmarkResult>& data) {
 322   dynamic d = dynamic::object;
 323   for (auto& datum: data) {
 324     d[datum.name] = datum.timeInNs * 1000.;
 325   }
 326
 327   printf("%s\n", toPrettyJson(d).c_str());
 328 }
 329
 330 static void printBenchmarkResultsAsVerboseJson(
 331     const vector<detail::BenchmarkResult>& data) {
 332   dynamic d;
 333   benchmarkResultsToDynamic(data, d);
 334   printf("%s\n", toPrettyJson(d).c_str());
 335 }
 336
 337 static void printBenchmarkResults(const vector<detail::BenchmarkResult>& data) {
 338   if (FLAGS_json_verbose) {
 339     printBenchmarkResultsAsVerboseJson(data);
 340   } else if (FLAGS_json) {
 341     printBenchmarkResultsAsJson(data);
 342   } else {
 343     printBenchmarkResultsAsTable(data);
 344   }
 345 }
 346
 347 void benchmarkResultsToDynamic(
 348     const vector<detail::BenchmarkResult>& data,
 349     dynamic& out) {
 350   out = dynamic::array;
 351   for (auto& datum : data) {
 352     out.push_back(dynamic::array(datum.file, datum.name, datum.timeInNs));
 353   }
 354 }
 355
 356 void benchmarkResultsFromDynamic(
 357     const dynamic& d,
 358     vector<detail::BenchmarkResult>& results) {
 359   for (auto& datum : d) {
 360     results.push_back(
 361         {datum[0].asString(), datum[1].asString(), datum[2].asDouble()});
 362   }
 363 }
 364
 365 static pair<StringPiece, StringPiece> resultKey(
 366     const detail::BenchmarkResult& result) {
 367   return pair<StringPiece, StringPiece>(result.file, result.name);
 368 }
 369
 370 void printResultComparison(
 371     const vector<detail::BenchmarkResult>& base,
 372     const vector<detail::BenchmarkResult>& test) {
 373   map<pair<StringPiece, StringPiece>, double> baselines;
 374
 375   for (auto& baseResult : base) {
 376     baselines[resultKey(baseResult)] = baseResult.timeInNs;
 377   }
 378   //
 379   // Width available
 380   static const unsigned int columns = 76;
 381
 382   // Compute the longest benchmark name
 383   size_t longestName = 0;
 384   for (auto& datum : test) {
 385     longestName = max(longestName, datum.name.size());
 386   }
 387
 388   // Print a horizontal rule
 389   auto separator = [&](char pad) { puts(string(columns, pad).c_str()); };
 390
 391   // Print header for a file
 392   auto header = [&](const string& file) {
 393     separator('=');
 394     printf("%-*srelative  time/iter  iters/s\n", columns - 28, file.c_str());
 395     separator('=');
 396   };
 397
 398   string lastFile;
 399
 400   for (auto& datum : test) {
 401     folly::Optional<double> baseline =
 402         folly::get_optional(baselines, resultKey(datum));
 403     auto file = datum.file;
 404     if (file != lastFile) {
 405       // New file starting
 406       header(file);
 407       lastFile = file;
 408     }
 409
 410     string s = datum.name;
 411     if (s == "-") {
 412       separator('-');
 413       continue;
 414     }
 415     if (s[0] == '%') {
 416       s.erase(0, 1);
 417     }
 418     s.resize(columns - 29, ' ');
 419     auto nsPerIter = datum.timeInNs;
 420     auto secPerIter = nsPerIter / 1E9;
 421     auto itersPerSec = (secPerIter == 0)
 422         ? std::numeric_limits<double>::infinity()
 423         : (1 / secPerIter);
 424     if (!baseline) {
 425       // Print without baseline
 426       printf(
 427           "%*s           %9s  %7s\n",
 428           static_cast<int>(s.size()),
 429           s.c_str(),
 430           readableTime(secPerIter, 2).c_str(),
 431           metricReadable(itersPerSec, 2).c_str());
 432     } else {
 433       // Print with baseline
 434       auto rel = *baseline / nsPerIter * 100.0;
 435       printf(
 436           "%*s %7.2f%%  %9s  %7s\n",
 437           static_cast<int>(s.size()),
 438           s.c_str(),
 439           rel,
 440           readableTime(secPerIter, 2).c_str(),
 441           metricReadable(itersPerSec, 2).c_str());
 442     }
 443   }
 444   separator('=');
 445 }
 446
 447 void runBenchmarks() {
 448   CHECK(!benchmarks().empty());
 449
 450   vector<detail::BenchmarkResult> results;
 451   results.reserve(benchmarks().size() - 1);
 452
 453   std::unique_ptr<boost::regex> bmRegex;
 454   if (!FLAGS_bm_regex.empty()) {
 455     bmRegex = std::make_unique<boost::regex>(FLAGS_bm_regex);
 456   }
 457
 458   // PLEASE KEEP QUIET. MEASUREMENTS IN PROGRESS.
 459
 460   size_t baselineIndex = getGlobalBenchmarkBaselineIndex();
 461
 462   auto const globalBaseline =
 463       runBenchmarkGetNSPerIteration(benchmarks()[baselineIndex].func, 0);
 464   FOR_EACH_RANGE (i, 0, benchmarks().size()) {
 465     if (i == baselineIndex) {
 466       continue;
 467     }
 468     double elapsed = 0.0;
 469     auto& bm = benchmarks()[i];
 470     if (bm.name != "-") { // skip separators
 471       if (bmRegex && !boost::regex_search(bm.name, *bmRegex)) {
 472         continue;
 473       }
 474       elapsed = runBenchmarkGetNSPerIteration(bm.func, globalBaseline);
 475     }
 476     results.push_back({bm.file, bm.name, elapsed});
 477   }
 478
 479   // PLEASE MAKE NOISE. MEASUREMENTS DONE.
 480
 481   printBenchmarkResults(results);
 482 }
 483
 484 } // namespace folly