folly/stats/Histogram-defs.h

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <folly/Conv.h>
  20 #include <folly/stats/Histogram.h>
  21
  22 #include <glog/logging.h>
  23
  24 namespace folly {
  25
  26 namespace detail {
  27
  28 template <typename T, typename BucketT>
  29 HistogramBuckets<T, BucketT>::HistogramBuckets(ValueType bucketSize,
  30                                                ValueType min,
  31                                                ValueType max,
  32                                                const BucketType& defaultBucket)
  33   : bucketSize_(bucketSize),
  34     min_(min),
  35     max_(max) {
  36   CHECK_GT(bucketSize_, ValueType(0));
  37   CHECK_LT(min_, max_);
  38
  39   // Deliberately make this a signed type, because we're about
  40   // to compare it against max-min, which is nominally signed, too.
  41   int64_t numBuckets = int64_t((max - min) / bucketSize);
  42   // Round up if the bucket size does not fit evenly
  43   if (numBuckets * bucketSize < max - min) {
  44     ++numBuckets;
  45   }
  46   // Add 2 for the extra 'below min' and 'above max' buckets
  47   numBuckets += 2;
  48   buckets_.assign(size_t(numBuckets), defaultBucket);
  49 }
  50
  51 template <typename T, typename BucketType>
  52 size_t HistogramBuckets<T, BucketType>::getBucketIdx(ValueType value) const {
  53   if (value < min_) {
  54     return 0;
  55   } else if (value >= max_) {
  56     return buckets_.size() - 1;
  57   } else {
  58     // the 1 is the below_min bucket
  59     return size_t(((value - min_) / bucketSize_) + 1);
  60   }
  61 }
  62
  63 template <typename T, typename BucketType>
  64 template <typename CountFn>
  65 uint64_t HistogramBuckets<T, BucketType>::computeTotalCount(
  66     CountFn countFromBucket) const {
  67   uint64_t count = 0;
  68   for (size_t n = 0; n < buckets_.size(); ++n) {
  69     count += countFromBucket(const_cast<const BucketType&>(buckets_[n]));
  70   }
  71   return count;
  72 }
  73
  74 template <typename T, typename BucketType>
  75 template <typename CountFn>
  76 size_t HistogramBuckets<T, BucketType>::getPercentileBucketIdx(
  77     double pct,
  78     CountFn countFromBucket,
  79     double* lowPct,
  80     double* highPct) const {
  81   CHECK_GE(pct, 0.0);
  82   CHECK_LE(pct, 1.0);
  83
  84   auto numBuckets = buckets_.size();
  85
  86   // Compute the counts in each bucket
  87   std::vector<uint64_t> counts(numBuckets);
  88   uint64_t totalCount = 0;
  89   for (size_t n = 0; n < numBuckets; ++n) {
  90     uint64_t bucketCount =
  91       countFromBucket(const_cast<const BucketType&>(buckets_[n]));
  92     counts[n] = bucketCount;
  93     totalCount += bucketCount;
  94   }
  95
  96   // If there are no elements, just return the lowest bucket.
  97   // Note that we return bucket 1, which is the first bucket in the
  98   // histogram range; bucket 0 is for all values below min_.
  99   if (totalCount == 0) {
 100     // Set lowPct and highPct both to 0.
 101     // getPercentileEstimate() will recognize this to mean that the histogram
 102     // is empty.
 103     if (lowPct) {
 104       *lowPct = 0.0;
 105     }
 106     if (highPct) {
 107       *highPct = 0.0;
 108     }
 109     return 1;
 110   }
 111
 112   // Loop through all the buckets, keeping track of each bucket's
 113   // percentile range: [0,10], [10,17], [17,45], etc.  When we find a range
 114   // that includes our desired percentile, we return that bucket index.
 115   double prevPct = 0.0;
 116   double curPct = 0.0;
 117   uint64_t curCount = 0;
 118   size_t idx;
 119   for (idx = 0; idx < numBuckets; ++idx) {
 120     if (counts[idx] == 0) {
 121       // skip empty buckets
 122       continue;
 123     }
 124
 125     prevPct = curPct;
 126     curCount += counts[idx];
 127     curPct = static_cast<double>(curCount) / totalCount;
 128     if (pct <= curPct) {
 129       // This is the desired bucket
 130       break;
 131     }
 132   }
 133
 134   if (lowPct) {
 135     *lowPct = prevPct;
 136   }
 137   if (highPct) {
 138     *highPct = curPct;
 139   }
 140   return idx;
 141 }
 142
 143 template <typename T, typename BucketType>
 144 template <typename CountFn, typename AvgFn>
 145 T HistogramBuckets<T, BucketType>::getPercentileEstimate(
 146     double pct,
 147     CountFn countFromBucket,
 148     AvgFn avgFromBucket) const {
 149
 150   // Find the bucket where this percentile falls
 151   double lowPct;
 152   double highPct;
 153   size_t bucketIdx =
 154       getPercentileBucketIdx(pct, countFromBucket, &lowPct, &highPct);
 155   if (lowPct == 0.0 && highPct == 0.0) {
 156     // Invalid range -- the buckets must all be empty
 157     // Return the default value for ValueType.
 158     return ValueType();
 159   }
 160   if (lowPct == highPct) {
 161     // Unlikely to have exact equality,
 162     // but just return the bucket average in this case.
 163     // We handle this here to avoid division by 0 below.
 164     return avgFromBucket(buckets_[bucketIdx]);
 165   }
 166
 167   CHECK_GE(pct, lowPct);
 168   CHECK_LE(pct, highPct);
 169   CHECK_LT(lowPct, highPct);
 170
 171   // Compute information about this bucket
 172   ValueType avg = avgFromBucket(buckets_[bucketIdx]);
 173   ValueType low;
 174   ValueType high;
 175   if (bucketIdx == 0) {
 176     if (avg > min_) {
 177       // This normally shouldn't happen.  This bucket is only supposed to track
 178       // values less than min_.  Most likely this means that integer overflow
 179       // occurred, and the code in avgFromBucket() returned a huge value
 180       // instead of a small one.  Just return the minimum possible value for
 181       // now.
 182       //
 183       // (Note that if the counter keeps being decremented, eventually it will
 184       // wrap and become small enough that we won't detect this any more, and
 185       // we will return bogus information.)
 186       LOG(ERROR) << "invalid average value in histogram minimum bucket: " <<
 187         avg << " > " << min_ << ": possible integer overflow?";
 188       return getBucketMin(bucketIdx);
 189     }
 190     // For the below-min bucket, just assume the lowest value ever seen is
 191     // twice as far away from min_ as avg.
 192     high = min_;
 193     low = high - (2 * (high - avg));
 194     // Adjust low in case it wrapped
 195     if (low > avg) {
 196       low = std::numeric_limits<ValueType>::min();
 197     }
 198   } else if (bucketIdx == buckets_.size() - 1) {
 199     if (avg < max_) {
 200       // Most likely this means integer overflow occurred.  See the comments
 201       // above in the minimum case.
 202       LOG(ERROR) << "invalid average value in histogram maximum bucket: " <<
 203         avg << " < " << max_ << ": possible integer overflow?";
 204       return getBucketMax(bucketIdx);
 205     }
 206     // Similarly for the above-max bucket, assume the highest value ever seen
 207     // is twice as far away from max_ as avg.
 208     low = max_;
 209     high = low + (2 * (avg - low));
 210     // Adjust high in case it wrapped
 211     if (high < avg) {
 212       high = std::numeric_limits<ValueType>::max();
 213     }
 214   } else {
 215     low = getBucketMin(bucketIdx);
 216     high = getBucketMax(bucketIdx);
 217     if (avg < low || avg > high) {
 218       // Most likely this means an integer overflow occurred.
 219       // See the comments above.  Return the midpoint between low and high
 220       // as a best guess, since avg is meaningless.
 221       LOG(ERROR) << "invalid average value in histogram bucket: " <<
 222         avg << " not in range [" << low << ", " << high <<
 223         "]: possible integer overflow?";
 224       return (low + high) / 2;
 225     }
 226   }
 227
 228   // Since we know the average value in this bucket, we can do slightly better
 229   // than just assuming the data points in this bucket are uniformly
 230   // distributed between low and high.
 231   //
 232   // Assume that the median value in this bucket is the same as the average
 233   // value.
 234   double medianPct = (lowPct + highPct) / 2.0;
 235   if (pct < medianPct) {
 236     // Assume that the data points lower than the median of this bucket
 237     // are uniformly distributed between low and avg
 238     double pctThroughSection = (pct - lowPct) / (medianPct - lowPct);
 239     return T(low + ((avg - low) * pctThroughSection));
 240   } else {
 241     // Assume that the data points greater than the median of this bucket
 242     // are uniformly distributed between avg and high
 243     double pctThroughSection = (pct - medianPct) / (highPct - medianPct);
 244     return T(avg + ((high - avg) * pctThroughSection));
 245   }
 246 }
 247
 248 } // detail
 249
 250
 251 template <typename T>
 252 std::string Histogram<T>::debugString() const {
 253   std::string ret = folly::to<std::string>(
 254       "num buckets: ", buckets_.getNumBuckets(),
 255       ", bucketSize: ", buckets_.getBucketSize(),
 256       ", min: ", buckets_.getMin(), ", max: ", buckets_.getMax(), "\n");
 257
 258   for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) {
 259     folly::toAppend("  ", buckets_.getBucketMin(i), ": ",
 260                     buckets_.getByIndex(i).count, "\n",
 261                     &ret);
 262   }
 263
 264   return ret;
 265 }
 266
 267 template <typename T>
 268 void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const {
 269   for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) {
 270     // Do not output empty buckets in order to reduce data file size.
 271     if (skipEmptyBuckets && getBucketByIndex(i).count == 0) {
 272       continue;
 273     }
 274     const auto& bucket = getBucketByIndex(i);
 275     out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t'
 276         << bucket.count << '\t' << bucket.sum << '\n';
 277   }
 278 }
 279
 280 } // folly