folly/stats/Histogram-defs.h

   1 /*
   2  * Copyright 2014 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_HISTOGRAM_DEFS_H_
  18 #define FOLLY_HISTOGRAM_DEFS_H_
  19
  20 #include <folly/Conv.h>
  21
  22 #include <glog/logging.h>
  23
  24 namespace folly {
  25
  26 namespace detail {
  27
  28 template <typename T, typename BucketT>
  29 HistogramBuckets<T, BucketT>::HistogramBuckets(ValueType bucketSize,
  30                                                ValueType min,
  31                                                ValueType max,
  32                                                const BucketType& defaultBucket)
  33   : bucketSize_(bucketSize),
  34     min_(min),
  35     max_(max) {
  36   CHECK_GT(bucketSize_, ValueType(0));
  37   CHECK_LT(min_, max_);
  38
  39   unsigned int numBuckets = (max - min) / bucketSize;
  40   // Round up if the bucket size does not fit evenly
  41   if (numBuckets * bucketSize < max - min) {
  42     ++numBuckets;
  43   }
  44   // Add 2 for the extra 'below min' and 'above max' buckets
  45   numBuckets += 2;
  46   buckets_.assign(numBuckets, defaultBucket);
  47 }
  48
  49 template <typename T, typename BucketType>
  50 unsigned int HistogramBuckets<T, BucketType>::getBucketIdx(
  51     ValueType value) const {
  52   if (value < min_) {
  53     return 0;
  54   } else if (value >= max_) {
  55     return buckets_.size() - 1;
  56   } else {
  57     // the 1 is the below_min bucket
  58     return ((value - min_) / bucketSize_) + 1;
  59   }
  60 }
  61
  62 template <typename T, typename BucketType>
  63 template <typename CountFn>
  64 unsigned int HistogramBuckets<T, BucketType>::getPercentileBucketIdx(
  65     double pct,
  66     CountFn countFromBucket,
  67     double* lowPct, double* highPct) const {
  68   CHECK_GE(pct, 0.0);
  69   CHECK_LE(pct, 1.0);
  70
  71   unsigned int numBuckets = buckets_.size();
  72
  73   // Compute the counts in each bucket
  74   std::vector<uint64_t> counts(numBuckets);
  75   uint64_t totalCount = 0;
  76   for (unsigned int n = 0; n < numBuckets; ++n) {
  77     uint64_t bucketCount =
  78       countFromBucket(const_cast<const BucketType&>(buckets_[n]));
  79     counts[n] = bucketCount;
  80     totalCount += bucketCount;
  81   }
  82
  83   // If there are no elements, just return the lowest bucket.
  84   // Note that we return bucket 1, which is the first bucket in the
  85   // histogram range; bucket 0 is for all values below min_.
  86   if (totalCount == 0) {
  87     // Set lowPct and highPct both to 0.
  88     // getPercentileEstimate() will recognize this to mean that the histogram
  89     // is empty.
  90     if (lowPct) {
  91       *lowPct = 0.0;
  92     }
  93     if (highPct) {
  94       *highPct = 0.0;
  95     }
  96     return 1;
  97   }
  98
  99   // Loop through all the buckets, keeping track of each bucket's
 100   // percentile range: [0,10], [10,17], [17,45], etc.  When we find a range
 101   // that includes our desired percentile, we return that bucket index.
 102   double prevPct = 0.0;
 103   double curPct = 0.0;
 104   uint64_t curCount = 0;
 105   unsigned int idx;
 106   for (idx = 0; idx < numBuckets; ++idx) {
 107     if (counts[idx] == 0) {
 108       // skip empty buckets
 109       continue;
 110     }
 111
 112     prevPct = curPct;
 113     curCount += counts[idx];
 114     curPct = static_cast<double>(curCount) / totalCount;
 115     if (pct <= curPct) {
 116       // This is the desired bucket
 117       break;
 118     }
 119   }
 120
 121   if (lowPct) {
 122     *lowPct = prevPct;
 123   }
 124   if (highPct) {
 125     *highPct = curPct;
 126   }
 127   return idx;
 128 }
 129
 130 template <typename T, typename BucketType>
 131 template <typename CountFn, typename AvgFn>
 132 T HistogramBuckets<T, BucketType>::getPercentileEstimate(
 133     double pct,
 134     CountFn countFromBucket,
 135     AvgFn avgFromBucket) const {
 136
 137   // Find the bucket where this percentile falls
 138   double lowPct;
 139   double highPct;
 140   unsigned int bucketIdx = getPercentileBucketIdx(pct, countFromBucket,
 141                                                   &lowPct, &highPct);
 142   if (lowPct == 0.0 && highPct == 0.0) {
 143     // Invalid range -- the buckets must all be empty
 144     // Return the default value for ValueType.
 145     return ValueType();
 146   }
 147   if (lowPct == highPct) {
 148     // Unlikely to have exact equality,
 149     // but just return the bucket average in this case.
 150     // We handle this here to avoid division by 0 below.
 151     return avgFromBucket(buckets_[bucketIdx]);
 152   }
 153
 154   CHECK_GE(pct, lowPct);
 155   CHECK_LE(pct, highPct);
 156   CHECK_LT(lowPct, highPct);
 157
 158   // Compute information about this bucket
 159   ValueType avg = avgFromBucket(buckets_[bucketIdx]);
 160   ValueType low;
 161   ValueType high;
 162   if (bucketIdx == 0) {
 163     if (avg > min_) {
 164       // This normally shouldn't happen.  This bucket is only supposed to track
 165       // values less than min_.  Most likely this means that integer overflow
 166       // occurred, and the code in avgFromBucket() returned a huge value
 167       // instead of a small one.  Just return the minimum possible value for
 168       // now.
 169       //
 170       // (Note that if the counter keeps being decremented, eventually it will
 171       // wrap and become small enough that we won't detect this any more, and
 172       // we will return bogus information.)
 173       LOG(ERROR) << "invalid average value in histogram minimum bucket: " <<
 174         avg << " > " << min_ << ": possible integer overflow?";
 175       return getBucketMin(bucketIdx);
 176     }
 177     // For the below-min bucket, just assume the lowest value ever seen is
 178     // twice as far away from min_ as avg.
 179     high = min_;
 180     low = high - (2 * (high - avg));
 181     // Adjust low in case it wrapped
 182     if (low > avg) {
 183       low = std::numeric_limits<ValueType>::min();
 184     }
 185   } else if (bucketIdx == buckets_.size() - 1) {
 186     if (avg < max_) {
 187       // Most likely this means integer overflow occurred.  See the comments
 188       // above in the minimum case.
 189       LOG(ERROR) << "invalid average value in histogram maximum bucket: " <<
 190         avg << " < " << max_ << ": possible integer overflow?";
 191       return getBucketMax(bucketIdx);
 192     }
 193     // Similarly for the above-max bucket, assume the highest value ever seen
 194     // is twice as far away from max_ as avg.
 195     low = max_;
 196     high = low + (2 * (avg - low));
 197     // Adjust high in case it wrapped
 198     if (high < avg) {
 199       high = std::numeric_limits<ValueType>::max();
 200     }
 201   } else {
 202     low = getBucketMin(bucketIdx);
 203     high = getBucketMax(bucketIdx);
 204     if (avg < low || avg > high) {
 205       // Most likely this means an integer overflow occurred.
 206       // See the comments above.  Return the midpoint between low and high
 207       // as a best guess, since avg is meaningless.
 208       LOG(ERROR) << "invalid average value in histogram bucket: " <<
 209         avg << " not in range [" << low << ", " << high <<
 210         "]: possible integer overflow?";
 211       return (low + high) / 2;
 212     }
 213   }
 214
 215   // Since we know the average value in this bucket, we can do slightly better
 216   // than just assuming the data points in this bucket are uniformly
 217   // distributed between low and high.
 218   //
 219   // Assume that the median value in this bucket is the same as the average
 220   // value.
 221   double medianPct = (lowPct + highPct) / 2.0;
 222   if (pct < medianPct) {
 223     // Assume that the data points lower than the median of this bucket
 224     // are uniformly distributed between low and avg
 225     double pctThroughSection = (pct - lowPct) / (medianPct - lowPct);
 226     return low + ((avg - low) * pctThroughSection);
 227   } else {
 228     // Assume that the data points greater than the median of this bucket
 229     // are uniformly distributed between avg and high
 230     double pctThroughSection = (pct - medianPct) / (highPct - medianPct);
 231     return avg + ((high - avg) * pctThroughSection);
 232   }
 233 }
 234
 235 } // detail
 236
 237
 238 template <typename T>
 239 std::string Histogram<T>::debugString() const {
 240   std::string ret = folly::to<std::string>(
 241       "num buckets: ", buckets_.getNumBuckets(),
 242       ", bucketSize: ", buckets_.getBucketSize(),
 243       ", min: ", buckets_.getMin(), ", max: ", buckets_.getMax(), "\n");
 244
 245   for (unsigned int i = 0; i < buckets_.getNumBuckets(); ++i) {
 246     folly::toAppend("  ", buckets_.getBucketMin(i), ": ",
 247                     buckets_.getByIndex(i).count, "\n",
 248                     &ret);
 249   }
 250
 251   return ret;
 252 }
 253
 254 template <typename T>
 255 void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const {
 256   for (unsigned int i = 0; i < buckets_.getNumBuckets(); ++i) {
 257     // Do not output empty buckets in order to reduce data file size.
 258     if (skipEmptyBuckets && getBucketByIndex(i).count == 0) {
 259       continue;
 260     }
 261     const auto& bucket = getBucketByIndex(i);
 262     out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t'
 263         << bucket.count << '\t' << bucket.sum << '\n';
 264   }
 265 }
 266
 267 } // folly
 268
 269 #endif // FOLLY_HISTOGRAM_DEFS_H_