folly/stats/Histogram-defs.h

   1 /*
   2  * Copyright 2014 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_HISTOGRAM_DEFS_H_
  18 #define FOLLY_HISTOGRAM_DEFS_H_
  19
  20 #include <folly/Conv.h>
  21
  22 #include <glog/logging.h>
  23
  24 namespace folly {
  25
  26 namespace detail {
  27
  28 template <typename T, typename BucketT>
  29 HistogramBuckets<T, BucketT>::HistogramBuckets(ValueType bucketSize,
  30                                                ValueType min,
  31                                                ValueType max,
  32                                                const BucketType& defaultBucket)
  33   : bucketSize_(bucketSize),
  34     min_(min),
  35     max_(max) {
  36   CHECK_GT(bucketSize_, ValueType(0));
  37   CHECK_LT(min_, max_);
  38
  39   // Deliberately make this a signed type, because we're about
  40   // to compare it against max-min, which is nominally signed, too.
  41   int numBuckets = (max - min) / bucketSize;
  42   // Round up if the bucket size does not fit evenly
  43   if (numBuckets * bucketSize < max - min) {
  44     ++numBuckets;
  45   }
  46   // Add 2 for the extra 'below min' and 'above max' buckets
  47   numBuckets += 2;
  48   buckets_.assign(numBuckets, defaultBucket);
  49 }
  50
  51 template <typename T, typename BucketType>
  52 unsigned int HistogramBuckets<T, BucketType>::getBucketIdx(
  53     ValueType value) const {
  54   if (value < min_) {
  55     return 0;
  56   } else if (value >= max_) {
  57     return buckets_.size() - 1;
  58   } else {
  59     // the 1 is the below_min bucket
  60     return ((value - min_) / bucketSize_) + 1;
  61   }
  62 }
  63
  64 template <typename T, typename BucketType>
  65 template <typename CountFn>
  66 unsigned int HistogramBuckets<T, BucketType>::getPercentileBucketIdx(
  67     double pct,
  68     CountFn countFromBucket,
  69     double* lowPct, double* highPct) const {
  70   CHECK_GE(pct, 0.0);
  71   CHECK_LE(pct, 1.0);
  72
  73   unsigned int numBuckets = buckets_.size();
  74
  75   // Compute the counts in each bucket
  76   std::vector<uint64_t> counts(numBuckets);
  77   uint64_t totalCount = 0;
  78   for (unsigned int n = 0; n < numBuckets; ++n) {
  79     uint64_t bucketCount =
  80       countFromBucket(const_cast<const BucketType&>(buckets_[n]));
  81     counts[n] = bucketCount;
  82     totalCount += bucketCount;
  83   }
  84
  85   // If there are no elements, just return the lowest bucket.
  86   // Note that we return bucket 1, which is the first bucket in the
  87   // histogram range; bucket 0 is for all values below min_.
  88   if (totalCount == 0) {
  89     // Set lowPct and highPct both to 0.
  90     // getPercentileEstimate() will recognize this to mean that the histogram
  91     // is empty.
  92     if (lowPct) {
  93       *lowPct = 0.0;
  94     }
  95     if (highPct) {
  96       *highPct = 0.0;
  97     }
  98     return 1;
  99   }
 100
 101   // Loop through all the buckets, keeping track of each bucket's
 102   // percentile range: [0,10], [10,17], [17,45], etc.  When we find a range
 103   // that includes our desired percentile, we return that bucket index.
 104   double prevPct = 0.0;
 105   double curPct = 0.0;
 106   uint64_t curCount = 0;
 107   unsigned int idx;
 108   for (idx = 0; idx < numBuckets; ++idx) {
 109     if (counts[idx] == 0) {
 110       // skip empty buckets
 111       continue;
 112     }
 113
 114     prevPct = curPct;
 115     curCount += counts[idx];
 116     curPct = static_cast<double>(curCount) / totalCount;
 117     if (pct <= curPct) {
 118       // This is the desired bucket
 119       break;
 120     }
 121   }
 122
 123   if (lowPct) {
 124     *lowPct = prevPct;
 125   }
 126   if (highPct) {
 127     *highPct = curPct;
 128   }
 129   return idx;
 130 }
 131
 132 template <typename T, typename BucketType>
 133 template <typename CountFn, typename AvgFn>
 134 T HistogramBuckets<T, BucketType>::getPercentileEstimate(
 135     double pct,
 136     CountFn countFromBucket,
 137     AvgFn avgFromBucket) const {
 138
 139   // Find the bucket where this percentile falls
 140   double lowPct;
 141   double highPct;
 142   unsigned int bucketIdx = getPercentileBucketIdx(pct, countFromBucket,
 143                                                   &lowPct, &highPct);
 144   if (lowPct == 0.0 && highPct == 0.0) {
 145     // Invalid range -- the buckets must all be empty
 146     // Return the default value for ValueType.
 147     return ValueType();
 148   }
 149   if (lowPct == highPct) {
 150     // Unlikely to have exact equality,
 151     // but just return the bucket average in this case.
 152     // We handle this here to avoid division by 0 below.
 153     return avgFromBucket(buckets_[bucketIdx]);
 154   }
 155
 156   CHECK_GE(pct, lowPct);
 157   CHECK_LE(pct, highPct);
 158   CHECK_LT(lowPct, highPct);
 159
 160   // Compute information about this bucket
 161   ValueType avg = avgFromBucket(buckets_[bucketIdx]);
 162   ValueType low;
 163   ValueType high;
 164   if (bucketIdx == 0) {
 165     if (avg > min_) {
 166       // This normally shouldn't happen.  This bucket is only supposed to track
 167       // values less than min_.  Most likely this means that integer overflow
 168       // occurred, and the code in avgFromBucket() returned a huge value
 169       // instead of a small one.  Just return the minimum possible value for
 170       // now.
 171       //
 172       // (Note that if the counter keeps being decremented, eventually it will
 173       // wrap and become small enough that we won't detect this any more, and
 174       // we will return bogus information.)
 175       LOG(ERROR) << "invalid average value in histogram minimum bucket: " <<
 176         avg << " > " << min_ << ": possible integer overflow?";
 177       return getBucketMin(bucketIdx);
 178     }
 179     // For the below-min bucket, just assume the lowest value ever seen is
 180     // twice as far away from min_ as avg.
 181     high = min_;
 182     low = high - (2 * (high - avg));
 183     // Adjust low in case it wrapped
 184     if (low > avg) {
 185       low = std::numeric_limits<ValueType>::min();
 186     }
 187   } else if (bucketIdx == buckets_.size() - 1) {
 188     if (avg < max_) {
 189       // Most likely this means integer overflow occurred.  See the comments
 190       // above in the minimum case.
 191       LOG(ERROR) << "invalid average value in histogram maximum bucket: " <<
 192         avg << " < " << max_ << ": possible integer overflow?";
 193       return getBucketMax(bucketIdx);
 194     }
 195     // Similarly for the above-max bucket, assume the highest value ever seen
 196     // is twice as far away from max_ as avg.
 197     low = max_;
 198     high = low + (2 * (avg - low));
 199     // Adjust high in case it wrapped
 200     if (high < avg) {
 201       high = std::numeric_limits<ValueType>::max();
 202     }
 203   } else {
 204     low = getBucketMin(bucketIdx);
 205     high = getBucketMax(bucketIdx);
 206     if (avg < low || avg > high) {
 207       // Most likely this means an integer overflow occurred.
 208       // See the comments above.  Return the midpoint between low and high
 209       // as a best guess, since avg is meaningless.
 210       LOG(ERROR) << "invalid average value in histogram bucket: " <<
 211         avg << " not in range [" << low << ", " << high <<
 212         "]: possible integer overflow?";
 213       return (low + high) / 2;
 214     }
 215   }
 216
 217   // Since we know the average value in this bucket, we can do slightly better
 218   // than just assuming the data points in this bucket are uniformly
 219   // distributed between low and high.
 220   //
 221   // Assume that the median value in this bucket is the same as the average
 222   // value.
 223   double medianPct = (lowPct + highPct) / 2.0;
 224   if (pct < medianPct) {
 225     // Assume that the data points lower than the median of this bucket
 226     // are uniformly distributed between low and avg
 227     double pctThroughSection = (pct - lowPct) / (medianPct - lowPct);
 228     return low + ((avg - low) * pctThroughSection);
 229   } else {
 230     // Assume that the data points greater than the median of this bucket
 231     // are uniformly distributed between avg and high
 232     double pctThroughSection = (pct - medianPct) / (highPct - medianPct);
 233     return avg + ((high - avg) * pctThroughSection);
 234   }
 235 }
 236
 237 } // detail
 238
 239
 240 template <typename T>
 241 std::string Histogram<T>::debugString() const {
 242   std::string ret = folly::to<std::string>(
 243       "num buckets: ", buckets_.getNumBuckets(),
 244       ", bucketSize: ", buckets_.getBucketSize(),
 245       ", min: ", buckets_.getMin(), ", max: ", buckets_.getMax(), "\n");
 246
 247   for (unsigned int i = 0; i < buckets_.getNumBuckets(); ++i) {
 248     folly::toAppend("  ", buckets_.getBucketMin(i), ": ",
 249                     buckets_.getByIndex(i).count, "\n",
 250                     &ret);
 251   }
 252
 253   return ret;
 254 }
 255
 256 template <typename T>
 257 void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const {
 258   for (unsigned int i = 0; i < buckets_.getNumBuckets(); ++i) {
 259     // Do not output empty buckets in order to reduce data file size.
 260     if (skipEmptyBuckets && getBucketByIndex(i).count == 0) {
 261       continue;
 262     }
 263     const auto& bucket = getBucketByIndex(i);
 264     out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t'
 265         << bucket.count << '\t' << bucket.sum << '\n';
 266   }
 267 }
 268
 269 } // folly
 270
 271 #endif // FOLLY_HISTOGRAM_DEFS_H_