folly/stats/Histogram-defs.h

   1 /*
   2  * Copyright 2016 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <folly/Conv.h>
  20
  21 #include <glog/logging.h>
  22
  23 namespace folly {
  24
  25 namespace detail {
  26
  27 template <typename T, typename BucketT>
  28 HistogramBuckets<T, BucketT>::HistogramBuckets(ValueType bucketSize,
  29                                                ValueType min,
  30                                                ValueType max,
  31                                                const BucketType& defaultBucket)
  32   : bucketSize_(bucketSize),
  33     min_(min),
  34     max_(max) {
  35   CHECK_GT(bucketSize_, ValueType(0));
  36   CHECK_LT(min_, max_);
  37
  38   // Deliberately make this a signed type, because we're about
  39   // to compare it against max-min, which is nominally signed, too.
  40   int numBuckets = (max - min) / bucketSize;
  41   // Round up if the bucket size does not fit evenly
  42   if (numBuckets * bucketSize < max - min) {
  43     ++numBuckets;
  44   }
  45   // Add 2 for the extra 'below min' and 'above max' buckets
  46   numBuckets += 2;
  47   buckets_.assign(numBuckets, defaultBucket);
  48 }
  49
  50 template <typename T, typename BucketType>
  51 unsigned int HistogramBuckets<T, BucketType>::getBucketIdx(
  52     ValueType value) const {
  53   if (value < min_) {
  54     return 0;
  55   } else if (value >= max_) {
  56     return buckets_.size() - 1;
  57   } else {
  58     // the 1 is the below_min bucket
  59     return ((value - min_) / bucketSize_) + 1;
  60   }
  61 }
  62
  63 template <typename T, typename BucketType>
  64 template <typename CountFn>
  65 uint64_t HistogramBuckets<T, BucketType>::computeTotalCount(
  66     CountFn countFromBucket) const {
  67   uint64_t count = 0;
  68   for (unsigned int n = 0; n < buckets_.size(); ++n) {
  69     count += countFromBucket(const_cast<const BucketType&>(buckets_[n]));
  70   }
  71   return count;
  72 }
  73
  74 template <typename T, typename BucketType>
  75 template <typename CountFn>
  76 unsigned int HistogramBuckets<T, BucketType>::getPercentileBucketIdx(
  77     double pct,
  78     CountFn countFromBucket,
  79     double* lowPct, double* highPct) const {
  80   CHECK_GE(pct, 0.0);
  81   CHECK_LE(pct, 1.0);
  82
  83   unsigned int numBuckets = buckets_.size();
  84
  85   // Compute the counts in each bucket
  86   std::vector<uint64_t> counts(numBuckets);
  87   uint64_t totalCount = 0;
  88   for (unsigned int n = 0; n < numBuckets; ++n) {
  89     uint64_t bucketCount =
  90       countFromBucket(const_cast<const BucketType&>(buckets_[n]));
  91     counts[n] = bucketCount;
  92     totalCount += bucketCount;
  93   }
  94
  95   // If there are no elements, just return the lowest bucket.
  96   // Note that we return bucket 1, which is the first bucket in the
  97   // histogram range; bucket 0 is for all values below min_.
  98   if (totalCount == 0) {
  99     // Set lowPct and highPct both to 0.
 100     // getPercentileEstimate() will recognize this to mean that the histogram
 101     // is empty.
 102     if (lowPct) {
 103       *lowPct = 0.0;
 104     }
 105     if (highPct) {
 106       *highPct = 0.0;
 107     }
 108     return 1;
 109   }
 110
 111   // Loop through all the buckets, keeping track of each bucket's
 112   // percentile range: [0,10], [10,17], [17,45], etc.  When we find a range
 113   // that includes our desired percentile, we return that bucket index.
 114   double prevPct = 0.0;
 115   double curPct = 0.0;
 116   uint64_t curCount = 0;
 117   unsigned int idx;
 118   for (idx = 0; idx < numBuckets; ++idx) {
 119     if (counts[idx] == 0) {
 120       // skip empty buckets
 121       continue;
 122     }
 123
 124     prevPct = curPct;
 125     curCount += counts[idx];
 126     curPct = static_cast<double>(curCount) / totalCount;
 127     if (pct <= curPct) {
 128       // This is the desired bucket
 129       break;
 130     }
 131   }
 132
 133   if (lowPct) {
 134     *lowPct = prevPct;
 135   }
 136   if (highPct) {
 137     *highPct = curPct;
 138   }
 139   return idx;
 140 }
 141
 142 template <typename T, typename BucketType>
 143 template <typename CountFn, typename AvgFn>
 144 T HistogramBuckets<T, BucketType>::getPercentileEstimate(
 145     double pct,
 146     CountFn countFromBucket,
 147     AvgFn avgFromBucket) const {
 148
 149   // Find the bucket where this percentile falls
 150   double lowPct;
 151   double highPct;
 152   unsigned int bucketIdx = getPercentileBucketIdx(pct, countFromBucket,
 153                                                   &lowPct, &highPct);
 154   if (lowPct == 0.0 && highPct == 0.0) {
 155     // Invalid range -- the buckets must all be empty
 156     // Return the default value for ValueType.
 157     return ValueType();
 158   }
 159   if (lowPct == highPct) {
 160     // Unlikely to have exact equality,
 161     // but just return the bucket average in this case.
 162     // We handle this here to avoid division by 0 below.
 163     return avgFromBucket(buckets_[bucketIdx]);
 164   }
 165
 166   CHECK_GE(pct, lowPct);
 167   CHECK_LE(pct, highPct);
 168   CHECK_LT(lowPct, highPct);
 169
 170   // Compute information about this bucket
 171   ValueType avg = avgFromBucket(buckets_[bucketIdx]);
 172   ValueType low;
 173   ValueType high;
 174   if (bucketIdx == 0) {
 175     if (avg > min_) {
 176       // This normally shouldn't happen.  This bucket is only supposed to track
 177       // values less than min_.  Most likely this means that integer overflow
 178       // occurred, and the code in avgFromBucket() returned a huge value
 179       // instead of a small one.  Just return the minimum possible value for
 180       // now.
 181       //
 182       // (Note that if the counter keeps being decremented, eventually it will
 183       // wrap and become small enough that we won't detect this any more, and
 184       // we will return bogus information.)
 185       LOG(ERROR) << "invalid average value in histogram minimum bucket: " <<
 186         avg << " > " << min_ << ": possible integer overflow?";
 187       return getBucketMin(bucketIdx);
 188     }
 189     // For the below-min bucket, just assume the lowest value ever seen is
 190     // twice as far away from min_ as avg.
 191     high = min_;
 192     low = high - (2 * (high - avg));
 193     // Adjust low in case it wrapped
 194     if (low > avg) {
 195       low = std::numeric_limits<ValueType>::min();
 196     }
 197   } else if (bucketIdx == buckets_.size() - 1) {
 198     if (avg < max_) {
 199       // Most likely this means integer overflow occurred.  See the comments
 200       // above in the minimum case.
 201       LOG(ERROR) << "invalid average value in histogram maximum bucket: " <<
 202         avg << " < " << max_ << ": possible integer overflow?";
 203       return getBucketMax(bucketIdx);
 204     }
 205     // Similarly for the above-max bucket, assume the highest value ever seen
 206     // is twice as far away from max_ as avg.
 207     low = max_;
 208     high = low + (2 * (avg - low));
 209     // Adjust high in case it wrapped
 210     if (high < avg) {
 211       high = std::numeric_limits<ValueType>::max();
 212     }
 213   } else {
 214     low = getBucketMin(bucketIdx);
 215     high = getBucketMax(bucketIdx);
 216     if (avg < low || avg > high) {
 217       // Most likely this means an integer overflow occurred.
 218       // See the comments above.  Return the midpoint between low and high
 219       // as a best guess, since avg is meaningless.
 220       LOG(ERROR) << "invalid average value in histogram bucket: " <<
 221         avg << " not in range [" << low << ", " << high <<
 222         "]: possible integer overflow?";
 223       return (low + high) / 2;
 224     }
 225   }
 226
 227   // Since we know the average value in this bucket, we can do slightly better
 228   // than just assuming the data points in this bucket are uniformly
 229   // distributed between low and high.
 230   //
 231   // Assume that the median value in this bucket is the same as the average
 232   // value.
 233   double medianPct = (lowPct + highPct) / 2.0;
 234   if (pct < medianPct) {
 235     // Assume that the data points lower than the median of this bucket
 236     // are uniformly distributed between low and avg
 237     double pctThroughSection = (pct - lowPct) / (medianPct - lowPct);
 238     return low + ((avg - low) * pctThroughSection);
 239   } else {
 240     // Assume that the data points greater than the median of this bucket
 241     // are uniformly distributed between avg and high
 242     double pctThroughSection = (pct - medianPct) / (highPct - medianPct);
 243     return avg + ((high - avg) * pctThroughSection);
 244   }
 245 }
 246
 247 } // detail
 248
 249
 250 template <typename T>
 251 std::string Histogram<T>::debugString() const {
 252   std::string ret = folly::to<std::string>(
 253       "num buckets: ", buckets_.getNumBuckets(),
 254       ", bucketSize: ", buckets_.getBucketSize(),
 255       ", min: ", buckets_.getMin(), ", max: ", buckets_.getMax(), "\n");
 256
 257   for (unsigned int i = 0; i < buckets_.getNumBuckets(); ++i) {
 258     folly::toAppend("  ", buckets_.getBucketMin(i), ": ",
 259                     buckets_.getByIndex(i).count, "\n",
 260                     &ret);
 261   }
 262
 263   return ret;
 264 }
 265
 266 template <typename T>
 267 void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const {
 268   for (unsigned int i = 0; i < buckets_.getNumBuckets(); ++i) {
 269     // Do not output empty buckets in order to reduce data file size.
 270     if (skipEmptyBuckets && getBucketByIndex(i).count == 0) {
 271       continue;
 272     }
 273     const auto& bucket = getBucketByIndex(i);
 274     out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t'
 275         << bucket.count << '\t' << bucket.sum << '\n';
 276   }
 277 }
 278
 279 } // folly