folly/stats/Histogram-defs.h

   1 /*
   2  * Copyright 2017 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <folly/Conv.h>
  20 #include <folly/stats/Histogram.h>
  21
  22 #include <glog/logging.h>
  23
  24 namespace folly {
  25
  26 namespace detail {
  27
  28 template <typename T, typename BucketT>
  29 HistogramBuckets<T, BucketT>::HistogramBuckets(
  30     ValueType bucketSize,
  31     ValueType min,
  32     ValueType max,
  33     const BucketType& defaultBucket)
  34     : bucketSize_(bucketSize), min_(min), max_(max) {
  35   CHECK_GT(bucketSize_, ValueType(0));
  36   CHECK_LT(min_, max_);
  37
  38   // Deliberately make this a signed type, because we're about
  39   // to compare it against max-min, which is nominally signed, too.
  40   int64_t numBuckets = int64_t((max - min) / bucketSize);
  41   // Round up if the bucket size does not fit evenly
  42   if (numBuckets * bucketSize < max - min) {
  43     ++numBuckets;
  44   }
  45   // Add 2 for the extra 'below min' and 'above max' buckets
  46   numBuckets += 2;
  47   buckets_.assign(size_t(numBuckets), defaultBucket);
  48 }
  49
  50 template <typename T, typename BucketType>
  51 size_t HistogramBuckets<T, BucketType>::getBucketIdx(ValueType value) const {
  52   if (value < min_) {
  53     return 0;
  54   } else if (value >= max_) {
  55     return buckets_.size() - 1;
  56   } else {
  57     // the 1 is the below_min bucket
  58     return size_t(((value - min_) / bucketSize_) + 1);
  59   }
  60 }
  61
  62 template <typename T, typename BucketType>
  63 template <typename CountFn>
  64 uint64_t HistogramBuckets<T, BucketType>::computeTotalCount(
  65     CountFn countFromBucket) const {
  66   uint64_t count = 0;
  67   for (size_t n = 0; n < buckets_.size(); ++n) {
  68     count += countFromBucket(const_cast<const BucketType&>(buckets_[n]));
  69   }
  70   return count;
  71 }
  72
  73 template <typename T, typename BucketType>
  74 template <typename CountFn>
  75 size_t HistogramBuckets<T, BucketType>::getPercentileBucketIdx(
  76     double pct,
  77     CountFn countFromBucket,
  78     double* lowPct,
  79     double* highPct) const {
  80   CHECK_GE(pct, 0.0);
  81   CHECK_LE(pct, 1.0);
  82
  83   auto numBuckets = buckets_.size();
  84
  85   // Compute the counts in each bucket
  86   std::vector<uint64_t> counts(numBuckets);
  87   uint64_t totalCount = 0;
  88   for (size_t n = 0; n < numBuckets; ++n) {
  89     uint64_t bucketCount =
  90         countFromBucket(const_cast<const BucketType&>(buckets_[n]));
  91     counts[n] = bucketCount;
  92     totalCount += bucketCount;
  93   }
  94
  95   // If there are no elements, just return the lowest bucket.
  96   // Note that we return bucket 1, which is the first bucket in the
  97   // histogram range; bucket 0 is for all values below min_.
  98   if (totalCount == 0) {
  99     // Set lowPct and highPct both to 0.
 100     // getPercentileEstimate() will recognize this to mean that the histogram
 101     // is empty.
 102     if (lowPct) {
 103       *lowPct = 0.0;
 104     }
 105     if (highPct) {
 106       *highPct = 0.0;
 107     }
 108     return 1;
 109   }
 110
 111   // Loop through all the buckets, keeping track of each bucket's
 112   // percentile range: [0,10], [10,17], [17,45], etc.  When we find a range
 113   // that includes our desired percentile, we return that bucket index.
 114   double prevPct = 0.0;
 115   double curPct = 0.0;
 116   uint64_t curCount = 0;
 117   size_t idx;
 118   for (idx = 0; idx < numBuckets; ++idx) {
 119     if (counts[idx] == 0) {
 120       // skip empty buckets
 121       continue;
 122     }
 123
 124     prevPct = curPct;
 125     curCount += counts[idx];
 126     curPct = static_cast<double>(curCount) / totalCount;
 127     if (pct <= curPct) {
 128       // This is the desired bucket
 129       break;
 130     }
 131   }
 132
 133   if (lowPct) {
 134     *lowPct = prevPct;
 135   }
 136   if (highPct) {
 137     *highPct = curPct;
 138   }
 139   return idx;
 140 }
 141
 142 template <typename T, typename BucketType>
 143 template <typename CountFn, typename AvgFn>
 144 T HistogramBuckets<T, BucketType>::getPercentileEstimate(
 145     double pct,
 146     CountFn countFromBucket,
 147     AvgFn avgFromBucket) const {
 148   // Find the bucket where this percentile falls
 149   double lowPct;
 150   double highPct;
 151   size_t bucketIdx =
 152       getPercentileBucketIdx(pct, countFromBucket, &lowPct, &highPct);
 153   if (lowPct == 0.0 && highPct == 0.0) {
 154     // Invalid range -- the buckets must all be empty
 155     // Return the default value for ValueType.
 156     return ValueType();
 157   }
 158   if (lowPct == highPct) {
 159     // Unlikely to have exact equality,
 160     // but just return the bucket average in this case.
 161     // We handle this here to avoid division by 0 below.
 162     return avgFromBucket(buckets_[bucketIdx]);
 163   }
 164
 165   CHECK_GE(pct, lowPct);
 166   CHECK_LE(pct, highPct);
 167   CHECK_LT(lowPct, highPct);
 168
 169   // Compute information about this bucket
 170   ValueType avg = avgFromBucket(buckets_[bucketIdx]);
 171   ValueType low;
 172   ValueType high;
 173   if (bucketIdx == 0) {
 174     if (avg > min_) {
 175       // This normally shouldn't happen.  This bucket is only supposed to track
 176       // values less than min_.  Most likely this means that integer overflow
 177       // occurred, and the code in avgFromBucket() returned a huge value
 178       // instead of a small one.  Just return the minimum possible value for
 179       // now.
 180       //
 181       // (Note that if the counter keeps being decremented, eventually it will
 182       // wrap and become small enough that we won't detect this any more, and
 183       // we will return bogus information.)
 184       LOG(ERROR) << "invalid average value in histogram minimum bucket: " << avg
 185                  << " > " << min_ << ": possible integer overflow?";
 186       return getBucketMin(bucketIdx);
 187     }
 188     // For the below-min bucket, just assume the lowest value ever seen is
 189     // twice as far away from min_ as avg.
 190     high = min_;
 191     low = high - (2 * (high - avg));
 192     // Adjust low in case it wrapped
 193     if (low > avg) {
 194       low = std::numeric_limits<ValueType>::min();
 195     }
 196   } else if (bucketIdx == buckets_.size() - 1) {
 197     if (avg < max_) {
 198       // Most likely this means integer overflow occurred.  See the comments
 199       // above in the minimum case.
 200       LOG(ERROR) << "invalid average value in histogram maximum bucket: " << avg
 201                  << " < " << max_ << ": possible integer overflow?";
 202       return getBucketMax(bucketIdx);
 203     }
 204     // Similarly for the above-max bucket, assume the highest value ever seen
 205     // is twice as far away from max_ as avg.
 206     low = max_;
 207     high = low + (2 * (avg - low));
 208     // Adjust high in case it wrapped
 209     if (high < avg) {
 210       high = std::numeric_limits<ValueType>::max();
 211     }
 212   } else {
 213     low = getBucketMin(bucketIdx);
 214     high = getBucketMax(bucketIdx);
 215     if (avg < low || avg > high) {
 216       // Most likely this means an integer overflow occurred.
 217       // See the comments above.  Return the midpoint between low and high
 218       // as a best guess, since avg is meaningless.
 219       LOG(ERROR) << "invalid average value in histogram bucket: " << avg
 220                  << " not in range [" << low << ", " << high
 221                  << "]: possible integer overflow?";
 222       return (low + high) / 2;
 223     }
 224   }
 225
 226   // Since we know the average value in this bucket, we can do slightly better
 227   // than just assuming the data points in this bucket are uniformly
 228   // distributed between low and high.
 229   //
 230   // Assume that the median value in this bucket is the same as the average
 231   // value.
 232   double medianPct = (lowPct + highPct) / 2.0;
 233   if (pct < medianPct) {
 234     // Assume that the data points lower than the median of this bucket
 235     // are uniformly distributed between low and avg
 236     double pctThroughSection = (pct - lowPct) / (medianPct - lowPct);
 237     return T(low + ((avg - low) * pctThroughSection));
 238   } else {
 239     // Assume that the data points greater than the median of this bucket
 240     // are uniformly distributed between avg and high
 241     double pctThroughSection = (pct - medianPct) / (highPct - medianPct);
 242     return T(avg + ((high - avg) * pctThroughSection));
 243   }
 244 }
 245
 246 } // namespace detail
 247
 248 template <typename T>
 249 std::string Histogram<T>::debugString() const {
 250   std::string ret = folly::to<std::string>(
 251       "num buckets: ",
 252       buckets_.getNumBuckets(),
 253       ", bucketSize: ",
 254       buckets_.getBucketSize(),
 255       ", min: ",
 256       buckets_.getMin(),
 257       ", max: ",
 258       buckets_.getMax(),
 259       "\n");
 260
 261   for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) {
 262     folly::toAppend(
 263         "  ",
 264         buckets_.getBucketMin(i),
 265         ": ",
 266         buckets_.getByIndex(i).count,
 267         "\n",
 268         &ret);
 269   }
 270
 271   return ret;
 272 }
 273
 274 template <typename T>
 275 void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const {
 276   for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) {
 277     // Do not output empty buckets in order to reduce data file size.
 278     if (skipEmptyBuckets && getBucketByIndex(i).count == 0) {
 279       continue;
 280     }
 281     const auto& bucket = getBucketByIndex(i);
 282     out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t' << bucket.count
 283         << '\t' << bucket.sum << '\n';
 284   }
 285 }
 286
 287 } // namespace folly