2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <folly/Conv.h>
20 #include <folly/stats/Histogram.h>
22 #include <glog/logging.h>
28 template <typename T, typename BucketT>
29 HistogramBuckets<T, BucketT>::HistogramBuckets(ValueType bucketSize,
32 const BucketType& defaultBucket)
33 : bucketSize_(bucketSize),
36 CHECK_GT(bucketSize_, ValueType(0));
39 // Deliberately make this a signed type, because we're about
40 // to compare it against max-min, which is nominally signed, too.
41 int64_t numBuckets = int64_t((max - min) / bucketSize);
42 // Round up if the bucket size does not fit evenly
43 if (numBuckets * bucketSize < max - min) {
46 // Add 2 for the extra 'below min' and 'above max' buckets
48 buckets_.assign(size_t(numBuckets), defaultBucket);
51 template <typename T, typename BucketType>
52 size_t HistogramBuckets<T, BucketType>::getBucketIdx(ValueType value) const {
55 } else if (value >= max_) {
56 return buckets_.size() - 1;
58 // the 1 is the below_min bucket
59 return size_t(((value - min_) / bucketSize_) + 1);
63 template <typename T, typename BucketType>
64 template <typename CountFn>
65 uint64_t HistogramBuckets<T, BucketType>::computeTotalCount(
66 CountFn countFromBucket) const {
68 for (size_t n = 0; n < buckets_.size(); ++n) {
69 count += countFromBucket(const_cast<const BucketType&>(buckets_[n]));
74 template <typename T, typename BucketType>
75 template <typename CountFn>
76 size_t HistogramBuckets<T, BucketType>::getPercentileBucketIdx(
78 CountFn countFromBucket,
80 double* highPct) const {
84 auto numBuckets = buckets_.size();
86 // Compute the counts in each bucket
87 std::vector<uint64_t> counts(numBuckets);
88 uint64_t totalCount = 0;
89 for (size_t n = 0; n < numBuckets; ++n) {
90 uint64_t bucketCount =
91 countFromBucket(const_cast<const BucketType&>(buckets_[n]));
92 counts[n] = bucketCount;
93 totalCount += bucketCount;
96 // If there are no elements, just return the lowest bucket.
97 // Note that we return bucket 1, which is the first bucket in the
98 // histogram range; bucket 0 is for all values below min_.
99 if (totalCount == 0) {
100 // Set lowPct and highPct both to 0.
101 // getPercentileEstimate() will recognize this to mean that the histogram
112 // Loop through all the buckets, keeping track of each bucket's
113 // percentile range: [0,10], [10,17], [17,45], etc. When we find a range
114 // that includes our desired percentile, we return that bucket index.
115 double prevPct = 0.0;
117 uint64_t curCount = 0;
119 for (idx = 0; idx < numBuckets; ++idx) {
120 if (counts[idx] == 0) {
121 // skip empty buckets
126 curCount += counts[idx];
127 curPct = static_cast<double>(curCount) / totalCount;
129 // This is the desired bucket
143 template <typename T, typename BucketType>
144 template <typename CountFn, typename AvgFn>
145 T HistogramBuckets<T, BucketType>::getPercentileEstimate(
147 CountFn countFromBucket,
148 AvgFn avgFromBucket) const {
150 // Find the bucket where this percentile falls
154 getPercentileBucketIdx(pct, countFromBucket, &lowPct, &highPct);
155 if (lowPct == 0.0 && highPct == 0.0) {
156 // Invalid range -- the buckets must all be empty
157 // Return the default value for ValueType.
160 if (lowPct == highPct) {
161 // Unlikely to have exact equality,
162 // but just return the bucket average in this case.
163 // We handle this here to avoid division by 0 below.
164 return avgFromBucket(buckets_[bucketIdx]);
167 CHECK_GE(pct, lowPct);
168 CHECK_LE(pct, highPct);
169 CHECK_LT(lowPct, highPct);
171 // Compute information about this bucket
172 ValueType avg = avgFromBucket(buckets_[bucketIdx]);
175 if (bucketIdx == 0) {
177 // This normally shouldn't happen. This bucket is only supposed to track
178 // values less than min_. Most likely this means that integer overflow
179 // occurred, and the code in avgFromBucket() returned a huge value
180 // instead of a small one. Just return the minimum possible value for
183 // (Note that if the counter keeps being decremented, eventually it will
184 // wrap and become small enough that we won't detect this any more, and
185 // we will return bogus information.)
186 LOG(ERROR) << "invalid average value in histogram minimum bucket: " <<
187 avg << " > " << min_ << ": possible integer overflow?";
188 return getBucketMin(bucketIdx);
190 // For the below-min bucket, just assume the lowest value ever seen is
191 // twice as far away from min_ as avg.
193 low = high - (2 * (high - avg));
194 // Adjust low in case it wrapped
196 low = std::numeric_limits<ValueType>::min();
198 } else if (bucketIdx == buckets_.size() - 1) {
200 // Most likely this means integer overflow occurred. See the comments
201 // above in the minimum case.
202 LOG(ERROR) << "invalid average value in histogram maximum bucket: " <<
203 avg << " < " << max_ << ": possible integer overflow?";
204 return getBucketMax(bucketIdx);
206 // Similarly for the above-max bucket, assume the highest value ever seen
207 // is twice as far away from max_ as avg.
209 high = low + (2 * (avg - low));
210 // Adjust high in case it wrapped
212 high = std::numeric_limits<ValueType>::max();
215 low = getBucketMin(bucketIdx);
216 high = getBucketMax(bucketIdx);
217 if (avg < low || avg > high) {
218 // Most likely this means an integer overflow occurred.
219 // See the comments above. Return the midpoint between low and high
220 // as a best guess, since avg is meaningless.
221 LOG(ERROR) << "invalid average value in histogram bucket: " <<
222 avg << " not in range [" << low << ", " << high <<
223 "]: possible integer overflow?";
224 return (low + high) / 2;
228 // Since we know the average value in this bucket, we can do slightly better
229 // than just assuming the data points in this bucket are uniformly
230 // distributed between low and high.
232 // Assume that the median value in this bucket is the same as the average
234 double medianPct = (lowPct + highPct) / 2.0;
235 if (pct < medianPct) {
236 // Assume that the data points lower than the median of this bucket
237 // are uniformly distributed between low and avg
238 double pctThroughSection = (pct - lowPct) / (medianPct - lowPct);
239 return T(low + ((avg - low) * pctThroughSection));
241 // Assume that the data points greater than the median of this bucket
242 // are uniformly distributed between avg and high
243 double pctThroughSection = (pct - medianPct) / (highPct - medianPct);
244 return T(avg + ((high - avg) * pctThroughSection));
251 template <typename T>
252 std::string Histogram<T>::debugString() const {
253 std::string ret = folly::to<std::string>(
254 "num buckets: ", buckets_.getNumBuckets(),
255 ", bucketSize: ", buckets_.getBucketSize(),
256 ", min: ", buckets_.getMin(), ", max: ", buckets_.getMax(), "\n");
258 for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) {
259 folly::toAppend(" ", buckets_.getBucketMin(i), ": ",
260 buckets_.getByIndex(i).count, "\n",
267 template <typename T>
268 void Histogram<T>::toTSV(std::ostream& out, bool skipEmptyBuckets) const {
269 for (size_t i = 0; i < buckets_.getNumBuckets(); ++i) {
270 // Do not output empty buckets in order to reduce data file size.
271 if (skipEmptyBuckets && getBucketByIndex(i).count == 0) {
274 const auto& bucket = getBucketByIndex(i);
275 out << getBucketMin(i) << '\t' << getBucketMax(i) << '\t'
276 << bucket.count << '\t' << bucket.sum << '\n';