Port TimeseriesHistogram from common/stats to folly

author Stephen Chen <tracelog@fb.com>

Mon, 11 Nov 2013 21:32:34 +0000 (13:32 -0800)

committer Peter Griess <pgriess@fb.com>

Tue, 26 Nov 2013 15:05:17 +0000 (07:05 -0800)
author Stephen Chen <tracelog@fb.com>
Mon, 11 Nov 2013 21:32:34 +0000 (13:32 -0800)
committer Peter Griess <pgriess@fb.com>
Tue, 26 Nov 2013 15:05:17 +0000 (07:05 -0800)
diff --git a/folly/stats/Instantiations.cpp b/folly/stats/Instantiations.cpp

index e94421620db6ece0ef24aac6b944c2b26026f144..40aa80202f6e71fe4104dc69554fc2e3d2840d1f 100644 (file)
--- a/folly/stats/Instantiations.cpp
+++ b/folly/stats/Instantiations.cpp
@@ -30,11 +30,15 @@
  #include "folly/stats/MultiLevelTimeSeries.h"
  #include "folly/stats/MultiLevelTimeSeries-defs.h"
  
+#include "folly/stats/TimeseriesHistogram.h"
+#include "folly/stats/TimeseriesHistogram-defs.h"
+
  namespace folly {
  
  template class BucketedTimeSeries<int64_t>;
  template class Histogram<int64_t>;
  template class detail::HistogramBuckets<int64_t, Histogram<int64_t>::Bucket>;
  template class MultiLevelTimeSeries<int64_t>;
+template class TimeseriesHistogram<int64_t>;
  
  } // folly
diff --git a/folly/stats/MultiLevelTimeSeries.h b/folly/stats/MultiLevelTimeSeries.h

index 32e5d639a91a99cb689e54773847616f16bd7b8b..849cc922ce47b4895a869bc60f1e24dc3b8d29b0 100644 (file)
--- a/folly/stats/MultiLevelTimeSeries.h
+++ b/folly/stats/MultiLevelTimeSeries.h
@@ -21,6 +21,7 @@
  #include <string>
  #include <vector>
  
+#include <glog/logging.h>
  #include "folly/stats/BucketedTimeSeries.h"
  
  namespace folly {
diff --git a/folly/stats/TimeseriesHistogram-defs.h b/folly/stats/TimeseriesHistogram-defs.h

new file mode 100644 (file)

index 0000000..ddf71d3
--- /dev/null
+++ b/folly/stats/TimeseriesHistogram-defs.h
@@ -0,0 +1,231 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_TIMESERIES_HISTOGRAM_DEF_H_
+#define FOLLY_TIMESERIES_HISTOGRAM_DEF_H_
+
+#include "folly/Conv.h"
+#include "folly/stats/Histogram-defs.h"
+#include "folly/stats/MultiLevelTimeSeries-defs.h"
+#include "folly/stats/BucketedTimeSeries-defs.h"
+
+namespace folly {
+
+template <class T, class TT, class C>
+template <typename ReturnType>
+ReturnType TimeseriesHistogram<T, TT, C>::avg(int level) const {
+  ValueType total = ValueType();
+  int64_t count = 0;
+  for (int b = 0; b < buckets_.getNumBuckets(); ++b) {
+    const auto& levelObj = buckets_.getByIndex(b).getLevel(level);
+    total += levelObj.sum();
+    count += levelObj.count();
+  }
+  return folly::detail::avgHelper<ReturnType>(total, count);
+}
+
+template <class T, class TT, class C>
+template <typename ReturnType>
+ReturnType TimeseriesHistogram<T, TT, C>::avg(TimeType start,
+                                              TimeType end) const {
+  ValueType total = ValueType();
+  int64_t count = 0;
+  for (int b = 0; b < buckets_.getNumBuckets(); ++b) {
+    const auto& levelObj = buckets_.getByIndex(b).getLevel(start, end);
+    total += levelObj.sum(start, end);
+    count += levelObj.count(start, end);
+  }
+  return folly::detail::avgHelper<ReturnType>(total, count);
+}
+
+template <class T, class TT, class C>
+template <typename ReturnType>
+ReturnType TimeseriesHistogram<T, TT, C>::rate(TimeType start,
+                                               TimeType end) const {
+  ValueType total = ValueType();
+  TimeType elapsed(0);
+  for (int b = 0; b < buckets_.getNumBuckets(); ++b) {
+    const auto& level = buckets_.getByIndex(b).getLevel(start);
+    total += level.sum(start, end);
+    elapsed = std::max(elapsed, level.elapsed(start, end));
+  }
+  return folly::detail::rateHelper<ReturnType, TimeType, TimeType>(
+      total, elapsed);
+}
+
+template <typename T, typename TT, typename C>
+TimeseriesHistogram<T, TT, C>::TimeseriesHistogram(ValueType bucketSize,
+                                            ValueType min,
+                                            ValueType max,
+                                            const ContainerType& copyMe)
+  : buckets_(bucketSize, min, max, copyMe),
+    haveNotSeenValue_(true),
+    singleUniqueValue_(false) {
+}
+
+template <typename T, typename TT, typename C>
+void TimeseriesHistogram<T, TT, C>::addValue(TimeType now,
+                                             const ValueType& value) {
+  buckets_.getByValue(value).addValue(now, value);
+  maybeHandleSingleUniqueValue(value);
+}
+
+template <typename T, typename TT, typename C>
+void TimeseriesHistogram<T, TT, C>::addValue(TimeType now,
+                                      const ValueType& value,
+                                      int64_t times) {
+  buckets_.getByValue(value).addValue(now, value, times);
+  maybeHandleSingleUniqueValue(value);
+}
+
+template <typename T, typename TT, typename C>
+void TimeseriesHistogram<T, TT, C>::addValues(
+    TimeType now, const folly::Histogram<ValueType>& hist) {
+  CHECK_EQ(hist.getMin(), getMin());
+  CHECK_EQ(hist.getMax(), getMax());
+  CHECK_EQ(hist.getBucketSize(), getBucketSize());
+  CHECK_EQ(hist.getNumBuckets(), getNumBuckets());
+
+  for (unsigned int n = 0; n < hist.getNumBuckets(); ++n) {
+    const typename folly::Histogram<ValueType>::Bucket& histBucket =
+      hist.getBucketByIndex(n);
+    Bucket& myBucket = buckets_.getByIndex(n);
+    myBucket.addValueAggregated(now, histBucket.sum, histBucket.count);
+  }
+
+  // We don't bother with the singleUniqueValue_ tracking.
+  haveNotSeenValue_ = false;
+  singleUniqueValue_ = false;
+}
+
+template <typename T, typename TT, typename C>
+void TimeseriesHistogram<T, TT, C>::maybeHandleSingleUniqueValue(
+  const ValueType& value) {
+  if (haveNotSeenValue_) {
+    firstValue_ = value;
+    singleUniqueValue_ = true;
+    haveNotSeenValue_ = false;
+  } else if (singleUniqueValue_) {
+    if (value != firstValue_) {
+      singleUniqueValue_ = false;
+    }
+  }
+}
+
+template <typename T, typename TT, typename C>
+T TimeseriesHistogram<T, TT, C>::getPercentileEstimate(double pct,
+                                                       int level) const {
+  if (singleUniqueValue_) {
+    return firstValue_;
+  }
+
+  return buckets_.getPercentileEstimate(pct / 100.0, CountFromLevel(level),
+                                        AvgFromLevel(level));
+}
+
+template <typename T, typename TT, typename C>
+T TimeseriesHistogram<T, TT, C>::getPercentileEstimate(double pct,
+                                                TimeType start,
+                                                TimeType end) const {
+  if (singleUniqueValue_) {
+    return firstValue_;
+  }
+
+  return buckets_.getPercentileEstimate(pct / 100.0,
+                                        CountFromInterval(start, end),
+                                        AvgFromInterval<T>(start, end));
+}
+
+template <typename T, typename TT, typename C>
+int TimeseriesHistogram<T, TT, C>::getPercentileBucketIdx(
+  double pct,
+  int level
+) const {
+  return buckets_.getPercentileBucketIdx(pct / 100.0, CountFromLevel(level));
+}
+
+template <typename T, typename TT, typename C>
+int TimeseriesHistogram<T, TT, C>::getPercentileBucketIdx(double pct,
+                                                   TimeType start,
+                                                   TimeType end) const {
+  return buckets_.getPercentileBucketIdx(pct / 100.0,
+                                         CountFromInterval(start, end));
+}
+
+template <typename T, typename TT, typename C>
+T TimeseriesHistogram<T, TT, C>::rate(int level) const {
+  ValueType total = ValueType();
+  TimeType elapsed(0);
+  for (int b = 0; b < buckets_.getNumBuckets(); ++b) {
+    const auto& levelObj = buckets_.getByIndex(b).getLevel(level);
+    total += levelObj.sum();
+    elapsed = std::max(elapsed, levelObj.elapsed());
+  }
+  return elapsed == TimeType(0) ? 0 : (total / elapsed.count());
+}
+
+template <typename T, typename TT, typename C>
+void TimeseriesHistogram<T, TT, C>::clear() {
+  for (int i = 0; i < buckets_.getNumBuckets(); i++) {
+    buckets_.getByIndex(i).clear();
+  }
+}
+
+template <typename T, typename TT, typename C>
+void TimeseriesHistogram<T, TT, C>::update(TimeType now) {
+  for (int i = 0; i < buckets_.getNumBuckets(); i++) {
+    buckets_.getByIndex(i).update(now);
+  }
+}
+
+template <typename T, typename TT, typename C>
+std::string TimeseriesHistogram<T, TT, C>::getString(int level) const {
+  std::string result;
+
+  for (int i = 0; i < buckets_.getNumBuckets(); i++) {
+    if (i > 0) {
+      toAppend(",", &result);
+    }
+    const ContainerType& cont = buckets_.getByIndex(i);
+    toAppend(buckets_.getBucketMin(i),
+             ":", cont.count(level),
+             ":", cont.avg<ValueType>(level), &result);
+  }
+
+  return result;
+}
+
+template <typename T, typename TT, typename C>
+std::string TimeseriesHistogram<T, TT, C>::getString(TimeType start,
+                                                     TimeType end) const {
+  std::string result;
+
+  for (int i = 0; i < buckets_.getNumBuckets(); i++) {
+    if (i > 0) {
+      toAppend(",", &result);
+    }
+    const ContainerType& cont = buckets_.getByIndex(i);
+    toAppend(buckets_.getBucketMin(i),
+             ":", cont.count(start, end),
+             ":", cont.avg(start, end), &result);
+  }
+
+  return result;
+}
+
+}  // namespace folly
+
+#endif
diff --git a/folly/stats/TimeseriesHistogram.h b/folly/stats/TimeseriesHistogram.h

new file mode 100644 (file)

index 0000000..9481542
--- /dev/null
+++ b/folly/stats/TimeseriesHistogram.h
@@ -0,0 +1,335 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_TIMESERIES_HISTOGRAM_H_
+#define FOLLY_TIMESERIES_HISTOGRAM_H_
+
+#include <string>
+#include <boost/static_assert.hpp>
+#include "folly/stats/Histogram.h"
+#include "folly/stats/MultiLevelTimeSeries.h"
+
+namespace folly {
+
+/*
+ * TimeseriesHistogram tracks data distributions as they change over time.
+ *
+ * Specifically, it is a bucketed histogram with different value ranges assigned
+ * to each bucket.  Within each bucket is a MultiLevelTimeSeries from
+ * 'folly/stats/MultiLevelTimeSeries.h'. This means that each bucket contains a
+ * different set of data for different historical time periods, and one can
+ * query data distributions over different trailing time windows.
+ *
+ * For example, this can answer questions: "What is the data distribution over
+ * the last minute? Over the last 10 minutes?  Since I last cleared this
+ * histogram?"
+ *
+ * The class can also estimate percentiles and answer questions like: "What was
+ * the 99th percentile data value over the last 10 minutes?"
+ *
+ * Note: that depending on the size of your buckets and the smoothness
+ * of your data distribution, the estimate may be way off from the actual
+ * value.  In particular, if the given percentile falls outside of the bucket
+ * range (i.e. your buckets range in 0 - 100,000 but the 99th percentile is
+ * around 115,000) this estimate may be very wrong.
+ *
+ * The memory usage for a typical histogram is roughly 3k * (# of buckets).  All
+ * insertion operations are amortized O(1), and all queries are O(# of buckets).
+ */
+template <class T, class TT=std::chrono::seconds,
+          class C=folly::MultiLevelTimeSeries<T, TT>>
+class TimeseriesHistogram {
+ private:
+   // NOTE: T must be equivalent to _signed_ numeric type for our math.
+   BOOST_STATIC_ASSERT(std::numeric_limits<T>::is_signed);
+
+ public:
+  // values to be inserted into container
+  typedef T ValueType;
+  // the container type we use internally for each bucket
+  typedef C ContainerType;
+  // The time type.
+  typedef TT TimeType;
+
+  /*
+   * Create a TimeSeries histogram and initialize the bucketing and levels.
+   *
+   * The buckets are created by chopping the range [min, max) into pieces
+   * of size bucketSize, with the last bucket being potentially shorter.  Two
+   * additional buckets are always created -- the "under" bucket for the range
+   * (-inf, min) and the "over" bucket for the range [max, +inf).
+   *
+   * @param bucketSize the width of each bucket
+   * @param min the smallest value for the bucket range.
+   * @param max the largest value for the bucket range
+   * @param defaultContainer a pre-initialized timeseries with the desired
+   *                         number of levels and their durations.
+   */
+  TimeseriesHistogram(ValueType bucketSize, ValueType min, ValueType max,
+                      const ContainerType& defaultContainer);
+
+  /* Return the bucket size of each bucket in the histogram. */
+  ValueType getBucketSize() const { return buckets_.getBucketSize(); }
+
+  /* Return the min value at which bucketing begins. */
+  ValueType getMin() const { return buckets_.getMin(); }
+
+  /* Return the max value at which bucketing ends. */
+  ValueType getMax() const { return buckets_.getMax(); }
+
+  /* Return the number of levels of the Timeseries object in each bucket */
+  int getNumLevels() const {
+    return buckets_.getByIndex(0).numLevels();
+  }
+
+  /* Return the number of buckets */
+  int getNumBuckets() const { return buckets_.getNumBuckets(); }
+
+  /* Return the bucket index into which the given value would fall. */
+  int getBucketIdx(const ValueType& value) const;
+
+  /*
+   * Return the threshold of the bucket for the given index in range
+   * [0..numBuckets).  The bucket will have range [thresh, thresh + bucketSize)
+   * or [thresh, max), whichever is shorter.
+   */
+  ValueType getBucketMin(int bucketIdx) const {
+    return buckets_.getBucketMin(bucketIdx);
+  }
+
+  /* Return the actual timeseries in the given bucket (for reading only!) */
+  const ContainerType& getBucket(int bucketIdx) const {
+    return buckets_.getByIndex(bucketIdx);
+  }
+
+  /* Total count of values at the given timeseries level (all buckets). */
+  int64_t count(int level) const {
+    int64_t total = 0;
+    for (int b = 0; b < buckets_.getNumBuckets(); ++b) {
+      total += buckets_.getByIndex(b).count(level);
+    }
+    return total;
+  }
+
+  /* Total count of values added during the given interval (all buckets). */
+  int64_t count(TimeType start, TimeType end) const {
+    int64_t total = 0;
+    for (int b = 0; b < buckets_.getNumBuckets(); ++b) {
+      total += buckets_.getByIndex(b).count(start, end);
+    }
+    return total;
+  }
+
+  /* Total sum of values at the given timeseries level (all buckets). */
+  ValueType sum(int level) const {
+    ValueType total = ValueType();
+    for (int b = 0; b < buckets_.getNumBuckets(); ++b) {
+      total += buckets_.getByIndex(b).sum(level);
+    }
+    return total;
+  }
+
+  /* Total sum of values added during the given interval (all buckets). */
+  ValueType sum(TimeType start, TimeType end) const {
+    ValueType total = ValueType();
+    for (int b = 0; b < buckets_.getNumBuckets(); ++b) {
+      total += buckets_.getByIndex(b).sum(start, end);
+    }
+    return total;
+  }
+
+  /* Average of values at the given timeseries level (all buckets). */
+  template <typename ReturnType=double>
+  ReturnType avg(int level) const;
+
+  /* Average of values added during the given interval (all buckets). */
+  template <typename ReturnType=double>
+  ReturnType avg(TimeType start, TimeType end) const;
+
+  /*
+   * Rate at the given timeseries level (all buckets).
+   * This is the sum of all values divided by the time interval (in seconds).
+   */
+  ValueType rate(int level) const;
+
+  /*
+   * Rate for the given interval (all buckets).
+   * This is the sum of all values divided by the time interval (in seconds).
+   */
+  template <typename ReturnType=double>
+  ReturnType rate(TimeType start, TimeType end) const;
+
+  /*
+   * Update every underlying timeseries object with the given timestamp. You
+   * must call this directly before querying to ensure that the data in all
+   * buckets is decayed properly.
+   */
+  void update(TimeType now);
+
+  /* clear all the data from the histogram. */
+  void clear();
+
+  /* Add a value into the histogram with timestamp 'now' */
+  void addValue(TimeType now, const ValueType& value);
+  /* Add a value the given number of times with timestamp 'now' */
+  void addValue(TimeType now, const ValueType& value, int64_t times);
+
+  /*
+   * Add all of the values from the specified histogram.
+   *
+   * All of the values will be added to the current time-slot.
+   *
+   * One use of this is for thread-local caching of frequently updated
+   * histogram data.  For example, each thread can store a thread-local
+   * Histogram that is updated frequently, and only add it to the global
+   * TimeseriesHistogram once a second.
+   */
+  void addValues(TimeType now, const folly::Histogram<ValueType>& values);
+
+  /*
+   * Return an estimate of the value at the given percentile in the histogram
+   * in the given timeseries level.  The percentile is estimated as follows:
+   *
+   * - We retrieve a count of the values in each bucket (at the given level)
+   * - We determine via the counts which bucket the given percentile falls in.
+   * - We assume the average value in the bucket is also its median
+   * - We then linearly interpolate within the bucket, by assuming that the
+   *   distribution is uniform in the two value ranges [left, median) and
+   *   [median, right) where [left, right) is the bucket value range.
+   *
+   * Caveats:
+   * - If the histogram is empty, this always returns ValueType(), usually 0.
+   * - For the 'under' and 'over' special buckets, their range is unbounded
+   *   on one side.  In order for the interpolation to work, we assume that
+   *   the average value in the bucket is equidistant from the two edges of
+   *   the bucket.  In other words, we assume that the distance between the
+   *   average and the known bound is equal to the distance between the average
+   *   and the unknown bound.
+   */
+  ValueType getPercentileEstimate(double pct, int level) const;
+  /*
+   * Return an estimate of the value at the given percentile in the histogram
+   * in the given historical interval.  Please see the documentation for
+   * getPercentileEstimate(int pct, int level) for the explanation of the
+   * estimation algorithm.
+   */
+  ValueType getPercentileEstimate(double pct, TimeType start, TimeType end)
+    const;
+
+  /*
+   * Return the bucket index that the given percentile falls into (in the
+   * given timeseries level).  This index can then be used to retrieve either
+   * the bucket threshold, or other data from inside the bucket.
+   */
+  int getPercentileBucketIdx(double pct, int level) const;
+  /*
+   * Return the bucket index that the given percentile falls into (in the
+   * given historical interval).  This index can then be used to retrieve either
+   * the bucket threshold, or other data from inside the bucket.
+   */
+  int getPercentileBucketIdx(double pct, TimeType start, TimeType end) const;
+
+  /* Get the bucket threshold for the bucket containing the given pct. */
+  int getPercentileBucketMin(double pct, int level) const {
+    return getBucketMin(getPercentileBucketIdx(pct, level));
+  }
+  /* Get the bucket threshold for the bucket containing the given pct. */
+  int getPercentileBucketMin(double pct, TimeType start, TimeType end) const {
+    return getBucketMin(getPercentileBucketIdx(pct, start, end));
+  }
+
+  /*
+   * Print out serialized data from all buckets at the given level.
+   * Format is: BUCKET [',' BUCKET ...]
+   * Where: BUCKET == bucketMin ':' count ':' avg
+   */
+  std::string getString(int level) const;
+
+  /*
+   * Print out serialized data for all buckets in the historical interval.
+   * For format, please see getString(int level).
+   */
+  std::string getString(TimeType start, TimeType end) const;
+
+ private:
+  typedef ContainerType Bucket;
+  struct CountFromLevel {
+    explicit CountFromLevel(int level) : level_(level) {}
+
+    uint64_t operator()(const ContainerType& bucket) const {
+      return bucket.count(level_);
+    }
+
+   private:
+    int level_;
+  };
+  struct CountFromInterval {
+    explicit CountFromInterval(TimeType start, TimeType end)
+      : start_(start),
+        end_(end) {}
+
+    uint64_t operator()(const ContainerType& bucket) const {
+      return bucket.count(start_, end_);
+    }
+
+   private:
+    TimeType start_;
+    TimeType end_;
+  };
+
+  struct AvgFromLevel {
+    explicit AvgFromLevel(int level) : level_(level) {}
+
+    ValueType operator()(const ContainerType& bucket) const {
+      return bucket.template avg<ValueType>(level_);
+    }
+
+   private:
+    int level_;
+  };
+
+  template <typename ReturnType>
+  struct AvgFromInterval {
+    explicit AvgFromInterval(TimeType start, TimeType end)
+      : start_(start),
+        end_(end) {}
+
+    ReturnType operator()(const ContainerType& bucket) const {
+      return bucket.template avg<ReturnType>(start_, end_);
+    }
+
+   private:
+    TimeType start_;
+    TimeType end_;
+  };
+
+  /*
+   * Special logic for the case of only one unique value registered
+   * (this can happen when clients don't pick good bucket ranges or have
+   * other bugs).  It's a lot easier for clients to track down these issues
+   * if they are getting the correct value.
+   */
+  void maybeHandleSingleUniqueValue(const ValueType& value);
+
+  folly::detail::HistogramBuckets<ValueType, ContainerType> buckets_;
+  bool haveNotSeenValue_;
+  bool singleUniqueValue_;
+  ValueType firstValue_;
+};
+
+}  // folly
+
+#endif  // FOLLY_TIMESERIES_HISTOGRAM_H_
diff --git a/folly/test/TimeseriesHistogramTest.cpp b/folly/test/TimeseriesHistogramTest.cpp

new file mode 100644 (file)

index 0000000..196fd82
--- /dev/null
+++ b/folly/test/TimeseriesHistogramTest.cpp
@@ -0,0 +1,487 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "folly/stats/TimeseriesHistogram.h"
+#include "folly/stats/TimeseriesHistogram-defs.h"
+
+#include <gtest/gtest.h>
+
+using namespace std;
+using namespace folly;
+using std::chrono::seconds;
+
+namespace IntMTMHTS {
+  enum Levels {
+    MINUTE,
+    TEN_MINUTE,
+    HOUR,
+    ALLTIME,
+    NUM_LEVELS,
+  };
+
+  const seconds kDurations[] = {
+    seconds(60), seconds(600), seconds(3600), seconds(0)
+  };
+};
+
+namespace IntMHTS {
+  enum Levels {
+    MINUTE,
+    HOUR,
+    ALLTIME,
+    NUM_LEVELS,
+  };
+
+  const seconds kDurations[] = {
+    seconds(60), seconds(3600), seconds(0)
+  };
+};
+
+typedef std::mt19937 RandomInt32;
+
+TEST(TimeseriesHistogram, Percentile) {
+  RandomInt32 random(5);
+  // [10, 109], 12 buckets including above and below
+  {
+    TimeseriesHistogram<int> h(10, 10, 110,
+                               MultiLevelTimeSeries<int>(
+                                 60, IntMTMHTS::NUM_LEVELS,
+                                 IntMTMHTS::kDurations));
+
+    EXPECT_EQ(0, h.getPercentileEstimate(0, IntMTMHTS::ALLTIME));
+
+    EXPECT_EQ(12, h.getNumBuckets());
+    EXPECT_EQ(10, h.getBucketSize());
+    EXPECT_EQ(10, h.getMin());
+    EXPECT_EQ(110, h.getMax());
+
+    for (int i = 0; i < h.getNumBuckets(); ++i) {
+      EXPECT_EQ(4, h.getBucket(i).numLevels());
+    }
+
+    int maxVal = 120;
+    h.addValue(seconds(0), 0);
+    h.addValue(seconds(0), maxVal);
+    for (int i = 0; i < 98; i++) {
+      h.addValue(seconds(0), random() % maxVal);
+    }
+
+    h.update(std::chrono::duration_cast<std::chrono::seconds>(
+               std::chrono::system_clock::now().time_since_epoch()));
+    // bucket 0 stores everything below min, so its minimum
+    // is the lowest possible number
+    EXPECT_EQ(std::numeric_limits<int>::min(),
+              h.getPercentileBucketMin(1, IntMTMHTS::ALLTIME));
+    EXPECT_EQ(110, h.getPercentileBucketMin(99, IntMTMHTS::ALLTIME));
+
+    EXPECT_EQ(-2, h.getPercentileEstimate(0, IntMTMHTS::ALLTIME));
+    EXPECT_EQ(-1, h.getPercentileEstimate(1, IntMTMHTS::ALLTIME));
+    EXPECT_EQ(119, h.getPercentileEstimate(99, IntMTMHTS::ALLTIME));
+    EXPECT_EQ(120, h.getPercentileEstimate(100, IntMTMHTS::ALLTIME));
+  }
+}
+
+TEST(TimeseriesHistogram, String) {
+  RandomInt32 random(5);
+  // [10, 109], 12 buckets including above and below
+  {
+    TimeseriesHistogram<int> hist(10, 10, 110,
+                                  MultiLevelTimeSeries<int>(
+                                    60, IntMTMHTS::NUM_LEVELS,
+                                    IntMTMHTS::kDurations));
+
+    int maxVal = 120;
+    hist.addValue(seconds(0), 0);
+    hist.addValue(seconds(0), maxVal);
+    for (int i = 0; i < 98; i++) {
+      hist.addValue(seconds(0), random() % maxVal);
+    }
+
+    hist.update(seconds(0));
+
+    const char* const kStringValues1[IntMTMHTS::NUM_LEVELS] =  {
+      "-2147483648:12:4,10:8:13,20:8:24,30:6:34,40:13:46,50:8:54,60:7:64,"
+        "70:7:74,80:8:84,90:10:94,100:3:103,110:10:115",
+      "-2147483648:12:4,10:8:13,20:8:24,30:6:34,40:13:46,50:8:54,60:7:64,"
+        "70:7:74,80:8:84,90:10:94,100:3:103,110:10:115",
+      "-2147483648:12:4,10:8:13,20:8:24,30:6:34,40:13:46,50:8:54,60:7:64,"
+        "70:7:74,80:8:84,90:10:94,100:3:103,110:10:115",
+      "-2147483648:12:4,10:8:13,20:8:24,30:6:34,40:13:46,50:8:54,60:7:64,"
+        "70:7:74,80:8:84,90:10:94,100:3:103,110:10:115",
+    };
+
+    CHECK_EQ(IntMTMHTS::NUM_LEVELS, hist.getNumLevels());
+
+    for (int level = 0; level < hist.getNumLevels(); ++level) {
+      EXPECT_EQ(kStringValues1[level], hist.getString(level));
+    }
+
+    const char* const kStringValues2[IntMTMHTS::NUM_LEVELS] =  {
+      "-2147483648:12:4,10:8:13,20:8:24,30:6:34,40:13:46,50:8:54,60:7:64,"
+        "70:7:74,80:8:84,90:10:94,100:3:103,110:10:115",
+      "-2147483648:12:4,10:8:13,20:8:24,30:6:34,40:13:46,50:8:54,60:7:64,"
+        "70:7:74,80:8:84,90:10:94,100:3:103,110:10:115",
+      "-2147483648:12:4,10:8:13,20:8:24,30:6:34,40:13:46,50:8:54,60:7:64,"
+        "70:7:74,80:8:84,90:10:94,100:3:103,110:10:115",
+      "-2147483648:12:4,10:8:13,20:8:24,30:6:34,40:13:46,50:8:54,60:7:64,"
+        "70:7:74,80:8:84,90:10:94,100:3:103,110:10:115",
+    };
+
+    CHECK_EQ(IntMTMHTS::NUM_LEVELS, hist.getNumLevels());
+
+    for (int level = 0; level < hist.getNumLevels(); ++level) {
+      EXPECT_EQ(kStringValues2[level], hist.getString(level));
+    }
+  }
+}
+
+TEST(TimeseriesHistogram, Clear) {
+  {
+    TimeseriesHistogram<int> hist(10, 0, 100,
+                                  MultiLevelTimeSeries<int>(
+                                    60, IntMTMHTS::NUM_LEVELS,
+                                    IntMTMHTS::kDurations));
+
+    for (int now = 0; now < 3600; now++) {
+      for (int i = 0; i < 100; i++) {
+        hist.addValue(seconds(now), i, 2);  // adds each item 2 times
+      }
+    }
+
+    // check clearing
+    hist.clear();
+
+    for (int b = 0; b < hist.getNumBuckets(); ++b) {
+      EXPECT_EQ(0, hist.getBucket(b).count(IntMTMHTS::MINUTE));
+      EXPECT_EQ(0, hist.getBucket(b).count(IntMTMHTS::TEN_MINUTE));
+      EXPECT_EQ(0, hist.getBucket(b).count(IntMTMHTS::HOUR));
+      EXPECT_EQ(0, hist.getBucket(b).count(IntMTMHTS::ALLTIME));
+    }
+
+    for (int pct = 0; pct <= 100; pct++) {
+      EXPECT_EQ(0, hist.getPercentileBucketMin(pct, IntMTMHTS::MINUTE));
+      EXPECT_EQ(0, hist.getPercentileBucketMin(pct, IntMTMHTS::TEN_MINUTE));
+      EXPECT_EQ(0, hist.getPercentileBucketMin(pct, IntMTMHTS::HOUR));
+      EXPECT_EQ(0, hist.getPercentileBucketMin(pct, IntMTMHTS::ALLTIME));
+
+      EXPECT_EQ(0, hist.getPercentileEstimate(pct, IntMTMHTS::MINUTE));
+      EXPECT_EQ(0, hist.getPercentileEstimate(pct, IntMTMHTS::TEN_MINUTE));
+      EXPECT_EQ(0, hist.getPercentileEstimate(pct, IntMTMHTS::HOUR));
+      EXPECT_EQ(0, hist.getPercentileEstimate(pct, IntMTMHTS::ALLTIME));
+    }
+  }
+}
+
+
+TEST(TimeseriesHistogram, Basic) {
+  {
+    TimeseriesHistogram<int> hist(10, 0, 100,
+                                  MultiLevelTimeSeries<int>(
+                                    60, IntMTMHTS::NUM_LEVELS,
+                                    IntMTMHTS::kDurations));
+
+    for (int now = 0; now < 3600; now++) {
+      for (int i = 0; i < 100; i++) {
+        hist.addValue(seconds(now), i);
+      }
+    }
+
+    hist.update(seconds(3599));
+    for (int pct = 1; pct <= 100; pct++) {
+      int expected = (pct - 1) / 10 * 10;
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct, IntMTMHTS::MINUTE));
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct,
+                                                      IntMTMHTS::TEN_MINUTE));
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct, IntMTMHTS::HOUR));
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct, IntMTMHTS::ALLTIME));
+    }
+
+    for (int b = 1; (b + 1) < hist.getNumBuckets(); ++b) {
+      EXPECT_EQ(600, hist.getBucket(b).count(IntMTMHTS::MINUTE));
+      EXPECT_EQ(6000, hist.getBucket(b).count(IntMTMHTS::TEN_MINUTE));
+      EXPECT_EQ(36000, hist.getBucket(b).count(IntMTMHTS::HOUR));
+      EXPECT_EQ(36000, hist.getBucket(b).count(IntMTMHTS::ALLTIME));
+    }
+    EXPECT_EQ(0, hist.getBucket(0).count(IntMTMHTS::MINUTE));
+    EXPECT_EQ(0, hist.getBucket(hist.getNumBuckets() - 1).count(
+                IntMTMHTS::MINUTE));
+  }
+
+  // -----------------
+
+  {
+    TimeseriesHistogram<int> hist(10, 0, 100,
+                                  MultiLevelTimeSeries<int>(
+                                    60, IntMTMHTS::NUM_LEVELS,
+                                    IntMTMHTS::kDurations));
+
+    for (int now = 0; now < 3600; now++) {
+      for (int i = 0; i < 100; i++) {
+        hist.addValue(seconds(now), i, 2);  // adds each item 2 times
+      }
+    }
+
+    hist.update(seconds(3599));
+    for (int pct = 1; pct <= 100; pct++) {
+      int expected = (pct - 1) / 10 * 10;
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct, IntMTMHTS::MINUTE));
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct,
+                                                      IntMTMHTS::TEN_MINUTE));
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct, IntMTMHTS::HOUR));
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct, IntMTMHTS::ALLTIME));
+   }
+
+    for (int b = 1; (b + 1) < hist.getNumBuckets(); ++b) {
+      EXPECT_EQ(600 * 2, hist.getBucket(b).count(IntMTMHTS::MINUTE));
+      EXPECT_EQ(6000 * 2, hist.getBucket(b).count(IntMTMHTS::TEN_MINUTE));
+      EXPECT_EQ(36000 * 2, hist.getBucket(b).count(IntMTMHTS::HOUR));
+      EXPECT_EQ(36000 * 2, hist.getBucket(b).count(IntMTMHTS::ALLTIME));
+    }
+    EXPECT_EQ(0, hist.getBucket(0).count(IntMTMHTS::MINUTE));
+    EXPECT_EQ(0, hist.getBucket(hist.getNumBuckets() - 1).count(
+                IntMTMHTS::MINUTE));
+  }
+
+  // -----------------
+
+  {
+    TimeseriesHistogram<int> hist(10, 0, 100,
+                                  MultiLevelTimeSeries<int>(
+                                    60, IntMTMHTS::NUM_LEVELS,
+                                    IntMTMHTS::kDurations));
+
+    for (int now = 0; now < 3600; now++) {
+      for (int i = 0; i < 50; i++) {
+        hist.addValue(seconds(now), i * 2, 2);  // adds each item 2 times
+      }
+    }
+
+    hist.update(seconds(3599));
+    for (int pct = 1; pct <= 100; pct++) {
+      int expected = (pct - 1) / 10 * 10;
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct, IntMTMHTS::MINUTE));
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct,
+                                                      IntMTMHTS::TEN_MINUTE));
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct, IntMTMHTS::HOUR));
+      EXPECT_EQ(expected, hist.getPercentileBucketMin(pct, IntMTMHTS::ALLTIME));
+    }
+
+    EXPECT_EQ(0, hist.getBucket(0).count(IntMTMHTS::MINUTE));
+    EXPECT_EQ(0, hist.getBucket(0).count(IntMTMHTS::TEN_MINUTE));
+    EXPECT_EQ(0, hist.getBucket(0).count(IntMTMHTS::HOUR));
+    EXPECT_EQ(0, hist.getBucket(0).count(IntMTMHTS::ALLTIME));
+    EXPECT_EQ(0, hist.getBucket(hist.getNumBuckets() - 1).count(
+                IntMTMHTS::MINUTE));
+    EXPECT_EQ(0,
+              hist.getBucket(hist.getNumBuckets() - 1).
+                count(IntMTMHTS::TEN_MINUTE));
+    EXPECT_EQ(0, hist.getBucket(hist.getNumBuckets() - 1).count(
+                IntMTMHTS::HOUR));
+    EXPECT_EQ(0,
+              hist.getBucket(hist.getNumBuckets() - 1).count(
+                IntMTMHTS::ALLTIME));
+
+    for (int b = 1; (b + 1) < hist.getNumBuckets(); ++b) {
+      EXPECT_EQ(600, hist.getBucket(b).count(IntMTMHTS::MINUTE));
+      EXPECT_EQ(6000, hist.getBucket(b).count(IntMTMHTS::TEN_MINUTE));
+      EXPECT_EQ(36000, hist.getBucket(b).count(IntMTMHTS::HOUR));
+      EXPECT_EQ(36000, hist.getBucket(b).count(IntMTMHTS::ALLTIME));
+    }
+
+    for (int i = 0; i < 100; ++i) {
+      hist.addValue(seconds(3599), 200 + i);
+    }
+    hist.update(seconds(3599));
+    EXPECT_EQ(100,
+              hist.getBucket(hist.getNumBuckets() - 1).count(
+                IntMTMHTS::ALLTIME));
+
+  }
+}
+
+TEST(TimeseriesHistogram, QueryByInterval) {
+  TimeseriesHistogram<int> mhts(8, 8, 120,
+                                MultiLevelTimeSeries<int>(
+                                  60, IntMHTS::NUM_LEVELS,
+                                  IntMHTS::kDurations));
+
+  mhts.update(seconds(0));
+
+  int curTime;
+  for (curTime = 0; curTime < 7200; curTime++) {
+    mhts.addValue(seconds(curTime), 1);
+  }
+  for (curTime = 7200; curTime < 7200 + 3540; curTime++) {
+    mhts.addValue(seconds(curTime), 10);
+  }
+  for (curTime = 7200 + 3540; curTime < 7200 + 3600; curTime++) {
+    mhts.addValue(seconds(curTime), 100);
+  }
+
+  mhts.update(seconds(7200 + 3600 - 1));
+
+  struct TimeInterval {
+    TimeInterval(int s, int e)
+      : start(s), end(e) {}
+
+    std::chrono::seconds start;
+    std::chrono::seconds end;
+  };
+  TimeInterval intervals[12] = {
+    { curTime - 60, curTime },
+    { curTime - 3600, curTime },
+    { curTime - 7200, curTime },
+    { curTime - 3600, curTime - 60 },
+    { curTime - 7200, curTime - 60 },
+    { curTime - 7200, curTime - 3600 },
+    { curTime - 50, curTime - 20 },
+    { curTime - 3020, curTime - 20 },
+    { curTime - 7200, curTime - 20 },
+    { curTime - 3000, curTime - 1000 },
+    { curTime - 7200, curTime - 1000 },
+    { curTime - 7200, curTime - 3600 },
+  };
+
+  int expectedSums[12] = {
+    6000, 41400, 32400, 35400, 32129, 16200, 3000, 33600, 32308, 20000, 27899,
+    16200
+  };
+
+  int expectedCounts[12] = {
+    60, 3600, 7200, 3540, 7139, 3600, 30, 3000, 7178, 2000, 6199, 3600
+  };
+
+  // The first 7200 values added all fell below the histogram minimum,
+  // and went into the bucket that tracks all of the too-small values.
+  // This bucket reports a minimum value of the smallest possible integer.
+  int belowMinBucket = std::numeric_limits<int>::min();
+
+  int expectedValues[12][3] = {
+    {96, 96, 96},
+    { 8,  8, 96},
+    { belowMinBucket,  belowMinBucket,  8}, // alltime
+    { 8,  8,  8},
+    { belowMinBucket,  belowMinBucket,  8}, // alltime
+    { belowMinBucket,  belowMinBucket,  8}, // alltime
+    {96, 96, 96},
+    { 8,  8, 96},
+    { belowMinBucket,  belowMinBucket,  8}, // alltime
+    { 8,  8,  8},
+    { belowMinBucket,  belowMinBucket,  8}, // alltime
+    { belowMinBucket,  belowMinBucket,  8}  // alltime
+  };
+
+  for (int i = 0; i < 12; i++) {
+    const auto& itv = intervals[i];
+    int s = mhts.sum(itv.start, itv.end);
+    EXPECT_EQ(expectedSums[i], s);
+
+    int c = mhts.count(itv.start, itv.end);
+    EXPECT_EQ(expectedCounts[i], c);
+  }
+
+  // 3 levels
+  for (int i = 1; i <= 100; i++) {
+    EXPECT_EQ(96, mhts.getPercentileBucketMin(i, 0));
+    EXPECT_EQ(96, mhts.getPercentileBucketMin(i, seconds(curTime - 60),
+                                              seconds(curTime)));
+    EXPECT_EQ(8, mhts.getPercentileBucketMin(i, seconds(curTime - 3540),
+                                             seconds(curTime - 60)));
+  }
+
+  EXPECT_EQ(8, mhts.getPercentileBucketMin(1, 1));
+  EXPECT_EQ(8, mhts.getPercentileBucketMin(98, 1));
+  EXPECT_EQ(96, mhts.getPercentileBucketMin(99, 1));
+  EXPECT_EQ(96, mhts.getPercentileBucketMin(100, 1));
+
+  EXPECT_EQ(belowMinBucket, mhts.getPercentileBucketMin(1, 2));
+  EXPECT_EQ(belowMinBucket, mhts.getPercentileBucketMin(66, 2));
+  EXPECT_EQ(8, mhts.getPercentileBucketMin(67, 2));
+  EXPECT_EQ(8, mhts.getPercentileBucketMin(99, 2));
+  EXPECT_EQ(96, mhts.getPercentileBucketMin(100, 2));
+
+  // 0 is currently the value for bucket 0 (below min)
+  for (int i = 0; i < 12; i++) {
+    const auto& itv = intervals[i];
+    int v = mhts.getPercentileBucketMin(1, itv.start, itv.end);
+    EXPECT_EQ(expectedValues[i][0], v);
+
+    v = mhts.getPercentileBucketMin(50, itv.start, itv.end);
+    EXPECT_EQ(expectedValues[i][1], v);
+
+    v = mhts.getPercentileBucketMin(99, itv.start, itv.end);
+    EXPECT_EQ(expectedValues[i][2], v);
+  }
+
+  for (int i = 0; i < 12; i++) {
+    const auto& itv = intervals[i];
+    int c = mhts.count(itv.start, itv.end);
+    // Some of the older intervals that fall in the alltime bucket
+    // are off by 1 or 2 in their estimated counts.
+    size_t tolerance = 0;
+    if (itv.start <= seconds(curTime - 7200)) {
+      tolerance = 2;
+    } else if (itv.start <= seconds(curTime - 3000)) {
+      tolerance = 1;
+    }
+    size_t actualCount = (itv.end - itv.start).count();
+    size_t estimatedCount = mhts.count(itv.start, itv.end);
+    EXPECT_GE(actualCount, estimatedCount);
+    EXPECT_LE(actualCount - tolerance, estimatedCount);
+  }
+}
+
+TEST(TimeseriesHistogram, SingleUniqueValue) {
+  int values[] = {-1, 0, 500, 1000, 1500};
+  for (int ii = 0; ii < 5; ++ii) {
+    int value = values[ii];
+    TimeseriesHistogram<int> h(10, 0, 1000,
+                               MultiLevelTimeSeries<int>(
+                                 60, IntMTMHTS::NUM_LEVELS,
+                                 IntMTMHTS::kDurations));
+
+    const int kNumIters = 1000;
+    for (int jj = 0; jj < kNumIters; ++jj) {
+      h.addValue(seconds(time(nullptr)), value);
+    }
+    h.update(seconds(time(nullptr)));
+    // since we've only added one unique value, all percentiles should
+    // be that value
+    EXPECT_EQ(h.getPercentileEstimate(10, 0), value);
+    EXPECT_EQ(h.getPercentileEstimate(50, 0), value);
+    EXPECT_EQ(h.getPercentileEstimate(99, 0), value);
+
+    // Things get trickier if there are multiple unique values.
+    const int kNewValue = 750;
+    for (int kk = 0; kk < 2*kNumIters; ++kk) {
+      h.addValue(seconds(time(nullptr)), kNewValue);
+    }
+    h.update(seconds(time(nullptr)));
+    EXPECT_NEAR(h.getPercentileEstimate(50, 0), kNewValue+5, 5);
+    if (value >= 0 && value <= 1000) {
+      // only do further testing if value is within our bucket range,
+      // else estimates can be wildly off
+      if (kNewValue > value) {
+        EXPECT_NEAR(h.getPercentileEstimate(10, 0), value+5, 5);
+        EXPECT_NEAR(h.getPercentileEstimate(99, 0), kNewValue+5, 5);
+      } else {
+        EXPECT_NEAR(h.getPercentileEstimate(10, 0), kNewValue+5, 5);
+        EXPECT_NEAR(h.getPercentileEstimate(99, 0), value+5, 5);
+      }
+    }
+  }
+}
+
author	Stephen Chen <tracelog@fb.com>
	Mon, 11 Nov 2013 21:32:34 +0000 (13:32 -0800)
committer	Peter Griess <pgriess@fb.com>
	Tue, 26 Nov 2013 15:05:17 +0000 (07:05 -0800)
folly/stats/Instantiations.cpp		patch \| blob \| history
folly/stats/MultiLevelTimeSeries.h		patch \| blob \| history
folly/stats/TimeseriesHistogram-defs.h	[new file with mode: 0644]	patch \| blob
folly/stats/TimeseriesHistogram.h	[new file with mode: 0644]	patch \| blob
folly/test/TimeseriesHistogramTest.cpp	[new file with mode: 0644]	patch \| blob