/**
* apply() - Send all values produced by this generator to given
- * handler until it returns false. Returns true if the false iff the handler
- * returned false.
+ * handler until the handler returns false. Returns true until the handler
+ * returns false. GOTCHA: It should return true even if it completes (without
+ * the handler returning false), as 'Chain' uses the return value of apply
+ * to determine if it should process the second object in its chain.
*/
template<class Handler>
bool apply(Handler&& handler) const;
}
};
+/**
+ * Sample - For taking a random sample of N elements from a sequence
+ * (without replacement).
+ */
+template<class Random>
+class Sample : public Operator<Sample<Random>> {
+ size_t count_;
+ Random rng_;
+ public:
+ explicit Sample(size_t count, Random rng)
+ : count_(count), rng_(std::move(rng)) {}
+
+ template<class Value,
+ class Source,
+ class Rand,
+ class StorageType = typename std::decay<Value>::type>
+ class Generator :
+ public GenImpl<StorageType&&,
+ Generator<Value, Source, Rand, StorageType>> {
+ static_assert(!Source::infinite, "Cannot sample infinite source!");
+ // It's too easy to bite ourselves if random generator is only 16-bit
+ static_assert(Random::max() >= std::numeric_limits<int32_t>::max() - 1,
+ "Random number generator must support big values");
+ Source source_;
+ size_t count_;
+ mutable Rand rng_;
+ public:
+ explicit Generator(Source source, size_t count, Random rng)
+ : source_(std::move(source)) , count_(count), rng_(std::move(rng)) {}
+
+ template<class Handler>
+ bool apply(Handler&& handler) const {
+ if (count_ == 0) { return false; }
+ std::vector<StorageType> v;
+ v.reserve(count_);
+ // use reservoir sampling to give each source value an equal chance
+ // of appearing in our output.
+ size_t n = 1;
+ source_.foreach([&](Value value) -> void {
+ if (v.size() < count_) {
+ v.push_back(std::forward<Value>(value));
+ } else {
+ // alternatively, we could create a std::uniform_int_distribution
+ // instead of using modulus, but benchmarks show this has
+ // substantial overhead.
+ size_t index = rng_() % n;
+ if (index < v.size()) {
+ v[index] = std::forward<Value>(value);
+ }
+ }
+ ++n;
+ });
+
+ // output is unsorted!
+ for (auto& val: v) {
+ if (!handler(std::move(val))) {
+ return false;
+ }
+ }
+ return true;
+ }
+ };
+
+ template<class Source,
+ class Value,
+ class Gen = Generator<Value, Source, Random>>
+ Gen compose(GenImpl<Value, Source>&& source) const {
+ return Gen(std::move(source.self()), count_, rng_);
+ }
+
+ template<class Source,
+ class Value,
+ class Gen = Generator<Value, Source, Random>>
+ Gen compose(const GenImpl<Value, Source>& source) const {
+ return Gen(source.self(), count_, rng_);
+ }
+};
+
/**
* Skip - For skipping N items from the beginning of a source generator.
*
return detail::Take(count);
}
+template<class Random = std::default_random_engine>
+inline detail::Sample<Random> sample(size_t count, Random rng = Random()) {
+ return detail::Sample<Random>(count, std::move(rng));
+}
+
inline detail::Skip skip(size_t count) {
return detail::Skip(count);
}
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <iostream>
+#include <random>
#include <set>
#include <vector>
#include "folly/experimental/Gen.h"
EXPECT_EQ(expected, actual);
}
+TEST(Gen, Sample) {
+ std::mt19937 rnd(42);
+
+ auto sampler =
+ seq(1, 100)
+ | sample(50, rnd);
+ std::unordered_map<int,int> hits;
+ const int kNumIters = 80;
+ for (int i = 0; i < kNumIters; i++) {
+ auto vec = sampler | as<vector<int>>();
+ EXPECT_EQ(vec.size(), 50);
+ auto uniq = fromConst(vec) | as<set<int>>();
+ EXPECT_EQ(uniq.size(), vec.size()); // sampling without replacement
+ for (auto v: vec) {
+ ++hits[v];
+ }
+ }
+
+ // In 80 separate samples of our range, we should have seen every value
+ // at least once and no value all 80 times. (The odds of either of those
+ // events is 1/2^80).
+ EXPECT_EQ(hits.size(), 100);
+ for (auto hit: hits) {
+ EXPECT_GT(hit.second, 0);
+ EXPECT_LT(hit.second, kNumIters);
+ }
+
+ auto small =
+ seq(1, 5)
+ | sample(10);
+ EXPECT_EQ((small | sum), 15);
+ EXPECT_EQ((small | take(3) | count), 3);
+}
+
TEST(Gen, Skip) {
auto gen =
seq(1, 1000)
EXPECT_EQ(dynamic(6), from(obj.items()) | get<1>() | sum);
}
+TEST(Gen, Collect) {
+ auto s = from({7, 6, 5, 4, 3}) | as<set<int>>();
+ EXPECT_EQ(s.size(), 5);
+}
+
TEST(StringGen, EmptySplit) {
auto collect = eachTo<std::string>() | as<vector>();
{