2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/io/Compression.h>
22 #include <unordered_map>
24 #include <boost/noncopyable.hpp>
25 #include <glog/logging.h>
27 #include <folly/Benchmark.h>
28 #include <folly/Hash.h>
29 #include <folly/Memory.h>
30 #include <folly/Random.h>
31 #include <folly/Varint.h>
32 #include <folly/io/IOBufQueue.h>
33 #include <folly/portability/GTest.h>
35 namespace folly { namespace io { namespace test {
// Base class for the test data buffers. It holds a heap-allocated byte array
// (data_) and hands out views and hashes of a leading portion of it.
// Inherits privately from boost::noncopyable.
// NOTE(review): the access specifiers and the size_ member referenced by the
// out-of-line definitions are not visible in this excerpt.
37 class DataHolder : private boost::noncopyable {
// Hash of the first `size` bytes of the buffer.
39 uint64_t hash(size_t size) const;
// View over the first `size` bytes of the buffer.
40 ByteRange data(size_t size) const;
// sizeLog2 is the base-2 logarithm of the buffer size in bytes.
43 explicit DataHolder(size_t sizeLog2);
// Backing storage for the test data.
45 std::unique_ptr<uint8_t[]> data_;
// Looked up by hash() using the requested size as key; mutable because
// hash() is a const member.
46 mutable std::unordered_map<uint64_t, uint64_t> hashCache_;
// Allocates a buffer of 2^sizeLog2 bytes. Nothing here writes to the buffer;
// `new uint8_t[n]` leaves the bytes uninitialized.
// NOTE(review): data_ is sized from size_, which assumes size_ is declared
// before data_ in the class — confirm against the full class definition.
49 DataHolder::DataHolder(size_t sizeLog2)
50 : size_(size_t(1) << sizeLog2),
51 data_(new uint8_t[size_]) {
// Returns the FNV-64 hash of the first `size` bytes of the buffer.
// `size` must not exceed the buffer size (enforced with CHECK_LE).
54 uint64_t DataHolder::hash(size_t size) const {
55 CHECK_LE(size, size_);
// Look for a previously stored entry for this size.
56 auto p = hashCache_.find(size);
57 if (p != hashCache_.end()) {
// Hash the requested prefix of the buffer.
// NOTE(review): the statements that return the cached entry, store `h`
// in hashCache_ and return it are not visible in this excerpt.
61 uint64_t h = folly::hash::fnv64_buf(data_.get(), size);
// Returns a ByteRange over the first `size` bytes of the buffer. The range
// points into data_ and does not copy. `size` must not exceed the buffer
// size (enforced with CHECK_LE).
66 ByteRange DataHolder::data(size_t size) const {
67 CHECK_LE(size, size_);
68 return ByteRange(data_.get(), size);
// Computes an FNV-64 hash over the contents of `buf`. Iterates over *buf and
// feeds each yielded range into fnv64_buf, carrying the running hash forward
// from FNV_64_HASH_START.
71 uint64_t hashIOBuf(const IOBuf* buf) {
72 uint64_t h = folly::hash::FNV_64_HASH_START;
73 for (auto& range : *buf) {
// Continue the hash from the previous range's result.
74 h = folly::hash::fnv64_buf(range.data(), range.size(), h);
// DataHolder whose constructor writes std::mt19937 output into the buffer
// from several worker threads.
79 class RandomDataHolder : public DataHolder {
// sizeLog2 is the base-2 logarithm of the buffer size in bytes.
81 explicit RandomDataHolder(size_t sizeLog2);
84 RandomDataHolder::RandomDataHolder(size_t sizeLog2)
85 : DataHolder(sizeLog2) {
86 constexpr size_t numThreadsLog2 = 3;
87 constexpr size_t numThreads = size_t(1) << numThreadsLog2;
89 uint32_t seed = randomNumberSeed();
91 std::vector<std::thread> threads;
92 threads.reserve(numThreads);
93 for (size_t t = 0; t < numThreads; ++t) {
95 [this, seed, t, numThreadsLog2, sizeLog2] () {
96 std::mt19937 rng(seed + t);
97 size_t countLog2 = sizeLog2 - numThreadsLog2;
98 size_t start = size_t(t) << countLog2;
99 for (size_t i = 0; i < countLog2; ++i) {
100 this->data_[start + i] = rng();
105 for (auto& t : threads) {
// DataHolder whose constructor fills the whole buffer with the byte 'a'.
110 class ConstantDataHolder : public DataHolder {
// sizeLog2 is the base-2 logarithm of the buffer size in bytes.
112 explicit ConstantDataHolder(size_t sizeLog2);
// Allocates 2^sizeLog2 bytes through the base class, then sets every byte
// to 'a'.
115 ConstantDataHolder::ConstantDataHolder(size_t sizeLog2)
116 : DataHolder(sizeLog2) {
117 memset(data_.get(), 'a', size_);
// Shared test inputs: one random-filled and one constant-filled buffer, each
// 2^dataSizeLog2 bytes. The tests below read prefixes of these buffers.
120 constexpr size_t dataSizeLog2 = 27; // 128MiB
121 RandomDataHolder randomDataHolder(dataSizeLog2);
122 ConstantDataHolder constantDataHolder(dataSizeLog2);
124 // The intersection of the provided codecs & those that are compiled in.
// Takes the candidate codec types in `v` and returns the selected ones in a
// new vector, filled through a back_inserter.
// NOTE(review): the algorithm call and its remaining arguments are not
// visible in this excerpt.
125 static std::vector<CodecType> supportedCodecs(std::vector<CodecType> const& v) {
126 std::vector<CodecType> supported;
131 std::back_inserter(supported),
137 // All compiled-in compression codecs.
// Walks every integer value below CodecType::NUM_CODEC_TYPES, converts it to
// a CodecType, and keeps the ones for which hasCodec() returns true.
138 static std::vector<CodecType> availableCodecs() {
139 std::vector<CodecType> codecs;
141 for (size_t i = 0; i < static_cast<size_t>(CodecType::NUM_CODEC_TYPES); ++i) {
142 auto type = static_cast<CodecType>(i);
143 if (hasCodec(type)) {
144 codecs.push_back(type);
// Checks Codec::needsUncompressedLength() against a fixed table of expected
// values, one row per codec type. Codec types for which hasCodec() is false
// are skipped.
151 TEST(CompressionTestNeedsUncompressedLength, Simple) {
// Each row: codec type and the expected needsUncompressedLength() result.
152 static const struct { CodecType type; bool needsUncompressedLength; }
154 { CodecType::NO_COMPRESSION, false },
155 { CodecType::LZ4, true },
156 { CodecType::SNAPPY, false },
157 { CodecType::ZLIB, false },
158 { CodecType::LZ4_VARINT_SIZE, false },
159 { CodecType::LZMA2, false },
160 { CodecType::LZMA2_VARINT_SIZE, false },
161 { CodecType::ZSTD, false },
162 { CodecType::GZIP, false },
163 { CodecType::LZ4_FRAME, false },
164 { CodecType::BZIP2, false },
167 for (auto const& test : expectations) {
// Only query codecs that are available in this build.
168 if (hasCodec(test.type)) {
169 EXPECT_EQ(getCodec(test.type)->needsUncompressedLength(),
170 test.needsUncompressedLength);
// Parameterized fixture for round-trip compression tests.
// The parameter tuple is (log2 of the uncompressed length, value stored in
// chunks_, codec type).
// NOTE(review): the chunks_ member declaration and the access specifiers are
// not visible in this excerpt.
175 class CompressionTest
176 : public testing::TestWithParam<std::tr1::tuple<int, int, CodecType>> {
// Unpacks the test parameter into the fixture members.
178 void SetUp() override {
179 auto tup = GetParam();
// The first tuple element is an exponent: the length is 2^value bytes.
180 uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup);
181 chunks_ = std::tr1::get<1>(tup);
182 codec_ = getCodec(std::tr1::get<2>(tup));
// Round trip through the IOBuf-based compress/uncompress API.
185 void runSimpleIOBufTest(const DataHolder& dh);
// Round trip through the std::string-based compress/uncompress API.
187 void runSimpleStringTest(const DataHolder& dh);
// Re-chunks `data`; see the definition.
190 std::unique_ptr<IOBuf> split(std::unique_ptr<IOBuf> data) const;
// Number of input bytes taken from the DataHolder.
192 uint64_t uncompressedLength_;
// Codec under test.
194 std::unique_ptr<Codec> codec_;
// Round-trips the first uncompressedLength_ bytes of `dh` through codec_
// using the IOBuf API. Both the input and the compressed output are passed
// through split() before use. The decompressed chain is checked for total
// length and for its hash against dh.hash().
197 void CompressionTest::runSimpleIOBufTest(const DataHolder& dh) {
198 const auto original = split(IOBuf::wrapBuffer(dh.data(uncompressedLength_)));
199 const auto compressed = split(codec_->compress(original.get()));
// Uncompress without a length only when the codec does not require one.
200 if (!codec_->needsUncompressedLength()) {
201 auto uncompressed = codec_->uncompress(compressed.get());
202 EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
203 EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
// Uncompress again, this time passing the expected length.
// NOTE(review): the scope delimiters around this second check are not
// visible in this excerpt.
206 auto uncompressed = codec_->uncompress(compressed.get(),
207 uncompressedLength_);
208 EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
209 EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
// Round-trips the first uncompressedLength_ bytes of `dh` through codec_
// using the std::string overloads. The decompressed string is compared
// against the original both by length and by content.
213 void CompressionTest::runSimpleStringTest(const DataHolder& dh) {
// Copy the input bytes into a std::string.
214 const auto original = std::string(
215 reinterpret_cast<const char*>(dh.data(uncompressedLength_).data()),
216 uncompressedLength_);
217 const auto compressed = codec_->compress(original);
// Uncompress without a length only when the codec does not require one.
218 if (!codec_->needsUncompressedLength()) {
219 auto uncompressed = codec_->uncompress(compressed);
220 EXPECT_EQ(uncompressedLength_, uncompressed.length());
221 EXPECT_EQ(uncompressed, original);
// Uncompress again, this time passing the expected length.
// NOTE(review): the scope delimiters around this second check are not
// visible in this excerpt.
224 auto uncompressed = codec_->uncompress(compressed, uncompressedLength_);
225 EXPECT_EQ(uncompressedLength_, uncompressed.length());
226 EXPECT_EQ(uncompressed, original);
230 // Uniformly split data into (potentially empty) chunks.
// Draws chunks_ - 1 offsets with Random::rand64(size), keeps them sorted in
// a multiset, and copies the bytes between consecutive offsets into separate
// IOBufs appended to an IOBufQueue. The queue's chain is returned.
// NOTE(review): the handling of already-chained input, the declaration of
// `offset` and its update inside the loop are not visible in this excerpt.
231 std::unique_ptr<IOBuf> CompressionTest::split(
232 std::unique_ptr<IOBuf> data) const {
233 if (data->isChained()) {
237 const size_t size = data->computeChainDataLength();
// multiset: iterated in ascending order, and equal offsets are kept, which
// yields empty chunks.
239 std::multiset<size_t> splits;
240 for (size_t i = 1; i < chunks_; ++i) {
241 splits.insert(Random::rand64(size));
244 folly::IOBufQueue result;
// One chunk per split point, covering [offset, split).
247 for (size_t split : splits) {
248 result.append(IOBuf::copyBuffer(data->data() + offset, split - offset));
// Final chunk: everything from the last offset to the end.
251 result.append(IOBuf::copyBuffer(data->data() + offset, size - offset));
253 return result.move();
// IOBuf round trip over the random-filled buffer.
256 TEST_P(CompressionTest, RandomData) {
257 runSimpleIOBufTest(randomDataHolder);
// IOBuf round trip over the constant-filled buffer.
260 TEST_P(CompressionTest, ConstantData) {
261 runSimpleIOBufTest(constantDataHolder);
// std::string round trip over the random-filled buffer.
264 TEST_P(CompressionTest, RandomDataString) {
265 runSimpleStringTest(randomDataHolder);
// std::string round trip over the constant-filled buffer.
268 TEST_P(CompressionTest, ConstantDataString) {
269 runSimpleStringTest(constantDataHolder);
// Parameter sources for CompressionTest, in tuple order: log2 of the
// uncompressed length, the chunks_ value, and every codec returned by
// availableCodecs().
// NOTE(review): the instantiation/test-case names and the generator that
// combines the three value lists are not visible in this excerpt.
272 INSTANTIATE_TEST_CASE_P(
276 testing::Values(0, 1, 12, 22, 25, 27),
277 testing::Values(1, 2, 3, 8, 65),
278 testing::ValuesIn(availableCodecs())));
// Parameterized fixture for the varint-size codec tests.
// The parameter tuple is (log2 of the uncompressed length, codec type).
280 class CompressionVarintTest
281 : public testing::TestWithParam<std::tr1::tuple<int, CodecType>> {
// Unpacks the test parameter into the fixture members.
283 void SetUp() override {
284 auto tup = GetParam();
// The first tuple element is an exponent: the length is 2^value bytes.
285 uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup);
286 codec_ = getCodec(std::tr1::get<1>(tup));
289 void runSimpleTest(const DataHolder& dh);
// Number of input bytes taken from the DataHolder.
291 uint64_t uncompressedLength_;
// Codec under test.
292 std::unique_ptr<Codec> codec_;
// Counts how many times `number` can be shifted right by one bit before it
// becomes zero, incrementing `pos` once per shift.
// NOTE(review): the declaration/initial value of `pos` and the return
// statement are not visible in this excerpt; presumably pos starts at 0, so
// the result is the 1-based index of the most significant set bit.
295 inline uint64_t oneBasedMsbPos(uint64_t number) {
297 for (; number > 0; ++pos, number >>= 1) {
// Compresses the first uncompressedLength_ bytes of `dh`, moves a short
// leading part of the compressed data into its own IOBuf that is chained
// with the remainder, and checks that uncompress() without an explicit
// length restores the original length and hash.
// NOTE(review): the statement that defines `breakPoint` is only partly
// visible in this excerpt.
302 void CompressionVarintTest::runSimpleTest(const DataHolder& dh) {
303 auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength_));
304 auto compressed = codec_->compress(original.get());
308 std::max(uint64_t(9), oneBasedMsbPos(uncompressedLength_)) / 9UL);
// Copy at most breakPoint leading bytes of the compressed buffer.
309 auto tinyBuf = IOBuf::copyBuffer(compressed->data(),
310 std::min(compressed->length(), breakPoint));
// NOTE(review): the copy above is clamped to compressed->length() but this
// trim is not — confirm breakPoint can never exceed that length.
311 compressed->trimStart(breakPoint);
// Link the remainder into tinyBuf's chain; tinyBuf becomes the buffer that
// is handed to uncompress().
312 tinyBuf->prependChain(std::move(compressed));
313 compressed = std::move(tinyBuf);
315 auto uncompressed = codec_->uncompress(compressed.get());
317 EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
318 EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
// Varint-size round trip over the random-filled buffer.
321 TEST_P(CompressionVarintTest, RandomData) {
322 runSimpleTest(randomDataHolder);
// Varint-size round trip over the constant-filled buffer.
325 TEST_P(CompressionVarintTest, ConstantData) {
326 runSimpleTest(constantDataHolder);
// Parameter sources for CompressionVarintTest: log2 of the uncompressed
// length, and the listed varint-size codecs filtered through
// supportedCodecs().
// NOTE(review): the generator that combines the two value lists is not
// visible in this excerpt.
329 INSTANTIATE_TEST_CASE_P(
330 CompressionVarintTest,
331 CompressionVarintTest,
333 testing::Values(0, 1, 12, 22, 25, 27),
334 testing::ValuesIn(supportedCodecs({
335 CodecType::LZ4_VARINT_SIZE,
336 CodecType::LZMA2_VARINT_SIZE,
// Fixture for the corruption-detection tests; the parameter is the codec
// type under test.
339 class CompressionCorruptionTest : public testing::TestWithParam<CodecType> {
// Instantiates the codec named by the test parameter.
341 void SetUp() override { codec_ = getCodec(GetParam()); }
343 void runSimpleTest(const DataHolder& dh);
// Codec under test.
345 std::unique_ptr<Codec> codec_;
// Compresses the first 42 bytes of `dh`, confirms the intact data round
// trips, then checks that uncompress() throws when given a wrong length and
// when the first compressed byte has been altered.
// NOTE(review): the expected exception types and some scope delimiters are
// not visible in this excerpt.
348 void CompressionCorruptionTest::runSimpleTest(const DataHolder& dh) {
349 constexpr uint64_t uncompressedLength = 42;
350 auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength));
351 auto compressed = codec_->compress(original.get());
// Sanity check: uncorrupted data round trips (length-free path).
353 if (!codec_->needsUncompressedLength()) {
354 auto uncompressed = codec_->uncompress(compressed.get());
355 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
356 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
// Sanity check: uncorrupted data round trips (explicit-length path).
359 auto uncompressed = codec_->uncompress(compressed.get(),
361 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
362 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
// A length that is one byte too large must be rejected.
365 EXPECT_THROW(codec_->uncompress(compressed.get(), uncompressedLength + 1),
368 // Corrupt the first character
369 ++(compressed->writableData()[0]);
// The altered data must be rejected on the length-free path...
371 if (!codec_->needsUncompressedLength()) {
372 EXPECT_THROW(codec_->uncompress(compressed.get()),
// ...and on the explicit-length path.
376 EXPECT_THROW(codec_->uncompress(compressed.get(), uncompressedLength),
// Corruption checks over the random-filled buffer.
380 TEST_P(CompressionCorruptionTest, RandomData) {
381 runSimpleTest(randomDataHolder);
// Corruption checks over the constant-filled buffer.
384 TEST_P(CompressionCorruptionTest, ConstantData) {
385 runSimpleTest(constantDataHolder);
// Codec list for CompressionCorruptionTest. The comments below record why
// NO_COMPRESSION and LZ4 are excluded.
// NOTE(review): most of the codec list is not visible in this excerpt.
388 INSTANTIATE_TEST_CASE_P(
389 CompressionCorruptionTest,
390 CompressionCorruptionTest,
392 // NO_COMPRESSION can't detect corruption
393 // LZ4 can't detect corruption reliably (sigh)
399 CodecType::LZ4_FRAME,
// Fixture for tests of the automatic uncompression codec; the parameter is
// the codec type used to produce the compressed input.
403 class AutomaticCodecTest : public testing::TestWithParam<CodecType> {
// Creates the parameterized codec and an auto-uncompression codec built
// with no extra arguments.
405 void SetUp() override {
406 codec_ = getCodec(GetParam());
407 auto_ = getAutoUncompressionCodec();
410 void runSimpleTest(const DataHolder& dh);
// Codec named by the test parameter.
412 std::unique_ptr<Codec> codec_;
// Result of getAutoUncompressionCodec().
413 std::unique_ptr<Codec> auto_;
// Compresses the first 1000 bytes of `dh` with codec_ and checks that auto_
// restores them, first from the compressed buffer as-is and then from
// two-buffer chains whose first buffer holds only the first i bytes, for
// i = 0..7.
// NOTE(review): the statement that trims `rest` and some scope delimiters
// are not visible in this excerpt.
416 void AutomaticCodecTest::runSimpleTest(const DataHolder& dh) {
417 constexpr uint64_t uncompressedLength = 1000;
418 auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength));
419 auto compressed = codec_->compress(original.get());
// Length-free path, only when codec_ does not require a length.
421 if (!codec_->needsUncompressedLength()) {
422 auto uncompressed = auto_->uncompress(compressed.get());
423 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
424 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
// Explicit-length path.
427 auto uncompressed = auto_->uncompress(compressed.get(), uncompressedLength);
428 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
429 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
// The loop below splits within the first 8 bytes, so require at least that
// many.
431 ASSERT_GE(compressed->computeChainDataLength(), 8);
432 for (size_t i = 0; i < 8; ++i) {
433 auto split = compressed->clone();
434 auto rest = compressed->clone();
// Keep only the first i bytes in `split`.
435 split->trimEnd(split->length() - i);
437 split->appendChain(std::move(rest));
438 auto uncompressed = auto_->uncompress(split.get(), uncompressedLength);
439 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
440 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
// Automatic uncompression over the random-filled buffer.
444 TEST_P(AutomaticCodecTest, RandomData) {
445 runSimpleTest(randomDataHolder);
// Automatic uncompression over the constant-filled buffer.
448 TEST_P(AutomaticCodecTest, ConstantData) {
449 runSimpleTest(constantDataHolder);
// Every prefix reported by codec_->validPrefixes() must be non-empty, and a
// buffer built from that prefix must be accepted by canUncompress() on both
// codec_ and auto_.
// NOTE(review): one statement between the IOBuf construction and the
// expectations is not visible in this excerpt.
452 TEST_P(AutomaticCodecTest, ValidPrefixes) {
453 const auto prefixes = codec_->validPrefixes();
454 for (const auto& prefix : prefixes) {
455 EXPECT_FALSE(prefix.empty());
456 // Ensure that all strings are at least 8 bytes for LZMA2.
457 // The bytes after the prefix should be ignored by `canUncompress()`.
458 IOBuf data{IOBuf::COPY_BUFFER, prefix, 0, 8};
460 EXPECT_TRUE(codec_->canUncompress(&data));
461 EXPECT_TRUE(auto_->canUncompress(&data));
// If codec_ needs the uncompressed length, auto_ must report that it needs
// it too. Nothing is asserted when codec_ does not need it.
465 TEST_P(AutomaticCodecTest, NeedsUncompressedLength) {
466 if (codec_->needsUncompressedLength()) {
467 EXPECT_TRUE(auto_->needsUncompressedLength());
// auto_'s maxUncompressedLength() must be at least as large as codec_'s.
471 TEST_P(AutomaticCodecTest, maxUncompressedLength) {
472 EXPECT_LE(codec_->maxUncompressedLength(), auto_->maxUncompressedLength());
// Builds an auto-uncompression codec from a list holding only a ZSTD codec,
// then checks that it restores 42 bytes of constant data compressed with
// codec_, without being given a length.
475 TEST_P(AutomaticCodecTest, DefaultCodec) {
476 const uint64_t length = 42;
477 std::vector<std::unique_ptr<Codec>> codecs;
478 codecs.push_back(getCodec(CodecType::ZSTD));
479 auto automatic = getAutoUncompressionCodec(std::move(codecs));
480 auto original = IOBuf::wrapBuffer(constantDataHolder.data(length));
481 auto compressed = codec_->compress(original.get());
482 auto decompressed = automatic->uncompress(compressed.get());
// Compare content through the hash of the source data.
484 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
// Test-only codec that wraps another codec and puts a fixed byte prefix in
// front of that codec's compressed output. It reports
// CodecType::USER_DEFINED as its type.
// NOTE(review): access specifiers, the prefix_ member declaration, and
// several method bodies are not fully visible in this excerpt.
488 class CustomCodec : public Codec {
// Factory: builds a CustomCodec with the given prefix around a codec of
// `type`.
490 static std::unique_ptr<Codec> create(std::string prefix, CodecType type) {
491 return make_unique<CustomCodec>(std::move(prefix), type);
// Stores the prefix and creates the wrapped codec via getCodec(type).
493 explicit CustomCodec(std::string prefix, CodecType type)
494 : Codec(CodecType::USER_DEFINED),
495 prefix_(std::move(prefix)),
496 codec_(getCodec(type)) {}
// Body not visible in this excerpt.
499 std::vector<std::string> validPrefixes() const override {
// Coalesces `data` into one buffer and compares its first prefix_.size()
// bytes with prefix_. The length argument is unnamed and unused.
503 bool canUncompress(const IOBuf* data, Optional<uint64_t>) const override {
504 auto clone = data->cloneCoalescedAsValue();
// Input shorter than the prefix is handled separately (branch body not
// visible).
505 if (clone.length() < prefix_.size()) {
508 return memcmp(clone.data(), prefix_.data(), prefix_.size()) == 0;
// Produces a chain of prefix_ followed by the wrapped codec's compressed
// output, and checks that canUncompress() accepts the result.
511 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override {
512 auto result = IOBuf::copyBuffer(prefix_);
513 result->appendChain(codec_->compress(data));
514 EXPECT_TRUE(canUncompress(result.get(), data->computeChainDataLength()));
// Checks canUncompress(), drops the first prefix_.size() bytes from a
// coalesced copy of the input, and hands the rest to the wrapped codec.
518 std::unique_ptr<IOBuf> doUncompress(
520 Optional<uint64_t> uncompressedLength) override {
521 EXPECT_TRUE(canUncompress(data, uncompressedLength));
522 auto clone = data->cloneCoalescedAsValue();
523 clone.trimStart(prefix_.size());
524 return codec_->uncompress(&clone, uncompressedLength);
// Wrapped codec that does the actual compression.
528 std::unique_ptr<Codec> codec_;
// Registers a CustomCodec ("ab" prefix around ZSTD) with an
// auto-uncompression codec and checks three things on 42 bytes of constant
// data: the custom-prefixed data is restored by that codec, the fixture's
// auto_ does not accept it, and data compressed with codec_ is still
// restored.
532 TEST_P(AutomaticCodecTest, CustomCodec) {
533 const uint64_t length = 42;
// `ab` compresses; a second instance with the same settings is moved into
// `automatic`.
534 auto ab = CustomCodec::create("ab", CodecType::ZSTD);
535 std::vector<std::unique_ptr<Codec>> codecs;
536 codecs.push_back(CustomCodec::create("ab", CodecType::ZSTD));
537 auto automatic = getAutoUncompressionCodec(std::move(codecs));
538 auto original = IOBuf::wrapBuffer(constantDataHolder.data(length));
// Data carrying the custom prefix.
540 auto abCompressed = ab->compress(original.get());
541 auto abDecompressed = automatic->uncompress(abCompressed.get());
542 EXPECT_TRUE(automatic->canUncompress(abCompressed.get()));
// auto_ was built without the custom codec.
543 EXPECT_FALSE(auto_->canUncompress(abCompressed.get()));
544 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(abDecompressed.get()));
// Data produced by the parameterized codec.
546 auto compressed = codec_->compress(original.get());
547 auto decompressed = automatic->uncompress(compressed.get());
548 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
// Builds an auto-uncompression codec from a CustomCodec ("none" prefix
// around NO_COMPRESSION) plus an LZ4_FRAME codec. On 42 bytes of constant
// data it checks that the custom-prefixed data is restored by that codec,
// that the fixture's auto_ does not accept it, and that data compressed
// with codec_ is still restored.
551 TEST_P(AutomaticCodecTest, CustomDefaultCodec) {
552 const uint64_t length = 42;
// `none` compresses; a second instance with the same settings is moved into
// `automatic`.
553 auto none = CustomCodec::create("none", CodecType::NO_COMPRESSION);
554 std::vector<std::unique_ptr<Codec>> codecs;
555 codecs.push_back(CustomCodec::create("none", CodecType::NO_COMPRESSION));
556 codecs.push_back(getCodec(CodecType::LZ4_FRAME));
557 auto automatic = getAutoUncompressionCodec(std::move(codecs));
558 auto original = IOBuf::wrapBuffer(constantDataHolder.data(length));
// Data carrying the custom prefix.
560 auto noneCompressed = none->compress(original.get());
561 auto noneDecompressed = automatic->uncompress(noneCompressed.get());
562 EXPECT_TRUE(automatic->canUncompress(noneCompressed.get()));
// auto_ was built without the custom codec.
563 EXPECT_FALSE(auto_->canUncompress(noneCompressed.get()));
564 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(noneDecompressed.get()));
// Data produced by the parameterized codec.
566 auto compressed = codec_->compress(original.get());
567 auto decompressed = automatic->uncompress(compressed.get());
568 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
// canUncompress() must return false for this buffer on both codec_ and
// auto_, with and without an uncompressed-length hint.
// NOTE(review): one statement between the buffer construction and the
// expectations is not visible in this excerpt.
571 TEST_P(AutomaticCodecTest, canUncompressOneBytes) {
572 // No default codec can uncompress a single byte.
573 IOBuf buf{IOBuf::CREATE, 1};
575 EXPECT_FALSE(codec_->canUncompress(&buf, 1));
576 EXPECT_FALSE(codec_->canUncompress(&buf, folly::none));
577 EXPECT_FALSE(auto_->canUncompress(&buf, 1));
578 EXPECT_FALSE(auto_->canUncompress(&buf, folly::none));
// Codec list for a parameterized test suite.
// NOTE(review): the suite name and most of the codec list are not visible
// in this excerpt; from its position this is presumably the
// AutomaticCodecTest instantiation — confirm.
581 INSTANTIATE_TEST_CASE_P(
585 CodecType::LZ4_FRAME,
// An auto-uncompression codec built with a CustomCodec whose prefix is
// "none" must list "none" among its validPrefixes().
592 TEST(ValidPrefixesTest, CustomCodec) {
593 std::vector<std::unique_ptr<Codec>> codecs;
594 codecs.push_back(CustomCodec::create("none", CodecType::NO_COMPRESSION));
595 const auto none = getAutoUncompressionCodec(std::move(codecs));
596 const auto prefixes = none->validPrefixes();
597 const auto it = std::find(prefixes.begin(), prefixes.end(), "none");
598 EXPECT_TRUE(it != prefixes.end());
// Expands to EXPECT_THROW(statement, expected_exception) on one path and
// EXPECT_NO_THROW(statement) on the other.
// NOTE(review): the condition that selects between the two is not visible
// in this excerpt; the macro name suggests a debug-build check — confirm.
601 #define EXPECT_THROW_IF_DEBUG(statement, expected_exception) \
604 EXPECT_THROW((statement), expected_exception); \
606 EXPECT_NO_THROW((statement)); \
// Two custom codecs where the second codec's prefix ("ab") is a prefix of
// the first's ("abc"). Building the auto-uncompression codec is checked with
// EXPECT_THROW_IF_DEBUG against std::invalid_argument.
610 TEST(CheckCompatibleTest, SimplePrefixSecond) {
611 std::vector<std::unique_ptr<Codec>> codecs;
612 codecs.push_back(CustomCodec::create("abc", CodecType::NO_COMPRESSION));
613 codecs.push_back(CustomCodec::create("ab", CodecType::NO_COMPRESSION));
614 EXPECT_THROW_IF_DEBUG(
615 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
// Two custom codecs where the first codec's prefix ("ab") is a prefix of the
// second's ("abc"). Building the auto-uncompression codec is checked with
// EXPECT_THROW_IF_DEBUG against std::invalid_argument.
618 TEST(CheckCompatibleTest, SimplePrefixFirst) {
619 std::vector<std::unique_ptr<Codec>> codecs;
620 codecs.push_back(CustomCodec::create("ab", CodecType::NO_COMPRESSION));
621 codecs.push_back(CustomCodec::create("abc", CodecType::NO_COMPRESSION));
622 EXPECT_THROW_IF_DEBUG(
623 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
// A custom codec with an empty prefix. Building the auto-uncompression codec
// is checked with EXPECT_THROW_IF_DEBUG against std::invalid_argument.
626 TEST(CheckCompatibleTest, Empty) {
627 std::vector<std::unique_ptr<Codec>> codecs;
628 codecs.push_back(CustomCodec::create("", CodecType::NO_COMPRESSION));
629 EXPECT_THROW_IF_DEBUG(
630 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
// A custom codec whose prefix is the three bytes 28 B5 2F. Building the
// auto-uncompression codec is checked with EXPECT_THROW_IF_DEBUG against
// std::invalid_argument.
// NOTE(review): per the test name these bytes are presumably the start of
// the zstd prefix — confirm against the zstd codec's validPrefixes().
633 TEST(CheckCompatibleTest, ZstdPrefix) {
634 std::vector<std::unique_ptr<Codec>> codecs;
635 codecs.push_back(CustomCodec::create("\x28\xB5\x2F", CodecType::ZSTD));
636 EXPECT_THROW_IF_DEBUG(
637 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
// A custom codec whose prefix is the four bytes 28 B5 2F FD. Building the
// auto-uncompression codec is checked with EXPECT_THROW_IF_DEBUG against
// std::invalid_argument.
// NOTE(review): per the test name these bytes presumably equal the zstd
// codec's own prefix — confirm against its validPrefixes().
640 TEST(CheckCompatibleTest, ZstdDuplicate) {
641 std::vector<std::unique_ptr<Codec>> codecs;
642 codecs.push_back(CustomCodec::create("\x28\xB5\x2F\xFD", CodecType::ZSTD));
643 EXPECT_THROW_IF_DEBUG(
644 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
// A custom codec whose prefix starts with the bytes 18 76 followed by
// "zzasdf". Building the auto-uncompression codec is checked with
// EXPECT_THROW_IF_DEBUG against std::invalid_argument.
// NOTE(review): per the test name a zlib prefix is presumably a prefix of
// this string — confirm against the zlib codec's validPrefixes().
647 TEST(CheckCompatibleTest, ZlibIsPrefix) {
648 std::vector<std::unique_ptr<Codec>> codecs;
649 codecs.push_back(CustomCodec::create("\x18\x76zzasdf", CodecType::ZSTD));
650 EXPECT_THROW_IF_DEBUG(
651 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
// Test entry point: initializes googletest, parses the gflags command line,
// runs all tests, then calls folly::runBenchmarksOnFlag().
// NOTE(review): the return statement is not visible in this excerpt;
// presumably `ret` is returned — confirm.
655 int main(int argc, char *argv[]) {
// googletest consumes its own flags first; gflags then parses what is left.
656 testing::InitGoogleTest(&argc, argv);
657 gflags::ParseCommandLineFlags(&argc, &argv, true);
659 auto ret = RUN_ALL_TESTS();
661 folly::runBenchmarksOnFlag();