2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/io/Compression.h>
22 #include <unordered_map>
24 #include <boost/noncopyable.hpp>
25 #include <glog/logging.h>
27 #include <folly/Benchmark.h>
28 #include <folly/Hash.h>
29 #include <folly/Memory.h>
30 #include <folly/Random.h>
31 #include <folly/Varint.h>
32 #include <folly/io/IOBufQueue.h>
33 #include <folly/portability/GTest.h>
35 namespace folly { namespace io { namespace test {
37 class DataHolder : private boost::noncopyable {
39 uint64_t hash(size_t size) const;
40 ByteRange data(size_t size) const;
43 explicit DataHolder(size_t sizeLog2);
45 std::unique_ptr<uint8_t[]> data_;
46 mutable std::unordered_map<uint64_t, uint64_t> hashCache_;
49 DataHolder::DataHolder(size_t sizeLog2)
50 : size_(size_t(1) << sizeLog2),
51 data_(new uint8_t[size_]) {
54 uint64_t DataHolder::hash(size_t size) const {
55 CHECK_LE(size, size_);
56 auto p = hashCache_.find(size);
57 if (p != hashCache_.end()) {
61 uint64_t h = folly::hash::fnv64_buf(data_.get(), size);
66 ByteRange DataHolder::data(size_t size) const {
67 CHECK_LE(size, size_);
68 return ByteRange(data_.get(), size);
71 uint64_t hashIOBuf(const IOBuf* buf) {
72 uint64_t h = folly::hash::FNV_64_HASH_START;
73 for (auto& range : *buf) {
74 h = folly::hash::fnv64_buf(range.data(), range.size(), h);
79 class RandomDataHolder : public DataHolder {
81 explicit RandomDataHolder(size_t sizeLog2);
84 RandomDataHolder::RandomDataHolder(size_t sizeLog2)
85 : DataHolder(sizeLog2) {
86 constexpr size_t numThreadsLog2 = 3;
87 constexpr size_t numThreads = size_t(1) << numThreadsLog2;
89 uint32_t seed = randomNumberSeed();
91 std::vector<std::thread> threads;
92 threads.reserve(numThreads);
93 for (size_t t = 0; t < numThreads; ++t) {
95 [this, seed, t, numThreadsLog2, sizeLog2] () {
96 std::mt19937 rng(seed + t);
97 size_t countLog2 = sizeLog2 - numThreadsLog2;
98 size_t start = size_t(t) << countLog2;
99 for (size_t i = 0; i < countLog2; ++i) {
100 this->data_[start + i] = rng();
105 for (auto& t : threads) {
110 class ConstantDataHolder : public DataHolder {
112 explicit ConstantDataHolder(size_t sizeLog2);
115 ConstantDataHolder::ConstantDataHolder(size_t sizeLog2)
116 : DataHolder(sizeLog2) {
117 memset(data_.get(), 'a', size_);
120 constexpr size_t dataSizeLog2 = 27; // 128MiB
121 RandomDataHolder randomDataHolder(dataSizeLog2);
122 ConstantDataHolder constantDataHolder(dataSizeLog2);
124 // The intersection of the provided codecs & those that are compiled in.
125 static std::vector<CodecType> supportedCodecs(std::vector<CodecType> const& v) {
126 std::vector<CodecType> supported;
131 std::back_inserter(supported),
137 // All compiled-in compression codecs.
138 static std::vector<CodecType> availableCodecs() {
139 std::vector<CodecType> codecs;
141 for (size_t i = 0; i < static_cast<size_t>(CodecType::NUM_CODEC_TYPES); ++i) {
142 auto type = static_cast<CodecType>(i);
143 if (hasCodec(type)) {
144 codecs.push_back(type);
151 TEST(CompressionTestNeedsUncompressedLength, Simple) {
152 static const struct { CodecType type; bool needsUncompressedLength; }
154 { CodecType::NO_COMPRESSION, false },
155 { CodecType::LZ4, true },
156 { CodecType::SNAPPY, false },
157 { CodecType::ZLIB, false },
158 { CodecType::LZ4_VARINT_SIZE, false },
159 { CodecType::LZMA2, false },
160 { CodecType::LZMA2_VARINT_SIZE, false },
161 { CodecType::ZSTD, false },
162 { CodecType::GZIP, false },
163 { CodecType::LZ4_FRAME, false },
166 for (auto const& test : expectations) {
167 if (hasCodec(test.type)) {
168 EXPECT_EQ(getCodec(test.type)->needsUncompressedLength(),
169 test.needsUncompressedLength);
174 class CompressionTest
175 : public testing::TestWithParam<std::tr1::tuple<int, int, CodecType>> {
177 void SetUp() override {
178 auto tup = GetParam();
179 uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup);
180 chunks_ = std::tr1::get<1>(tup);
181 codec_ = getCodec(std::tr1::get<2>(tup));
184 void runSimpleIOBufTest(const DataHolder& dh);
186 void runSimpleStringTest(const DataHolder& dh);
189 std::unique_ptr<IOBuf> split(std::unique_ptr<IOBuf> data) const;
191 uint64_t uncompressedLength_;
193 std::unique_ptr<Codec> codec_;
196 void CompressionTest::runSimpleIOBufTest(const DataHolder& dh) {
197 const auto original = split(IOBuf::wrapBuffer(dh.data(uncompressedLength_)));
198 const auto compressed = split(codec_->compress(original.get()));
199 if (!codec_->needsUncompressedLength()) {
200 auto uncompressed = codec_->uncompress(compressed.get());
201 EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
202 EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
205 auto uncompressed = codec_->uncompress(compressed.get(),
206 uncompressedLength_);
207 EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
208 EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
212 void CompressionTest::runSimpleStringTest(const DataHolder& dh) {
213 const auto original = std::string(
214 reinterpret_cast<const char*>(dh.data(uncompressedLength_).data()),
215 uncompressedLength_);
216 const auto compressed = codec_->compress(original);
217 if (!codec_->needsUncompressedLength()) {
218 auto uncompressed = codec_->uncompress(compressed);
219 EXPECT_EQ(uncompressedLength_, uncompressed.length());
220 EXPECT_EQ(uncompressed, original);
223 auto uncompressed = codec_->uncompress(compressed, uncompressedLength_);
224 EXPECT_EQ(uncompressedLength_, uncompressed.length());
225 EXPECT_EQ(uncompressed, original);
229 // Uniformly split data into (potentially empty) chunks.
230 std::unique_ptr<IOBuf> CompressionTest::split(
231 std::unique_ptr<IOBuf> data) const {
232 if (data->isChained()) {
236 const size_t size = data->computeChainDataLength();
238 std::multiset<size_t> splits;
239 for (size_t i = 1; i < chunks_; ++i) {
240 splits.insert(Random::rand64(size));
243 folly::IOBufQueue result;
246 for (size_t split : splits) {
247 result.append(IOBuf::copyBuffer(data->data() + offset, split - offset));
250 result.append(IOBuf::copyBuffer(data->data() + offset, size - offset));
252 return result.move();
255 TEST_P(CompressionTest, RandomData) {
256 runSimpleIOBufTest(randomDataHolder);
259 TEST_P(CompressionTest, ConstantData) {
260 runSimpleIOBufTest(constantDataHolder);
263 TEST_P(CompressionTest, RandomDataString) {
264 runSimpleStringTest(randomDataHolder);
267 TEST_P(CompressionTest, ConstantDataString) {
268 runSimpleStringTest(constantDataHolder);
271 INSTANTIATE_TEST_CASE_P(
275 testing::Values(0, 1, 12, 22, 25, 27),
276 testing::Values(1, 2, 3, 8, 65),
277 testing::ValuesIn(availableCodecs())));
279 class CompressionVarintTest
280 : public testing::TestWithParam<std::tr1::tuple<int, CodecType>> {
282 void SetUp() override {
283 auto tup = GetParam();
284 uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup);
285 codec_ = getCodec(std::tr1::get<1>(tup));
288 void runSimpleTest(const DataHolder& dh);
290 uint64_t uncompressedLength_;
291 std::unique_ptr<Codec> codec_;
294 inline uint64_t oneBasedMsbPos(uint64_t number) {
296 for (; number > 0; ++pos, number >>= 1) {
301 void CompressionVarintTest::runSimpleTest(const DataHolder& dh) {
302 auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength_));
303 auto compressed = codec_->compress(original.get());
307 std::max(uint64_t(9), oneBasedMsbPos(uncompressedLength_)) / 9UL);
308 auto tinyBuf = IOBuf::copyBuffer(compressed->data(),
309 std::min(compressed->length(), breakPoint));
310 compressed->trimStart(breakPoint);
311 tinyBuf->prependChain(std::move(compressed));
312 compressed = std::move(tinyBuf);
314 auto uncompressed = codec_->uncompress(compressed.get());
316 EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
317 EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
320 TEST_P(CompressionVarintTest, RandomData) {
321 runSimpleTest(randomDataHolder);
324 TEST_P(CompressionVarintTest, ConstantData) {
325 runSimpleTest(constantDataHolder);
328 INSTANTIATE_TEST_CASE_P(
329 CompressionVarintTest,
330 CompressionVarintTest,
332 testing::Values(0, 1, 12, 22, 25, 27),
333 testing::ValuesIn(supportedCodecs({
334 CodecType::LZ4_VARINT_SIZE,
335 CodecType::LZMA2_VARINT_SIZE,
338 class CompressionCorruptionTest : public testing::TestWithParam<CodecType> {
340 void SetUp() override { codec_ = getCodec(GetParam()); }
342 void runSimpleTest(const DataHolder& dh);
344 std::unique_ptr<Codec> codec_;
347 void CompressionCorruptionTest::runSimpleTest(const DataHolder& dh) {
348 constexpr uint64_t uncompressedLength = 42;
349 auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength));
350 auto compressed = codec_->compress(original.get());
352 if (!codec_->needsUncompressedLength()) {
353 auto uncompressed = codec_->uncompress(compressed.get());
354 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
355 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
358 auto uncompressed = codec_->uncompress(compressed.get(),
360 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
361 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
364 EXPECT_THROW(codec_->uncompress(compressed.get(), uncompressedLength + 1),
367 // Corrupt the first character
368 ++(compressed->writableData()[0]);
370 if (!codec_->needsUncompressedLength()) {
371 EXPECT_THROW(codec_->uncompress(compressed.get()),
375 EXPECT_THROW(codec_->uncompress(compressed.get(), uncompressedLength),
379 TEST_P(CompressionCorruptionTest, RandomData) {
380 runSimpleTest(randomDataHolder);
383 TEST_P(CompressionCorruptionTest, ConstantData) {
384 runSimpleTest(constantDataHolder);
387 INSTANTIATE_TEST_CASE_P(
388 CompressionCorruptionTest,
389 CompressionCorruptionTest,
391 // NO_COMPRESSION can't detect corruption
392 // LZ4 can't detect corruption reliably (sigh)
398 CodecType::LZ4_FRAME,
401 class AutomaticCodecTest : public testing::TestWithParam<CodecType> {
403 void SetUp() override {
404 codec_ = getCodec(GetParam());
405 auto_ = getAutoUncompressionCodec();
408 void runSimpleTest(const DataHolder& dh);
410 std::unique_ptr<Codec> codec_;
411 std::unique_ptr<Codec> auto_;
414 void AutomaticCodecTest::runSimpleTest(const DataHolder& dh) {
415 constexpr uint64_t uncompressedLength = 1000;
416 auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength));
417 auto compressed = codec_->compress(original.get());
419 if (!codec_->needsUncompressedLength()) {
420 auto uncompressed = auto_->uncompress(compressed.get());
421 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
422 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
425 auto uncompressed = auto_->uncompress(compressed.get(), uncompressedLength);
426 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
427 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
429 ASSERT_GE(compressed->computeChainDataLength(), 8);
430 for (size_t i = 0; i < 8; ++i) {
431 auto split = compressed->clone();
432 auto rest = compressed->clone();
433 split->trimEnd(split->length() - i);
435 split->appendChain(std::move(rest));
436 auto uncompressed = auto_->uncompress(split.get(), uncompressedLength);
437 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
438 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
442 TEST_P(AutomaticCodecTest, RandomData) {
443 runSimpleTest(randomDataHolder);
446 TEST_P(AutomaticCodecTest, ConstantData) {
447 runSimpleTest(constantDataHolder);
450 TEST_P(AutomaticCodecTest, ValidPrefixes) {
451 const auto prefixes = codec_->validPrefixes();
452 for (const auto& prefix : prefixes) {
453 EXPECT_FALSE(prefix.empty());
454 // Ensure that all strings are at least 8 bytes for LZMA2.
455 // The bytes after the prefix should be ignored by `canUncompress()`.
456 IOBuf data{IOBuf::COPY_BUFFER, prefix, 0, 8};
458 EXPECT_TRUE(codec_->canUncompress(&data));
459 EXPECT_TRUE(auto_->canUncompress(&data));
463 TEST_P(AutomaticCodecTest, NeedsUncompressedLength) {
464 if (codec_->needsUncompressedLength()) {
465 EXPECT_TRUE(auto_->needsUncompressedLength());
469 TEST_P(AutomaticCodecTest, maxUncompressedLength) {
470 EXPECT_LE(codec_->maxUncompressedLength(), auto_->maxUncompressedLength());
473 TEST_P(AutomaticCodecTest, DefaultCodec) {
474 const uint64_t length = 42;
475 std::vector<std::unique_ptr<Codec>> codecs;
476 codecs.push_back(getCodec(CodecType::ZSTD));
477 auto automatic = getAutoUncompressionCodec(std::move(codecs));
478 auto original = IOBuf::wrapBuffer(constantDataHolder.data(length));
479 auto compressed = codec_->compress(original.get());
480 auto decompressed = automatic->uncompress(compressed.get());
482 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
486 class CustomCodec : public Codec {
488 static std::unique_ptr<Codec> create(std::string prefix, CodecType type) {
489 return make_unique<CustomCodec>(std::move(prefix), type);
491 explicit CustomCodec(std::string prefix, CodecType type)
492 : Codec(CodecType::USER_DEFINED),
493 prefix_(std::move(prefix)),
494 codec_(getCodec(type)) {}
497 std::vector<std::string> validPrefixes() const override {
501 bool canUncompress(const IOBuf* data, uint64_t) const override {
502 auto clone = data->cloneCoalescedAsValue();
503 if (clone.length() < prefix_.size()) {
506 return memcmp(clone.data(), prefix_.data(), prefix_.size()) == 0;
509 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override {
510 auto result = IOBuf::copyBuffer(prefix_);
511 result->appendChain(codec_->compress(data));
512 EXPECT_TRUE(canUncompress(result.get(), data->computeChainDataLength()));
516 std::unique_ptr<IOBuf> doUncompress(
518 uint64_t uncompressedLength) override {
519 EXPECT_TRUE(canUncompress(data, uncompressedLength));
520 auto clone = data->cloneCoalescedAsValue();
521 clone.trimStart(prefix_.size());
522 return codec_->uncompress(&clone, uncompressedLength);
526 std::unique_ptr<Codec> codec_;
530 TEST_P(AutomaticCodecTest, CustomCodec) {
531 const uint64_t length = 42;
532 auto ab = CustomCodec::create("ab", CodecType::ZSTD);
533 std::vector<std::unique_ptr<Codec>> codecs;
534 codecs.push_back(CustomCodec::create("ab", CodecType::ZSTD));
535 auto automatic = getAutoUncompressionCodec(std::move(codecs));
536 auto original = IOBuf::wrapBuffer(constantDataHolder.data(length));
538 auto abCompressed = ab->compress(original.get());
539 auto abDecompressed = automatic->uncompress(abCompressed.get());
540 EXPECT_TRUE(automatic->canUncompress(abCompressed.get()));
541 EXPECT_FALSE(auto_->canUncompress(abCompressed.get()));
542 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(abDecompressed.get()));
544 auto compressed = codec_->compress(original.get());
545 auto decompressed = automatic->uncompress(compressed.get());
546 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
549 TEST_P(AutomaticCodecTest, CustomDefaultCodec) {
550 const uint64_t length = 42;
551 auto none = CustomCodec::create("none", CodecType::NO_COMPRESSION);
552 std::vector<std::unique_ptr<Codec>> codecs;
553 codecs.push_back(CustomCodec::create("none", CodecType::NO_COMPRESSION));
554 codecs.push_back(getCodec(CodecType::LZ4_FRAME));
555 auto automatic = getAutoUncompressionCodec(std::move(codecs));
556 auto original = IOBuf::wrapBuffer(constantDataHolder.data(length));
558 auto noneCompressed = none->compress(original.get());
559 auto noneDecompressed = automatic->uncompress(noneCompressed.get());
560 EXPECT_TRUE(automatic->canUncompress(noneCompressed.get()));
561 EXPECT_FALSE(auto_->canUncompress(noneCompressed.get()));
562 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(noneDecompressed.get()));
564 auto compressed = codec_->compress(original.get());
565 auto decompressed = automatic->uncompress(compressed.get());
566 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
569 TEST_P(AutomaticCodecTest, canUncompressOneBytes) {
570 // No default codec can uncompress 1 bytes.
571 IOBuf buf{IOBuf::CREATE, 1};
573 EXPECT_FALSE(codec_->canUncompress(&buf, 1));
574 EXPECT_FALSE(codec_->canUncompress(&buf, Codec::UNKNOWN_UNCOMPRESSED_LENGTH));
575 EXPECT_FALSE(auto_->canUncompress(&buf, 1));
576 EXPECT_FALSE(auto_->canUncompress(&buf, Codec::UNKNOWN_UNCOMPRESSED_LENGTH));
579 INSTANTIATE_TEST_CASE_P(
583 CodecType::LZ4_FRAME,
589 TEST(ValidPrefixesTest, CustomCodec) {
590 std::vector<std::unique_ptr<Codec>> codecs;
591 codecs.push_back(CustomCodec::create("none", CodecType::NO_COMPRESSION));
592 const auto none = getAutoUncompressionCodec(std::move(codecs));
593 const auto prefixes = none->validPrefixes();
594 const auto it = std::find(prefixes.begin(), prefixes.end(), "none");
595 EXPECT_TRUE(it != prefixes.end());
598 #define EXPECT_THROW_IF_DEBUG(statement, expected_exception) \
601 EXPECT_THROW((statement), expected_exception); \
603 EXPECT_NO_THROW((statement)); \
607 TEST(CheckCompatibleTest, SimplePrefixSecond) {
608 std::vector<std::unique_ptr<Codec>> codecs;
609 codecs.push_back(CustomCodec::create("abc", CodecType::NO_COMPRESSION));
610 codecs.push_back(CustomCodec::create("ab", CodecType::NO_COMPRESSION));
611 EXPECT_THROW_IF_DEBUG(
612 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
615 TEST(CheckCompatibleTest, SimplePrefixFirst) {
616 std::vector<std::unique_ptr<Codec>> codecs;
617 codecs.push_back(CustomCodec::create("ab", CodecType::NO_COMPRESSION));
618 codecs.push_back(CustomCodec::create("abc", CodecType::NO_COMPRESSION));
619 EXPECT_THROW_IF_DEBUG(
620 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
623 TEST(CheckCompatibleTest, Empty) {
624 std::vector<std::unique_ptr<Codec>> codecs;
625 codecs.push_back(CustomCodec::create("", CodecType::NO_COMPRESSION));
626 EXPECT_THROW_IF_DEBUG(
627 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
630 TEST(CheckCompatibleTest, ZstdPrefix) {
631 std::vector<std::unique_ptr<Codec>> codecs;
632 codecs.push_back(CustomCodec::create("\x28\xB5\x2F", CodecType::ZSTD));
633 EXPECT_THROW_IF_DEBUG(
634 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
637 TEST(CheckCompatibleTest, ZstdDuplicate) {
638 std::vector<std::unique_ptr<Codec>> codecs;
639 codecs.push_back(CustomCodec::create("\x28\xB5\x2F\xFD", CodecType::ZSTD));
640 EXPECT_THROW_IF_DEBUG(
641 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
644 TEST(CheckCompatibleTest, ZlibIsPrefix) {
645 std::vector<std::unique_ptr<Codec>> codecs;
646 codecs.push_back(CustomCodec::create("\x18\x76zzasdf", CodecType::ZSTD));
647 EXPECT_THROW_IF_DEBUG(
648 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
652 int main(int argc, char *argv[]) {
653 testing::InitGoogleTest(&argc, argv);
654 gflags::ParseCommandLineFlags(&argc, &argv, true);
656 auto ret = RUN_ALL_TESTS();
658 folly::runBenchmarksOnFlag();