2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/io/Compression.h>
22 #include <unordered_map>
24 #include <boost/noncopyable.hpp>
25 #include <glog/logging.h>
27 #include <folly/Benchmark.h>
28 #include <folly/Hash.h>
29 #include <folly/Memory.h>
30 #include <folly/Random.h>
31 #include <folly/Varint.h>
32 #include <folly/io/IOBufQueue.h>
33 #include <folly/portability/GTest.h>
35 namespace folly { namespace io { namespace test {
37 class DataHolder : private boost::noncopyable {
39 uint64_t hash(size_t size) const;
40 ByteRange data(size_t size) const;
43 explicit DataHolder(size_t sizeLog2);
45 std::unique_ptr<uint8_t[]> data_;
46 mutable std::unordered_map<uint64_t, uint64_t> hashCache_;
49 DataHolder::DataHolder(size_t sizeLog2)
50 : size_(size_t(1) << sizeLog2),
51 data_(new uint8_t[size_]) {
54 uint64_t DataHolder::hash(size_t size) const {
55 CHECK_LE(size, size_);
56 auto p = hashCache_.find(size);
57 if (p != hashCache_.end()) {
61 uint64_t h = folly::hash::fnv64_buf(data_.get(), size);
66 ByteRange DataHolder::data(size_t size) const {
67 CHECK_LE(size, size_);
68 return ByteRange(data_.get(), size);
71 uint64_t hashIOBuf(const IOBuf* buf) {
72 uint64_t h = folly::hash::FNV_64_HASH_START;
73 for (auto& range : *buf) {
74 h = folly::hash::fnv64_buf(range.data(), range.size(), h);
79 class RandomDataHolder : public DataHolder {
81 explicit RandomDataHolder(size_t sizeLog2);
84 RandomDataHolder::RandomDataHolder(size_t sizeLog2)
85 : DataHolder(sizeLog2) {
86 constexpr size_t numThreadsLog2 = 3;
87 constexpr size_t numThreads = size_t(1) << numThreadsLog2;
89 uint32_t seed = randomNumberSeed();
91 std::vector<std::thread> threads;
92 threads.reserve(numThreads);
93 for (size_t t = 0; t < numThreads; ++t) {
94 threads.emplace_back([this, seed, t, sizeLog2] {
95 std::mt19937 rng(seed + t);
96 size_t countLog2 = sizeLog2 - numThreadsLog2;
97 size_t start = size_t(t) << countLog2;
98 for (size_t i = 0; i < countLog2; ++i) {
99 this->data_[start + i] = rng();
104 for (auto& t : threads) {
109 class ConstantDataHolder : public DataHolder {
111 explicit ConstantDataHolder(size_t sizeLog2);
114 ConstantDataHolder::ConstantDataHolder(size_t sizeLog2)
115 : DataHolder(sizeLog2) {
116 memset(data_.get(), 'a', size_);
119 constexpr size_t dataSizeLog2 = 27; // 128MiB
120 RandomDataHolder randomDataHolder(dataSizeLog2);
121 ConstantDataHolder constantDataHolder(dataSizeLog2);
123 // The intersection of the provided codecs & those that are compiled in.
124 static std::vector<CodecType> supportedCodecs(std::vector<CodecType> const& v) {
125 std::vector<CodecType> supported;
130 std::back_inserter(supported),
136 // All compiled-in compression codecs.
137 static std::vector<CodecType> availableCodecs() {
138 std::vector<CodecType> codecs;
140 for (size_t i = 0; i < static_cast<size_t>(CodecType::NUM_CODEC_TYPES); ++i) {
141 auto type = static_cast<CodecType>(i);
142 if (hasCodec(type)) {
143 codecs.push_back(type);
150 TEST(CompressionTestNeedsUncompressedLength, Simple) {
151 static const struct { CodecType type; bool needsUncompressedLength; }
153 { CodecType::NO_COMPRESSION, false },
154 { CodecType::LZ4, true },
155 { CodecType::SNAPPY, false },
156 { CodecType::ZLIB, false },
157 { CodecType::LZ4_VARINT_SIZE, false },
158 { CodecType::LZMA2, false },
159 { CodecType::LZMA2_VARINT_SIZE, false },
160 { CodecType::ZSTD, false },
161 { CodecType::GZIP, false },
162 { CodecType::LZ4_FRAME, false },
163 { CodecType::BZIP2, false },
166 for (auto const& test : expectations) {
167 if (hasCodec(test.type)) {
168 EXPECT_EQ(getCodec(test.type)->needsUncompressedLength(),
169 test.needsUncompressedLength);
174 class CompressionTest
175 : public testing::TestWithParam<std::tr1::tuple<int, int, CodecType>> {
177 void SetUp() override {
178 auto tup = GetParam();
179 uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup);
180 chunks_ = std::tr1::get<1>(tup);
181 codec_ = getCodec(std::tr1::get<2>(tup));
184 void runSimpleIOBufTest(const DataHolder& dh);
186 void runSimpleStringTest(const DataHolder& dh);
189 std::unique_ptr<IOBuf> split(std::unique_ptr<IOBuf> data) const;
191 uint64_t uncompressedLength_;
193 std::unique_ptr<Codec> codec_;
196 void CompressionTest::runSimpleIOBufTest(const DataHolder& dh) {
197 const auto original = split(IOBuf::wrapBuffer(dh.data(uncompressedLength_)));
198 const auto compressed = split(codec_->compress(original.get()));
199 if (!codec_->needsUncompressedLength()) {
200 auto uncompressed = codec_->uncompress(compressed.get());
201 EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
202 EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
205 auto uncompressed = codec_->uncompress(compressed.get(),
206 uncompressedLength_);
207 EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
208 EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
212 void CompressionTest::runSimpleStringTest(const DataHolder& dh) {
213 const auto original = std::string(
214 reinterpret_cast<const char*>(dh.data(uncompressedLength_).data()),
215 uncompressedLength_);
216 const auto compressed = codec_->compress(original);
217 if (!codec_->needsUncompressedLength()) {
218 auto uncompressed = codec_->uncompress(compressed);
219 EXPECT_EQ(uncompressedLength_, uncompressed.length());
220 EXPECT_EQ(uncompressed, original);
223 auto uncompressed = codec_->uncompress(compressed, uncompressedLength_);
224 EXPECT_EQ(uncompressedLength_, uncompressed.length());
225 EXPECT_EQ(uncompressed, original);
229 // Uniformly split data into (potentially empty) chunks.
230 std::unique_ptr<IOBuf> CompressionTest::split(
231 std::unique_ptr<IOBuf> data) const {
232 if (data->isChained()) {
236 const size_t size = data->computeChainDataLength();
238 std::multiset<size_t> splits;
239 for (size_t i = 1; i < chunks_; ++i) {
240 splits.insert(Random::rand64(size));
243 folly::IOBufQueue result;
246 for (size_t split : splits) {
247 result.append(IOBuf::copyBuffer(data->data() + offset, split - offset));
250 result.append(IOBuf::copyBuffer(data->data() + offset, size - offset));
252 return result.move();
255 TEST_P(CompressionTest, RandomData) {
256 runSimpleIOBufTest(randomDataHolder);
259 TEST_P(CompressionTest, ConstantData) {
260 runSimpleIOBufTest(constantDataHolder);
263 TEST_P(CompressionTest, RandomDataString) {
264 runSimpleStringTest(randomDataHolder);
267 TEST_P(CompressionTest, ConstantDataString) {
268 runSimpleStringTest(constantDataHolder);
271 INSTANTIATE_TEST_CASE_P(
275 testing::Values(0, 1, 12, 22, 25, 27),
276 testing::Values(1, 2, 3, 8, 65),
277 testing::ValuesIn(availableCodecs())));
279 class CompressionVarintTest
280 : public testing::TestWithParam<std::tr1::tuple<int, CodecType>> {
282 void SetUp() override {
283 auto tup = GetParam();
284 uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup);
285 codec_ = getCodec(std::tr1::get<1>(tup));
288 void runSimpleTest(const DataHolder& dh);
290 uint64_t uncompressedLength_;
291 std::unique_ptr<Codec> codec_;
294 inline uint64_t oneBasedMsbPos(uint64_t number) {
296 for (; number > 0; ++pos, number >>= 1) {
301 void CompressionVarintTest::runSimpleTest(const DataHolder& dh) {
302 auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength_));
303 auto compressed = codec_->compress(original.get());
307 std::max(uint64_t(9), oneBasedMsbPos(uncompressedLength_)) / 9UL);
308 auto tinyBuf = IOBuf::copyBuffer(compressed->data(),
309 std::min(compressed->length(), breakPoint));
310 compressed->trimStart(breakPoint);
311 tinyBuf->prependChain(std::move(compressed));
312 compressed = std::move(tinyBuf);
314 auto uncompressed = codec_->uncompress(compressed.get());
316 EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
317 EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
320 TEST_P(CompressionVarintTest, RandomData) {
321 runSimpleTest(randomDataHolder);
324 TEST_P(CompressionVarintTest, ConstantData) {
325 runSimpleTest(constantDataHolder);
328 INSTANTIATE_TEST_CASE_P(
329 CompressionVarintTest,
330 CompressionVarintTest,
332 testing::Values(0, 1, 12, 22, 25, 27),
333 testing::ValuesIn(supportedCodecs({
334 CodecType::LZ4_VARINT_SIZE,
335 CodecType::LZMA2_VARINT_SIZE,
338 class CompressionCorruptionTest : public testing::TestWithParam<CodecType> {
340 void SetUp() override { codec_ = getCodec(GetParam()); }
342 void runSimpleTest(const DataHolder& dh);
344 std::unique_ptr<Codec> codec_;
347 void CompressionCorruptionTest::runSimpleTest(const DataHolder& dh) {
348 constexpr uint64_t uncompressedLength = 42;
349 auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength));
350 auto compressed = codec_->compress(original.get());
352 if (!codec_->needsUncompressedLength()) {
353 auto uncompressed = codec_->uncompress(compressed.get());
354 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
355 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
358 auto uncompressed = codec_->uncompress(compressed.get(),
360 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
361 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
364 EXPECT_THROW(codec_->uncompress(compressed.get(), uncompressedLength + 1),
367 // Corrupt the first character
368 ++(compressed->writableData()[0]);
370 if (!codec_->needsUncompressedLength()) {
371 EXPECT_THROW(codec_->uncompress(compressed.get()),
375 EXPECT_THROW(codec_->uncompress(compressed.get(), uncompressedLength),
379 TEST_P(CompressionCorruptionTest, RandomData) {
380 runSimpleTest(randomDataHolder);
383 TEST_P(CompressionCorruptionTest, ConstantData) {
384 runSimpleTest(constantDataHolder);
387 INSTANTIATE_TEST_CASE_P(
388 CompressionCorruptionTest,
389 CompressionCorruptionTest,
391 // NO_COMPRESSION can't detect corruption
392 // LZ4 can't detect corruption reliably (sigh)
398 CodecType::LZ4_FRAME,
402 class AutomaticCodecTest : public testing::TestWithParam<CodecType> {
404 void SetUp() override {
405 codec_ = getCodec(GetParam());
406 auto_ = getAutoUncompressionCodec();
409 void runSimpleTest(const DataHolder& dh);
411 std::unique_ptr<Codec> codec_;
412 std::unique_ptr<Codec> auto_;
415 void AutomaticCodecTest::runSimpleTest(const DataHolder& dh) {
416 constexpr uint64_t uncompressedLength = 1000;
417 auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength));
418 auto compressed = codec_->compress(original.get());
420 if (!codec_->needsUncompressedLength()) {
421 auto uncompressed = auto_->uncompress(compressed.get());
422 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
423 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
426 auto uncompressed = auto_->uncompress(compressed.get(), uncompressedLength);
427 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
428 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
430 ASSERT_GE(compressed->computeChainDataLength(), 8);
431 for (size_t i = 0; i < 8; ++i) {
432 auto split = compressed->clone();
433 auto rest = compressed->clone();
434 split->trimEnd(split->length() - i);
436 split->appendChain(std::move(rest));
437 auto uncompressed = auto_->uncompress(split.get(), uncompressedLength);
438 EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
439 EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
443 TEST_P(AutomaticCodecTest, RandomData) {
444 runSimpleTest(randomDataHolder);
447 TEST_P(AutomaticCodecTest, ConstantData) {
448 runSimpleTest(constantDataHolder);
451 TEST_P(AutomaticCodecTest, ValidPrefixes) {
452 const auto prefixes = codec_->validPrefixes();
453 for (const auto& prefix : prefixes) {
454 EXPECT_FALSE(prefix.empty());
455 // Ensure that all strings are at least 8 bytes for LZMA2.
456 // The bytes after the prefix should be ignored by `canUncompress()`.
457 IOBuf data{IOBuf::COPY_BUFFER, prefix, 0, 8};
459 EXPECT_TRUE(codec_->canUncompress(&data));
460 EXPECT_TRUE(auto_->canUncompress(&data));
464 TEST_P(AutomaticCodecTest, NeedsUncompressedLength) {
465 if (codec_->needsUncompressedLength()) {
466 EXPECT_TRUE(auto_->needsUncompressedLength());
470 TEST_P(AutomaticCodecTest, maxUncompressedLength) {
471 EXPECT_LE(codec_->maxUncompressedLength(), auto_->maxUncompressedLength());
474 TEST_P(AutomaticCodecTest, DefaultCodec) {
475 const uint64_t length = 42;
476 std::vector<std::unique_ptr<Codec>> codecs;
477 codecs.push_back(getCodec(CodecType::ZSTD));
478 auto automatic = getAutoUncompressionCodec(std::move(codecs));
479 auto original = IOBuf::wrapBuffer(constantDataHolder.data(length));
480 auto compressed = codec_->compress(original.get());
481 auto decompressed = automatic->uncompress(compressed.get());
483 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
487 class CustomCodec : public Codec {
489 static std::unique_ptr<Codec> create(std::string prefix, CodecType type) {
490 return std::make_unique<CustomCodec>(std::move(prefix), type);
492 explicit CustomCodec(std::string prefix, CodecType type)
493 : Codec(CodecType::USER_DEFINED),
494 prefix_(std::move(prefix)),
495 codec_(getCodec(type)) {}
498 std::vector<std::string> validPrefixes() const override {
502 bool canUncompress(const IOBuf* data, Optional<uint64_t>) const override {
503 auto clone = data->cloneCoalescedAsValue();
504 if (clone.length() < prefix_.size()) {
507 return memcmp(clone.data(), prefix_.data(), prefix_.size()) == 0;
510 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override {
511 auto result = IOBuf::copyBuffer(prefix_);
512 result->appendChain(codec_->compress(data));
513 EXPECT_TRUE(canUncompress(result.get(), data->computeChainDataLength()));
517 std::unique_ptr<IOBuf> doUncompress(
519 Optional<uint64_t> uncompressedLength) override {
520 EXPECT_TRUE(canUncompress(data, uncompressedLength));
521 auto clone = data->cloneCoalescedAsValue();
522 clone.trimStart(prefix_.size());
523 return codec_->uncompress(&clone, uncompressedLength);
527 std::unique_ptr<Codec> codec_;
531 TEST_P(AutomaticCodecTest, CustomCodec) {
532 const uint64_t length = 42;
533 auto ab = CustomCodec::create("ab", CodecType::ZSTD);
534 std::vector<std::unique_ptr<Codec>> codecs;
535 codecs.push_back(CustomCodec::create("ab", CodecType::ZSTD));
536 auto automatic = getAutoUncompressionCodec(std::move(codecs));
537 auto original = IOBuf::wrapBuffer(constantDataHolder.data(length));
539 auto abCompressed = ab->compress(original.get());
540 auto abDecompressed = automatic->uncompress(abCompressed.get());
541 EXPECT_TRUE(automatic->canUncompress(abCompressed.get()));
542 EXPECT_FALSE(auto_->canUncompress(abCompressed.get()));
543 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(abDecompressed.get()));
545 auto compressed = codec_->compress(original.get());
546 auto decompressed = automatic->uncompress(compressed.get());
547 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
550 TEST_P(AutomaticCodecTest, CustomDefaultCodec) {
551 const uint64_t length = 42;
552 auto none = CustomCodec::create("none", CodecType::NO_COMPRESSION);
553 std::vector<std::unique_ptr<Codec>> codecs;
554 codecs.push_back(CustomCodec::create("none", CodecType::NO_COMPRESSION));
555 codecs.push_back(getCodec(CodecType::LZ4_FRAME));
556 auto automatic = getAutoUncompressionCodec(std::move(codecs));
557 auto original = IOBuf::wrapBuffer(constantDataHolder.data(length));
559 auto noneCompressed = none->compress(original.get());
560 auto noneDecompressed = automatic->uncompress(noneCompressed.get());
561 EXPECT_TRUE(automatic->canUncompress(noneCompressed.get()));
562 EXPECT_FALSE(auto_->canUncompress(noneCompressed.get()));
563 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(noneDecompressed.get()));
565 auto compressed = codec_->compress(original.get());
566 auto decompressed = automatic->uncompress(compressed.get());
567 EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
570 TEST_P(AutomaticCodecTest, canUncompressOneBytes) {
571 // No default codec can uncompress 1 bytes.
572 IOBuf buf{IOBuf::CREATE, 1};
574 EXPECT_FALSE(codec_->canUncompress(&buf, 1));
575 EXPECT_FALSE(codec_->canUncompress(&buf, folly::none));
576 EXPECT_FALSE(auto_->canUncompress(&buf, 1));
577 EXPECT_FALSE(auto_->canUncompress(&buf, folly::none));
580 INSTANTIATE_TEST_CASE_P(
584 CodecType::LZ4_FRAME,
591 TEST(ValidPrefixesTest, CustomCodec) {
592 std::vector<std::unique_ptr<Codec>> codecs;
593 codecs.push_back(CustomCodec::create("none", CodecType::NO_COMPRESSION));
594 const auto none = getAutoUncompressionCodec(std::move(codecs));
595 const auto prefixes = none->validPrefixes();
596 const auto it = std::find(prefixes.begin(), prefixes.end(), "none");
597 EXPECT_TRUE(it != prefixes.end());
600 #define EXPECT_THROW_IF_DEBUG(statement, expected_exception) \
603 EXPECT_THROW((statement), expected_exception); \
605 EXPECT_NO_THROW((statement)); \
609 TEST(CheckCompatibleTest, SimplePrefixSecond) {
610 std::vector<std::unique_ptr<Codec>> codecs;
611 codecs.push_back(CustomCodec::create("abc", CodecType::NO_COMPRESSION));
612 codecs.push_back(CustomCodec::create("ab", CodecType::NO_COMPRESSION));
613 EXPECT_THROW_IF_DEBUG(
614 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
617 TEST(CheckCompatibleTest, SimplePrefixFirst) {
618 std::vector<std::unique_ptr<Codec>> codecs;
619 codecs.push_back(CustomCodec::create("ab", CodecType::NO_COMPRESSION));
620 codecs.push_back(CustomCodec::create("abc", CodecType::NO_COMPRESSION));
621 EXPECT_THROW_IF_DEBUG(
622 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
625 TEST(CheckCompatibleTest, Empty) {
626 std::vector<std::unique_ptr<Codec>> codecs;
627 codecs.push_back(CustomCodec::create("", CodecType::NO_COMPRESSION));
628 EXPECT_THROW_IF_DEBUG(
629 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
632 TEST(CheckCompatibleTest, ZstdPrefix) {
633 std::vector<std::unique_ptr<Codec>> codecs;
634 codecs.push_back(CustomCodec::create("\x28\xB5\x2F", CodecType::ZSTD));
635 EXPECT_THROW_IF_DEBUG(
636 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
639 TEST(CheckCompatibleTest, ZstdDuplicate) {
640 std::vector<std::unique_ptr<Codec>> codecs;
641 codecs.push_back(CustomCodec::create("\x28\xB5\x2F\xFD", CodecType::ZSTD));
642 EXPECT_THROW_IF_DEBUG(
643 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
646 TEST(CheckCompatibleTest, ZlibIsPrefix) {
647 std::vector<std::unique_ptr<Codec>> codecs;
648 codecs.push_back(CustomCodec::create("\x18\x76zzasdf", CodecType::ZSTD));
649 EXPECT_THROW_IF_DEBUG(
650 getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
654 int main(int argc, char *argv[]) {
655 testing::InitGoogleTest(&argc, argv);
656 gflags::ParseCommandLineFlags(&argc, &argv, true);
658 auto ret = RUN_ALL_TESTS();
660 folly::runBenchmarksOnFlag();