2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/io/Compression.h>
22 #if LZ4_VERSION_NUMBER >= 10301
27 #include <glog/logging.h>
29 #if FOLLY_HAVE_LIBSNAPPY
31 #include <snappy-sinksource.h>
38 #if FOLLY_HAVE_LIBLZMA
42 #if FOLLY_HAVE_LIBZSTD
50 #include <folly/Bits.h>
51 #include <folly/Conv.h>
52 #include <folly/Memory.h>
53 #include <folly/Portability.h>
54 #include <folly/ScopeGuard.h>
55 #include <folly/Varint.h>
56 #include <folly/io/Cursor.h>
58 #include <unordered_set>
60 namespace folly { namespace io {
// Base-class constructor: records which CodecType this codec instance handles.
62 Codec::Codec(CodecType type) : type_(type) { }
// Public entry point for compressing an IOBuf chain.
// Computes the total chain length, rejects inputs longer than
// maxUncompressedLength() with std::runtime_error, and otherwise delegates to
// the codec-specific doCompress().
// NOTE(review): the condition guarding the `IOBuf::create(0)` early return is
// not visible in this view — presumably a zero-length check; confirm.
64 // Ensure consistent behavior in the nullptr case
65 std::unique_ptr<IOBuf> Codec::compress(const IOBuf* data) {
66 uint64_t len = data->computeChainDataLength();
68 return IOBuf::create(0);
70 if (len > maxUncompressedLength()) {
71 throw std::runtime_error("Codec: uncompressed length too large");
74 return doCompress(data);
// String overload of compress(): applies the maxUncompressedLength() limit to
// data.size() (throwing std::runtime_error when exceeded) and then delegates
// to doCompressString().
// NOTE(review): lines between the length computation and the limit check are
// missing from this view.
77 std::string Codec::compress(const StringPiece data) {
78 const uint64_t len = data.size();
82 if (len > maxUncompressedLength()) {
83 throw std::runtime_error("Codec: uncompressed length too large");
86 return doCompressString(data);
// Public entry point for decompressing an IOBuf chain.
// Validates the optional caller-supplied uncompressedLength first:
//  - absent, but the codec requires it -> std::invalid_argument;
//  - present and above maxUncompressedLength() -> std::runtime_error;
//  - on the path that returns an empty IOBuf, a non-zero expected length is
//    rejected with std::runtime_error.
// All remaining inputs are forwarded to doUncompress().
// NOTE(review): the guard that selects the empty-IOBuf path is not visible in
// this view — presumably an empty-input check; confirm.
89 std::unique_ptr<IOBuf> Codec::uncompress(
91 Optional<uint64_t> uncompressedLength) {
92 if (!uncompressedLength) {
93 if (needsUncompressedLength()) {
94 throw std::invalid_argument("Codec: uncompressed length required");
96 } else if (*uncompressedLength > maxUncompressedLength()) {
97 throw std::runtime_error("Codec: uncompressed length too large");
101 if (uncompressedLength.value_or(0) != 0) {
102 throw std::runtime_error("Codec: invalid uncompressed length");
104 return IOBuf::create(0);
107 return doUncompress(data, uncompressedLength);
// String overload of uncompress(). Performs the same uncompressedLength
// validation as the IOBuf overload (std::invalid_argument when a required
// length is absent, std::runtime_error when it exceeds
// maxUncompressedLength() or is non-zero on the guarded path) and then
// delegates to doUncompressString().
// NOTE(review): the guard around the "invalid uncompressed length" check and
// whatever is returned on that path are not visible in this view.
110 std::string Codec::uncompress(
111 const StringPiece data,
112 Optional<uint64_t> uncompressedLength) {
113 if (!uncompressedLength) {
114 if (needsUncompressedLength()) {
115 throw std::invalid_argument("Codec: uncompressed length required");
117 } else if (*uncompressedLength > maxUncompressedLength()) {
118 throw std::runtime_error("Codec: uncompressed length too large");
122 if (uncompressedLength.value_or(0) != 0) {
123 throw std::runtime_error("Codec: invalid uncompressed length");
128 return doUncompressString(data, uncompressedLength);
// Public accessor; forwards to the overridable doNeedsUncompressedLength().
131 bool Codec::needsUncompressedLength() const {
132 return doNeedsUncompressedLength();
// Public accessor; forwards to the overridable doMaxUncompressedLength().
135 uint64_t Codec::maxUncompressedLength() const {
136 return doMaxUncompressedLength();
// Base-class default for the "is the uncompressed length required?" hook.
// NOTE(review): the returned value is not visible in this view.
139 bool Codec::doNeedsUncompressedLength() const {
// Base-class default for the size-limit hook: reports
// UNLIMITED_UNCOMPRESSED_LENGTH.
143 uint64_t Codec::doMaxUncompressedLength() const {
144 return UNLIMITED_UNCOMPRESSED_LENGTH;
// Base-class default for the list of byte prefixes that identify this codec's
// compressed format.
// NOTE(review): the returned value is not visible in this view.
147 std::vector<std::string> Codec::validPrefixes() const {
// Base-class default for format detection; both parameters are unnamed and
// therefore unused here.
// NOTE(review): the returned value is not visible in this view.
151 bool Codec::canUncompress(const IOBuf*, Optional<uint64_t>) const {
// Default string compression built on top of doCompress(): wraps `data` in an
// IOBuf (WRAP_BUFFER), compresses it, and flattens the resulting chain into a
// std::string.
// NOTE(review): the declaration and return of `output` are not visible in this
// view.
155 std::string Codec::doCompressString(const StringPiece data) {
156 const IOBuf inputBuffer{IOBuf::WRAP_BUFFER, data};
157 auto outputBuffer = doCompress(&inputBuffer);
// Reserve once for the whole chain, then append each buffer's bytes in order.
159 output.reserve(outputBuffer->computeChainDataLength());
160 for (auto range : *outputBuffer) {
161 output.append(reinterpret_cast<const char*>(range.data()), range.size());
// Default string decompression built on top of doUncompress(): wraps `data`
// in an IOBuf (WRAP_BUFFER), decompresses it (passing uncompressedLength
// through unchanged), and flattens the resulting chain into a std::string.
// NOTE(review): the declaration and return of `output` are not visible in this
// view.
166 std::string Codec::doUncompressString(
167 const StringPiece data,
168 Optional<uint64_t> uncompressedLength) {
169 const IOBuf inputBuffer{IOBuf::WRAP_BUFFER, data};
170 auto outputBuffer = doUncompress(&inputBuffer, uncompressedLength);
// Reserve once for the whole chain, then append each buffer's bytes in order.
172 output.reserve(outputBuffer->computeChainDataLength());
173 for (auto range : *outputBuffer) {
174 output.append(reinterpret_cast<const char*>(range.data()), range.size());
// Codec implementation for CodecType::NO_COMPRESSION. Declares a static
// factory, an explicit constructor, and overrides of the doCompress() /
// doUncompress() hooks.
// NOTE(review): access specifiers and some parameter lines are missing from
// this view.
184 class NoCompressionCodec final : public Codec {
186 static std::unique_ptr<Codec> create(int level, CodecType type);
187 explicit NoCompressionCodec(int level, CodecType type);
190 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
191 std::unique_ptr<IOBuf> doUncompress(
193 Optional<uint64_t> uncompressedLength) override;
// Factory: builds a NoCompressionCodec for the given level and type.
196 std::unique_ptr<Codec> NoCompressionCodec::create(int level, CodecType type) {
197 return std::make_unique<NoCompressionCodec>(level, type);
// Constructor. Debug-asserts that `type` is NO_COMPRESSION and validates
// `level`: the three symbolic levels (DEFAULT, FASTEST, BEST) are listed as
// cases, and an unaccepted level raises std::invalid_argument.
// NOTE(review): the member-initializer list, the switch header, and any other
// accepted case labels are not visible in this view.
200 NoCompressionCodec::NoCompressionCodec(int level, CodecType type)
202 DCHECK(type == CodecType::NO_COMPRESSION);
204 case COMPRESSION_LEVEL_DEFAULT:
205 case COMPRESSION_LEVEL_FASTEST:
206 case COMPRESSION_LEVEL_BEST:
210 throw std::invalid_argument(to<std::string>(
211 "NoCompressionCodec: invalid level ", level));
// "Compression" is the identity: returns a clone of the input chain.
215 std::unique_ptr<IOBuf> NoCompressionCodec::doCompress(
217 return data->clone();
// "Decompression" is the identity: returns a clone of the input chain.
// When the caller supplies an expected length it must equal the chain's
// actual data length, otherwise std::runtime_error is thrown.
220 std::unique_ptr<IOBuf> NoCompressionCodec::doUncompress(
222 Optional<uint64_t> uncompressedLength) {
223 if (uncompressedLength &&
224 data->computeChainDataLength() != *uncompressedLength) {
225 throw std::runtime_error(
226 to<std::string>("NoCompressionCodec: invalid uncompressed length"));
228 return data->clone();
231 #if (FOLLY_HAVE_LIBLZ4 || FOLLY_HAVE_LIBLZMA)
// Appends `val`, varint-encoded, to the tail of `out`.
// Precondition (debug-checked): `out` has at least kMaxVarintLength64 bytes of
// tailroom. encodeVarint() writes at the writable tail and its return value
// is used as the number of bytes to append.
235 void encodeVarintToIOBuf(uint64_t val, folly::IOBuf* out) {
236 DCHECK_GE(out->tailroom(), kMaxVarintLength64);
237 out->append(encodeVarint(val, out->writableTail()));
// Decodes a varint from `cursor`, advancing it one byte at a time.
// Each byte contributes its low 7 bits, placed at increasing 7-bit shifts
// (0, 7, ..., 63). Throws std::invalid_argument for a value that is too big.
// NOTE(review): the declarations of `val`/`b`, the loop-exit test on the
// continuation bit, and the return are not visible in this view.
240 inline uint64_t decodeVarintFromCursor(folly::io::Cursor& cursor) {
243 for (int shift = 0; shift <= 63; shift += 7) {
244 b = cursor.read<int8_t>();
245 val |= static_cast<uint64_t>(b & 0x7f) << shift;
251 throw std::invalid_argument("Invalid varint value. Too big.");
258 #endif // FOLLY_HAVE_LIBLZ4 || FOLLY_HAVE_LIBLZMA
262 * Reads sizeof(T) bytes, and returns false if not enough bytes are available.
263 * Returns true if the first n bytes are equal to prefix when interpreted as
// Tests whether the first `n` bytes of `data`, read as a little-endian T,
// equal `prefix`. Only enabled for unsigned T; `n` defaults to sizeof(T) and
// is debug-checked to be no larger than sizeof(T).
// NOTE(review): the cursor/value declarations and the statement executed when
// tryReadLE() fails are not visible in this view.
266 template <typename T>
267 typename std::enable_if<std::is_unsigned<T>::value, bool>::type
268 dataStartsWithLE(const IOBuf* data, T prefix, uint64_t n = sizeof(T)) {
270 DCHECK_LE(n, sizeof(T));
273 if (!cursor.tryReadLE(value)) {
// Keep only the low n bytes of the value read. n == sizeof(T) is special-cased
// because shifting T(1) by the full bit width of T would be undefined.
276 const T mask = n == sizeof(T) ? T(-1) : (T(1) << (8 * n)) - 1;
277 return prefix == (value & mask);
// Serializes the first `n` bytes of `prefix` in little-endian byte order into
// a std::string. Enabled for arithmetic T; `n` defaults to sizeof(T) and is
// debug-checked to be no larger than sizeof(T).
// NOTE(review): the declaration/sizing of `result` and the return are not
// visible in this view.
280 template <typename T>
281 typename std::enable_if<std::is_arithmetic<T>::value, std::string>::type
282 prefixToStringLE(T prefix, uint64_t n = sizeof(T)) {
284 DCHECK_LE(n, sizeof(T));
// Convert to little-endian so the memcpy below emits the low-order bytes first.
285 prefix = Endian::little(prefix);
288 memcpy(&result[0], &prefix, n);
// Picks an output-buffer size for decompression: four times the larger of
// `blockSize` and `compressedLength`, capped at 4 MiB.
292 static uint64_t computeBufferLength(
293 uint64_t const compressedLength,
294 uint64_t const blockSize) {
295 uint64_t constexpr kMaxBufferLength = uint64_t(4) << 20; // 4 MiB
296 uint64_t const goodBufferSize = 4 * std::max(blockSize, compressedLength);
297 return std::min(goodBufferSize, kMaxBufferLength);
301 #if FOLLY_HAVE_LIBLZ4
// Codec for the LZ4 block format. Handles two CodecTypes: LZ4 and
// LZ4_VARINT_SIZE (see encodeSize()).
// NOTE(review): access specifiers and some parameter lines are missing from
// this view.
306 class LZ4Codec final : public Codec {
308 static std::unique_ptr<Codec> create(int level, CodecType type);
309 explicit LZ4Codec(int level, CodecType type);
312 bool doNeedsUncompressedLength() const override;
313 uint64_t doMaxUncompressedLength() const override;
// True when this instance is the LZ4_VARINT_SIZE variant.
315 bool encodeSize() const { return type() == CodecType::LZ4_VARINT_SIZE; }
317 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
318 std::unique_ptr<IOBuf> doUncompress(
320 Optional<uint64_t> uncompressedLength) override;
// Selects the HC compression entry points in doCompress().
322 bool highCompression_;
// Factory: builds an LZ4Codec for the given level and type.
325 std::unique_ptr<Codec> LZ4Codec::create(int level, CodecType type) {
326 return std::make_unique<LZ4Codec>(level, type);
// Constructor. Debug-asserts the codec type, resolves the symbolic levels
// (FASTEST/DEFAULT/BEST), and requires the resulting level to be 1 or 2,
// throwing std::invalid_argument otherwise. Level 2 enables high compression.
// NOTE(review): the switch header and the values assigned for each symbolic
// level are not visible in this view.
329 LZ4Codec::LZ4Codec(int level, CodecType type) : Codec(type) {
330 DCHECK(type == CodecType::LZ4 || type == CodecType::LZ4_VARINT_SIZE);
333 case COMPRESSION_LEVEL_FASTEST:
334 case COMPRESSION_LEVEL_DEFAULT:
337 case COMPRESSION_LEVEL_BEST:
341 if (level < 1 || level > 2) {
342 throw std::invalid_argument(to<std::string>(
343 "LZ4Codec: invalid level: ", level));
345 highCompression_ = (level > 1);
// The caller must supply the uncompressed length unless this is the
// LZ4_VARINT_SIZE variant (encodeSize() is true).
348 bool LZ4Codec::doNeedsUncompressedLength() const {
349 return !encodeSize();
352 // The value comes from lz4.h in lz4-r117, but older versions of lz4 don't
353 // define LZ4_MAX_INPUT_SIZE (even though the max size is the same), so do it
355 #ifndef LZ4_MAX_INPUT_SIZE
356 # define LZ4_MAX_INPUT_SIZE 0x7E000000
// Largest input this codec accepts: LZ4_MAX_INPUT_SIZE.
359 uint64_t LZ4Codec::doMaxUncompressedLength() const {
360 return LZ4_MAX_INPUT_SIZE;
// Compresses `data` as a single LZ4 block.
// A chained input is coalesced first. The output buffer is sized with
// LZ4_compressBound(), plus kMaxVarintLength64 extra bytes when encodeSize()
// is true so the varint-encoded input length can be written ahead of the
// payload. The compression entry point depends on the LZ4 library version and
// on highCompression_.
// NOTE(review): the `clone`/`n` declarations, the condition guarding the
// varint write, the else branches, and the final append/return are not
// visible in this view.
363 std::unique_ptr<IOBuf> LZ4Codec::doCompress(const IOBuf* data) {
365 if (data->isChained()) {
366 // LZ4 doesn't support streaming, so we have to coalesce
367 clone = data->cloneCoalescedAsValue();
371 uint32_t extraSize = encodeSize() ? kMaxVarintLength64 : 0;
372 auto out = IOBuf::create(extraSize + LZ4_compressBound(data->length()));
374 encodeVarintToIOBuf(data->length(), out.get());
378 auto input = reinterpret_cast<const char*>(data->data());
379 auto output = reinterpret_cast<char*>(out->writableTail());
380 const auto inputLength = data->length();
// Newer LZ4 releases take an explicit destination capacity; the older entry
// points below do not.
381 #if LZ4_VERSION_NUMBER >= 10700
382 if (highCompression_) {
383 n = LZ4_compress_HC(input, output, inputLength, out->tailroom(), 0);
385 n = LZ4_compress_default(input, output, inputLength, out->tailroom());
388 if (highCompression_) {
389 n = LZ4_compressHC(input, output, inputLength);
391 n = LZ4_compress(input, output, inputLength);
396 CHECK_LE(n, out->capacity());
// Decompresses a single LZ4 block.
// A chained input is coalesced first. The expected output size comes either
// from a varint decoded at the front of the input (which must agree with any
// caller-supplied length, else std::runtime_error) or from the caller-supplied
// uncompressedLength, which is debug-checked to be present and within
// maxUncompressedLength(). The output is allocated at exactly that size, and
// a negative or mismatching result from LZ4_decompress_safe() raises
// std::runtime_error.
// NOTE(review): the condition choosing between the two length sources
// (presumably encodeSize()), two of the LZ4_decompress_safe() arguments, and
// the return are not visible in this view.
402 std::unique_ptr<IOBuf> LZ4Codec::doUncompress(
404 Optional<uint64_t> uncompressedLength) {
406 if (data->isChained()) {
407 // LZ4 doesn't support streaming, so we have to coalesce
408 clone = data->cloneCoalescedAsValue();
412 folly::io::Cursor cursor(data);
413 uint64_t actualUncompressedLength;
415 actualUncompressedLength = decodeVarintFromCursor(cursor);
416 if (uncompressedLength && *uncompressedLength != actualUncompressedLength) {
417 throw std::runtime_error("LZ4Codec: invalid uncompressed length");
421 DCHECK(uncompressedLength.hasValue());
422 DCHECK(*uncompressedLength <= maxUncompressedLength());
423 actualUncompressedLength = *uncompressedLength;
// Whatever the cursor has not consumed is the compressed payload.
426 auto sp = StringPiece{cursor.peekBytes()};
427 auto out = IOBuf::create(actualUncompressedLength);
428 int n = LZ4_decompress_safe(
430 reinterpret_cast<char*>(out->writableTail()),
432 actualUncompressedLength);
434 if (n < 0 || uint64_t(n) != actualUncompressedLength) {
435 throw std::runtime_error(to<std::string>(
436 "LZ4 decompression returned invalid value ", n));
438 out->append(actualUncompressedLength);
442 #if LZ4_VERSION_NUMBER >= 10301
// Codec for the LZ4 frame format (CodecType::LZ4_FRAME). Owns an LZ4F
// decompression context (dctx_), which starts out null.
// NOTE(review): access specifiers, the destructor and resetDCtx()
// declarations, and the level_/dirty_ members used by the out-of-class
// definitions are not visible in this view.
444 class LZ4FrameCodec final : public Codec {
446 static std::unique_ptr<Codec> create(int level, CodecType type);
447 explicit LZ4FrameCodec(int level, CodecType type);
450 std::vector<std::string> validPrefixes() const override;
451 bool canUncompress(const IOBuf* data, Optional<uint64_t> uncompressedLength)
455 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
456 std::unique_ptr<IOBuf> doUncompress(
458 Optional<uint64_t> uncompressedLength) override;
460 // Reset the dctx_ if it is dirty or null.
464 LZ4F_decompressionContext_t dctx_{nullptr};
// Factory: builds an LZ4FrameCodec for the given level and type.
468 /* static */ std::unique_ptr<Codec> LZ4FrameCodec::create(
471 return std::make_unique<LZ4FrameCodec>(level, type);
// Magic number used to recognise an LZ4 frame, as a little-endian 32-bit
// value (see validPrefixes() / canUncompress()).
474 static constexpr uint32_t kLZ4FrameMagicLE = 0x184D2204;
// The only valid prefix is the frame magic, serialized little-endian.
476 std::vector<std::string> LZ4FrameCodec::validPrefixes() const {
477 return {prefixToStringLE(kLZ4FrameMagicLE)};
// Data is accepted when it begins with the frame magic; the length hint is
// ignored.
480 bool LZ4FrameCodec::canUncompress(const IOBuf* data, Optional<uint64_t>) const {
481 return dataStartsWithLE(data, kLZ4FrameMagicLE);
// Converts an LZ4F error code into a std::runtime_error carrying the
// library's error name.
// NOTE(review): the non-error return is not visible in this view —
// presumably `code` is passed through unchanged; callers use the result as a
// size.
484 static size_t lz4FrameThrowOnError(size_t code) {
485 if (LZ4F_isError(code)) {
486 throw std::runtime_error(
487 to<std::string>("LZ4Frame error: ", LZ4F_getErrorName(code)));
// Ensures dctx_ refers to a usable decompression context. The existing one is
// tested for "present and not dirty"; otherwise the context is freed and a
// new one created (throwing std::runtime_error on failure).
// NOTE(review): the body of the first `if` (presumably an early return), any
// guard around the free call, and any reset of dirty_ are not visible in this
// view.
492 void LZ4FrameCodec::resetDCtx() {
493 if (dctx_ && !dirty_) {
497 LZ4F_freeDecompressionContext(dctx_);
499 lz4FrameThrowOnError(LZ4F_createDecompressionContext(&dctx_, 100));
// Constructor. Debug-asserts that `type` is LZ4_FRAME and handles the
// symbolic compression levels (FASTEST/DEFAULT/BEST).
// NOTE(review): the switch header, the value chosen for each case, and the
// default handling are not visible in this view.
503 LZ4FrameCodec::LZ4FrameCodec(int level, CodecType type) : Codec(type) {
504 DCHECK(type == CodecType::LZ4_FRAME);
506 case COMPRESSION_LEVEL_FASTEST:
507 case COMPRESSION_LEVEL_DEFAULT:
510 case COMPRESSION_LEVEL_BEST:
// Destructor: releases the decompression context.
// NOTE(review): a guard line appears to be missing before the free call in
// this view — presumably a null check on dctx_; confirm.
519 LZ4FrameCodec::~LZ4FrameCodec() {
521 LZ4F_freeDecompressionContext(dctx_);
// Compresses `data` into a single LZ4 frame.
// A chained input is coalesced first. The frame preferences carry the codec's
// compression level and record the content size. The output buffer is sized
// with LZ4F_compressFrameBound(), and LZ4F errors are rethrown as
// std::runtime_error via lz4FrameThrowOnError().
// NOTE(review): the `clone` declaration, the LZ4F_compressFrame() arguments,
// and the return are not visible in this view.
525 std::unique_ptr<IOBuf> LZ4FrameCodec::doCompress(const IOBuf* data) {
526 // LZ4 Frame compression doesn't support streaming so we have to coalesce
528 if (data->isChained()) {
529 clone = data->cloneCoalescedAsValue();
533 const auto uncompressedLength = data->length();
534 LZ4F_preferences_t prefs{};
535 prefs.compressionLevel = level_;
536 prefs.frameInfo.contentSize = uncompressedLength;
538 auto buf = IOBuf::create(LZ4F_compressFrameBound(uncompressedLength, &prefs));
539 const size_t written = lz4FrameThrowOnError(LZ4F_compressFrame(
545 buf->append(written);
// Decompresses one LZ4 frame into an IOBufQueue.
// Input: the first buffer of `data`, or a coalesced copy when `data` is
// chained. Output space comes from the queue in blockSize/growthSize steps;
// both are clamped to the expected length when the caller supplies one, and
// growthSize is further reduced for small inputs. LZ4F_decompress() is called
// repeatedly, advancing the input and committing the output each time.
// Throws std::runtime_error on an LZ4F error, on an incomplete frame, or when
// the produced length differs from a caller-supplied uncompressedLength.
// NOTE(review): the resetDCtx() call, the dirty_ bookkeeping, the loop header,
// the `clone`/`out`/`outSize`/`code` declarations, and the return are not
// visible in this view.
549 std::unique_ptr<IOBuf> LZ4FrameCodec::doUncompress(
551 Optional<uint64_t> uncompressedLength) {
552 // Reset the dctx if any errors have occurred
555 ByteRange in = *data->begin();
557 if (data->isChained()) {
558 clone = data->cloneCoalescedAsValue();
559 in = clone.coalesce();
562 // Select decompression options
// NOTE(review): `options` is declared without an initializer and only
// stableDst is assigned on the visible lines, so any other fields are
// presumably left indeterminate — confirm, and consider value-initializing.
563 LZ4F_decompressOptions_t options;
564 options.stableDst = 1;
565 // Select blockSize and growthSize for the IOBufQueue
566 IOBufQueue queue(IOBufQueue::cacheChainLength());
567 auto blockSize = uint64_t{64} << 10;
568 auto growthSize = uint64_t{4} << 20;
569 if (uncompressedLength) {
570 // Allocate uncompressedLength in one chunk (up to 64 MiB)
571 const auto allocateSize = std::min(*uncompressedLength, uint64_t{64} << 20);
572 queue.preallocate(allocateSize, allocateSize);
573 blockSize = std::min(*uncompressedLength, blockSize);
574 growthSize = std::min(*uncompressedLength, growthSize);
576 // Reduce growthSize for small data
577 const auto guessUncompressedLen =
578 4 * std::max<uint64_t>(blockSize, in.size());
579 growthSize = std::min(guessUncompressedLen, growthSize);
581 // Once LZ4F_decompress() is called, the dctx_ cannot be reused until it
584 // Decompress until the frame is over
587 // Allocate enough space to decompress at least a block
590 std::tie(out, outSize) = queue.preallocate(blockSize, growthSize);
592 size_t inSize = in.size();
593 code = lz4FrameThrowOnError(
594 LZ4F_decompress(dctx_, out, &outSize, in.data(), &inSize, &options));
595 if (in.empty() && outSize == 0 && code != 0) {
596 // We passed no input, no output was produced, and the frame isn't over
597 // No more forward progress is possible
598 throw std::runtime_error("LZ4Frame error: Incomplete frame");
// inSize and outSize now hold the bytes actually consumed and produced.
600 in.uncheckedAdvance(inSize);
601 queue.postallocate(outSize);
603 // At this point the decompression context can be reused
605 if (uncompressedLength && queue.chainLength() != *uncompressedLength) {
606 throw std::runtime_error("LZ4Frame error: Invalid uncompressedLength");
611 #endif // LZ4_VERSION_NUMBER >= 10301
612 #endif // FOLLY_HAVE_LIBLZ4
614 #if FOLLY_HAVE_LIBSNAPPY
621 * Implementation of snappy::Source that reads from a IOBuf chain.
// Adapter implementing the snappy::Source interface (Available / Peek / Skip)
// over an IOBuf.
// NOTE(review): access specifiers and the data members (available_, cursor_)
// used by the out-of-class definitions are not visible in this view.
623 class IOBufSnappySource final : public snappy::Source {
625 explicit IOBufSnappySource(const IOBuf* data);
626 size_t Available() const override;
627 const char* Peek(size_t* len) override;
628 void Skip(size_t n) override;
// Constructor: seeds available_ with the total data length of the chain.
// NOTE(review): the rest of the initializer list (presumably cursor_) is not
// visible in this view.
634 IOBufSnappySource::IOBufSnappySource(const IOBuf* data)
635 : available_(data->computeChainDataLength()),
// snappy::Source hook reporting how many bytes are still unread.
// NOTE(review): the body is not visible in this view — presumably returns
// available_.
639 size_t IOBufSnappySource::Available() const {
// snappy::Source hook exposing the bytes at the cursor's current position
// without consuming them.
// NOTE(review): the assignment to *len and the return are not visible in this
// view.
643 const char* IOBufSnappySource::Peek(size_t* len) {
644 auto sp = StringPiece{cursor_.peekBytes()};
// snappy::Source hook consuming `n` bytes. CHECK-fails if asked to skip more
// than the number of bytes still available.
// NOTE(review): the cursor advance and the available_ update are not visible
// in this view.
649 void IOBufSnappySource::Skip(size_t n) {
650 CHECK_LE(n, available_);
// Codec for Snappy (CodecType::SNAPPY). Overrides the maximum-length hook and
// the compress/uncompress hooks.
// NOTE(review): access specifiers and some parameter lines are missing from
// this view.
655 class SnappyCodec final : public Codec {
657 static std::unique_ptr<Codec> create(int level, CodecType type);
658 explicit SnappyCodec(int level, CodecType type);
661 uint64_t doMaxUncompressedLength() const override;
662 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
663 std::unique_ptr<IOBuf> doUncompress(
665 Optional<uint64_t> uncompressedLength) override;
// Factory: builds a SnappyCodec for the given level and type.
668 std::unique_ptr<Codec> SnappyCodec::create(int level, CodecType type) {
669 return std::make_unique<SnappyCodec>(level, type);
// Constructor. Debug-asserts that `type` is SNAPPY and validates `level`: the
// three symbolic levels are listed as cases, and an unaccepted level raises
// std::invalid_argument.
// NOTE(review): the switch header and any other accepted case labels are not
// visible in this view.
672 SnappyCodec::SnappyCodec(int level, CodecType type) : Codec(type) {
673 DCHECK(type == CodecType::SNAPPY);
675 case COMPRESSION_LEVEL_FASTEST:
676 case COMPRESSION_LEVEL_DEFAULT:
677 case COMPRESSION_LEVEL_BEST:
681 throw std::invalid_argument(to<std::string>(
682 "SnappyCodec: invalid level: ", level));
// Largest input this codec accepts: the maximum value of uint32_t.
686 uint64_t SnappyCodec::doMaxUncompressedLength() const {
687 // snappy.h uses uint32_t for lengths, so there's that.
688 return std::numeric_limits<uint32_t>::max();
// Compresses `data` with snappy. The input is read through IOBufSnappySource,
// and the output buffer is sized with snappy::MaxCompressedLength() and
// written through an unchecked byte-array sink. The reported size is
// CHECK-ed against the buffer capacity afterwards.
// NOTE(review): the `out` declaration, the final append, and the return are
// not visible in this view.
691 std::unique_ptr<IOBuf> SnappyCodec::doCompress(const IOBuf* data) {
692 IOBufSnappySource source(data);
694 IOBuf::create(snappy::MaxCompressedLength(source.Available()));
696 snappy::UncheckedByteArraySink sink(reinterpret_cast<char*>(
697 out->writableTail()));
699 size_t n = snappy::Compress(&source, &sink);
701 CHECK_LE(n, out->capacity());
// Decompresses snappy data in two passes over the input, each with a fresh
// IOBufSnappySource:
//  1. read the uncompressed length stored in the stream and check it against
//     any caller-supplied length (std::runtime_error on failure or mismatch);
//  2. allocate exactly that many bytes and run snappy::RawUncompress() into
//     them (std::runtime_error on failure).
// NOTE(review): the scoping braces around each pass and the return are not
// visible in this view.
706 std::unique_ptr<IOBuf> SnappyCodec::doUncompress(
708 Optional<uint64_t> uncompressedLength) {
709 uint32_t actualUncompressedLength = 0;
712 IOBufSnappySource source(data);
713 if (!snappy::GetUncompressedLength(&source, &actualUncompressedLength)) {
714 throw std::runtime_error("snappy::GetUncompressedLength failed");
716 if (uncompressedLength && *uncompressedLength != actualUncompressedLength) {
717 throw std::runtime_error("snappy: invalid uncompressed length");
721 auto out = IOBuf::create(actualUncompressedLength);
724 IOBufSnappySource source(data);
725 if (!snappy::RawUncompress(&source,
726 reinterpret_cast<char*>(out->writableTail()))) {
727 throw std::runtime_error("snappy::RawUncompress failed");
731 out->append(actualUncompressedLength);
735 #endif // FOLLY_HAVE_LIBSNAPPY
// Codec backed by zlib. Handles two CodecTypes: ZLIB and GZIP.
// NOTE(review): access specifiers and data members (e.g. the stored level)
// are not visible in this view.
741 class ZlibCodec final : public Codec {
743 static std::unique_ptr<Codec> create(int level, CodecType type);
744 explicit ZlibCodec(int level, CodecType type);
746 std::vector<std::string> validPrefixes() const override;
747 bool canUncompress(const IOBuf* data, Optional<uint64_t> uncompressedLength)
751 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
752 std::unique_ptr<IOBuf> doUncompress(
754 Optional<uint64_t> uncompressedLength) override;
// Helpers managing the output chain while the z_stream runs.
756 std::unique_ptr<IOBuf> addOutputBuffer(z_stream* stream, uint32_t length);
757 bool doInflate(z_stream* stream, IOBuf* head, uint32_t bufferLength);
// The two gzip magic bytes, as a little-endian 16-bit value.
762 static constexpr uint16_t kGZIPMagicLE = 0x8B1F;
// Lists every 2-byte prefix that can start a stream of this codec's type.
// For ZLIB, the valid header values are enumerated: each CMF byte whose low
// nibble is 0x8 is combined with each value of the upper three FLG bits, and
// the FCHECK bits are then filled in so the big-endian 16-bit header is a
// multiple of 31. For GZIP the single prefix is the gzip magic.
// NOTE(review): several lines (the header layout comment, a reserve call,
// the adjustment inside the FCHECK == 31 branch, and the ZLIB-branch return)
// are not visible in this view.
764 std::vector<std::string> ZlibCodec::validPrefixes() const {
765 if (type() == CodecType::ZLIB) {
766 // Zlib streams start with a 2 byte header.
773 // We won't restrict the values of any sub-fields except as described below.
775 // The lowest 4 bits of CMF is the compression method (CM).
776 // CM == 0x8 is the deflate compression method, which is currently the only
777 // supported compression method, so any valid prefix must have CM == 0x8.
779 // The lowest 5 bits of FLG is FCHECK.
780 // FCHECK must be such that the two header bytes are a multiple of 31 when
781 // interpreted as a big endian 16-bit number.
782 std::vector<std::string> result;
783 // 16 values for the first byte, 8 values for the second byte.
784 // There are also 4 combinations where both 0x00 and 0x1F work as FCHECK.
786 // Select all values for the CMF byte that use the deflate algorithm 0x8.
787 for (uint32_t first = 0x0800; first <= 0xF800; first += 0x1000) {
788 // Select all values for the FLG, but leave FCHECK as 0 since it's fixed.
789 for (uint32_t second = 0x00; second <= 0xE0; second += 0x20) {
790 uint16_t prefix = first | second;
792 prefix += 31 - (prefix % 31);
793 result.push_back(prefixToStringLE(Endian::big(prefix)));
794 // zlib won't produce this, but it is a valid prefix.
795 if ((prefix & 0x1F) == 31) {
797 result.push_back(prefixToStringLE(Endian::big(prefix)));
803 // The gzip frame starts with 2 magic bytes.
804 return {prefixToStringLE(kGZIPMagicLE)};
// Format sniffing. For ZLIB, a big-endian 16-bit header is read and accepted
// when its compression-method nibble is 0x8 (deflate) and the value is a
// multiple of 31. For GZIP, the data must start with the gzip magic. The
// length hint is ignored.
// NOTE(review): the cursor/value declarations and the statement executed when
// tryReadBE() fails are not visible in this view.
808 bool ZlibCodec::canUncompress(const IOBuf* data, Optional<uint64_t>) const {
809 if (type() == CodecType::ZLIB) {
812 if (!cursor.tryReadBE(value)) {
815 // zlib compressed if using deflate and is a multiple of 31.
816 return (value & 0x0F00) == 0x0800 && value % 31 == 0;
818 return dataStartsWithLE(data, kGZIPMagicLE);
// Factory: builds a ZlibCodec for the given level and type.
822 std::unique_ptr<Codec> ZlibCodec::create(int level, CodecType type) {
823 return std::make_unique<ZlibCodec>(level, type);
// Constructor. Debug-asserts the codec type, maps the symbolic levels onto
// zlib levels (DEFAULT becomes Z_DEFAULT_COMPRESSION), and then requires the
// level to be either Z_DEFAULT_COMPRESSION or within 0..9, throwing
// std::invalid_argument otherwise.
// NOTE(review): the switch header, the values for FASTEST/BEST, and the store
// of the final level are not visible in this view.
826 ZlibCodec::ZlibCodec(int level, CodecType type) : Codec(type) {
827 DCHECK(type == CodecType::ZLIB || type == CodecType::GZIP);
829 case COMPRESSION_LEVEL_FASTEST:
832 case COMPRESSION_LEVEL_DEFAULT:
833 level = Z_DEFAULT_COMPRESSION;
835 case COMPRESSION_LEVEL_BEST:
839 if (level != Z_DEFAULT_COMPRESSION && (level < 0 || level > 9)) {
840 throw std::invalid_argument(to<std::string>(
841 "ZlibCodec: invalid level: ", level));
// Allocates a fresh output buffer and points the z_stream's output at it.
// Precondition (CHECK-ed): the stream has no output space left. The buffer is
// marked as fully used (length == capacity), so the caller trims the unused
// tail once the stream has finished.
// NOTE(review): the second parameter line and the return are not visible in
// this view.
846 std::unique_ptr<IOBuf> ZlibCodec::addOutputBuffer(z_stream* stream,
848 CHECK_EQ(stream->avail_out, 0);
850 auto buf = IOBuf::create(length);
851 buf->append(buf->capacity());
853 stream->next_out = buf->writableData();
854 stream->avail_out = buf->length();
// Runs one inflate() step, first chaining a new output buffer onto `head`
// when the stream has no output space left. Inflate errors are reported as
// std::runtime_error, and an unexpected return code is treated as fatal via
// CHECK(false).
// NOTE(review): the switch over `rc` and the boolean returned for each
// outcome are not visible in this view — callers treat the result as
// "stream ended".
859 bool ZlibCodec::doInflate(z_stream* stream,
861 uint32_t bufferLength) {
862 if (stream->avail_out == 0) {
863 head->prependChain(addOutputBuffer(stream, bufferLength));
866 int rc = inflate(stream, Z_NO_FLUSH);
877 throw std::runtime_error(to<std::string>(
878 "ZlibCodec: inflate error: ", rc, ": ", stream->msg));
880 CHECK(false) << rc << ": " << stream->msg;
// Compresses an IOBuf chain with zlib's deflate, producing either a zlib or a
// gzip wrapper depending on the codec type.
// Steps:
//  1. initialise the z_stream with deflateInit2() (windowBits 15, +16 for
//     gzip), throwing std::runtime_error on failure;
//  2. size the first output buffer from deflateBound(), capped at 64 MiB
//     (falling back to 4 MiB buffers beyond that);
//  3. feed each input range to deflate() in steps of at most 64 MiB, chaining
//     additional 4 MiB output buffers whenever the current one fills;
//  4. finish the stream with Z_FINISH and trim the unused tail of the last
//     output buffer.
// NOTE(review): the z_stream declaration, the remaining deflateInit2()
// arguments, the cleanup-scope header around deflateEnd(), the inner step
// loop header/bookkeeping, and the return are not visible in this view.
886 std::unique_ptr<IOBuf> ZlibCodec::doCompress(const IOBuf* data) {
888 stream.zalloc = nullptr;
889 stream.zfree = nullptr;
890 stream.opaque = nullptr;
892 // Using deflateInit2() to support gzip. "The windowBits parameter is the
893 // base two logarithm of the maximum window size (...) The default value is
894 // 15 (...) Add 16 to windowBits to write a simple gzip header and trailer
895 // around the compressed data instead of a zlib wrapper. The gzip header
896 // will have no file name, no extra data, no comment, no modification time
897 // (set to zero), no header crc, and the operating system will be set to 255
899 int windowBits = 15 + (type() == CodecType::GZIP ? 16 : 0);
900 // All other parameters (method, memLevel, strategy) get default values from
902 int rc = deflateInit2(&stream,
909 throw std::runtime_error(to<std::string>(
910 "ZlibCodec: deflateInit error: ", rc, ": ", stream.msg));
913 stream.next_in = stream.next_out = nullptr;
914 stream.avail_in = stream.avail_out = 0;
915 stream.total_in = stream.total_out = 0;
// `success` lets the cleanup below tolerate Z_DATA_ERROR from deflateEnd()
// when the function is being left early.
917 bool success = false;
920 rc = deflateEnd(&stream);
921 // If we're here because of an exception, it's okay if some data
923 CHECK(rc == Z_OK || (!success && rc == Z_DATA_ERROR))
924 << rc << ": " << stream.msg;
927 uint64_t uncompressedLength = data->computeChainDataLength();
928 uint64_t maxCompressedLength = deflateBound(&stream, uncompressedLength);
930 // Max 64MiB in one go
931 constexpr uint32_t maxSingleStepLength = uint32_t(64) << 20; // 64MiB
932 constexpr uint32_t defaultBufferLength = uint32_t(4) << 20; // 4MiB
934 auto out = addOutputBuffer(
936 (maxCompressedLength <= maxSingleStepLength ?
937 maxCompressedLength :
938 defaultBufferLength));
940 for (auto& range : *data) {
941 uint64_t remaining = range.size();
942 uint64_t written = 0;
// avail_in is fed at most maxSingleStepLength bytes at a time.
944 uint32_t step = (remaining > maxSingleStepLength ?
945 maxSingleStepLength : remaining);
946 stream.next_in = const_cast<uint8_t*>(range.data() + written);
947 stream.avail_in = step;
951 while (stream.avail_in != 0) {
952 if (stream.avail_out == 0) {
953 out->prependChain(addOutputBuffer(&stream, defaultBufferLength));
956 rc = deflate(&stream, Z_NO_FLUSH);
958 CHECK_EQ(rc, Z_OK) << stream.msg;
// Flush: keep calling deflate(Z_FINISH), adding output space as needed, until
// it stops returning Z_OK.
964 if (stream.avail_out == 0) {
965 out->prependChain(addOutputBuffer(&stream, defaultBufferLength));
968 rc = deflate(&stream, Z_FINISH);
969 } while (rc == Z_OK);
971 CHECK_EQ(rc, Z_STREAM_END) << stream.msg;
// The last buffer in the chain was appended at full capacity; drop the part
// deflate never wrote.
973 out->prev()->trimEnd(stream.avail_out);
975 success = true; // we survived
// Decompresses a zlib or gzip stream from an IOBuf chain.
// Steps:
//  1. initialise the z_stream with inflateInit2() (windowBits 15, +16 for
//     gzip), throwing std::runtime_error on failure;
//  2. size the first output buffer from the caller-supplied length when it is
//     at most 64 MiB, otherwise from computeBufferLength();
//  3. feed each non-empty input range to doInflate(), rejecting input that
//     remains after the stream has ended ("junk after end of data");
//  4. keep inflating until the stream ends, trim the unused tail of the last
//     output buffer, and verify the total output against any caller-supplied
//     length (std::runtime_error on mismatch).
// NOTE(review): the z_stream declaration, the cleanup-scope header around
// inflateEnd(), the `streamEnd` test preceding the junk error, and the return
// are not visible in this view.
980 std::unique_ptr<IOBuf> ZlibCodec::doUncompress(
982 Optional<uint64_t> uncompressedLength) {
984 stream.zalloc = nullptr;
985 stream.zfree = nullptr;
986 stream.opaque = nullptr;
988 // "The windowBits parameter is the base two logarithm of the maximum window
989 // size (...) The default value is 15 (...) add 16 to decode only the gzip
990 // format (the zlib format will return a Z_DATA_ERROR)."
991 int windowBits = 15 + (type() == CodecType::GZIP ? 16 : 0);
992 int rc = inflateInit2(&stream, windowBits);
994 throw std::runtime_error(to<std::string>(
995 "ZlibCodec: inflateInit error: ", rc, ": ", stream.msg));
998 stream.next_in = stream.next_out = nullptr;
999 stream.avail_in = stream.avail_out = 0;
1000 stream.total_in = stream.total_out = 0;
// `success` lets the cleanup below tolerate Z_DATA_ERROR from inflateEnd()
// when the function is being left early.
1002 bool success = false;
1005 rc = inflateEnd(&stream);
1006 // If we're here because of an exception, it's okay if some data
1008 CHECK(rc == Z_OK || (!success && rc == Z_DATA_ERROR))
1009 << rc << ": " << stream.msg;
1012 // Max 64MiB in one go
1013 constexpr uint64_t maxSingleStepLength = uint64_t(64) << 20; // 64MiB
1014 constexpr uint64_t kBlockSize = uint64_t(32) << 10; // 32 KiB
1015 const uint64_t defaultBufferLength =
1016 computeBufferLength(data->computeChainDataLength(), kBlockSize);
1018 auto out = addOutputBuffer(
1020 ((uncompressedLength && *uncompressedLength <= maxSingleStepLength)
1021 ? *uncompressedLength
1022 : defaultBufferLength));
1024 bool streamEnd = false;
1025 for (auto& range : *data) {
1026 if (range.empty()) {
1030 stream.next_in = const_cast<uint8_t*>(range.data());
1031 stream.avail_in = range.size();
1033 while (stream.avail_in != 0) {
1035 throw std::runtime_error(to<std::string>(
1036 "ZlibCodec: junk after end of data"));
1039 streamEnd = doInflate(&stream, out.get(), defaultBufferLength);
// All input has been handed over; keep draining until the stream reports end.
1043 while (!streamEnd) {
1044 streamEnd = doInflate(&stream, out.get(), defaultBufferLength);
1047 out->prev()->trimEnd(stream.avail_out);
1049 if (uncompressedLength && *uncompressedLength != stream.total_out) {
1050 throw std::runtime_error(
1051 to<std::string>("ZlibCodec: invalid uncompressed length"));
1054 success = true; // we survived
1059 #endif // FOLLY_HAVE_LIBZ
1061 #if FOLLY_HAVE_LIBLZMA
// Codec backed by liblzma. Handles two CodecTypes: LZMA2 and
// LZMA2_VARINT_SIZE (see encodeSize()).
// NOTE(review): access specifiers and data members (e.g. level_) are not
// visible in this view.
1066 class LZMA2Codec final : public Codec {
1068 static std::unique_ptr<Codec> create(int level, CodecType type);
1069 explicit LZMA2Codec(int level, CodecType type);
1071 std::vector<std::string> validPrefixes() const override;
1072 bool canUncompress(const IOBuf* data, Optional<uint64_t> uncompressedLength)
1076 bool doNeedsUncompressedLength() const override;
1077 uint64_t doMaxUncompressedLength() const override;
// True when this instance is the LZMA2_VARINT_SIZE variant.
1079 bool encodeSize() const { return type() == CodecType::LZMA2_VARINT_SIZE; }
1081 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
1082 std::unique_ptr<IOBuf> doUncompress(
1084 Optional<uint64_t> uncompressedLength) override;
// Helpers managing the output chain while the lzma_stream runs.
1086 std::unique_ptr<IOBuf> addOutputBuffer(lzma_stream* stream, size_t length);
1087 bool doInflate(lzma_stream* stream, IOBuf* head, size_t bufferLength);
// Magic bytes used to recognise an LZMA2 stream, held as a little-endian
// 64-bit value of which only the low kLZMA2MagicBytes (6) bytes are compared.
1092 static constexpr uint64_t kLZMA2MagicLE = 0x005A587A37FD;
1093 static constexpr unsigned kLZMA2MagicBytes = 6;
// Valid stream prefixes for this codec: the 6-byte magic, except that the
// LZMA2_VARINT_SIZE variant takes a separate branch.
// NOTE(review): what the LZMA2_VARINT_SIZE branch returns is not visible in
// this view.
1095 std::vector<std::string> LZMA2Codec::validPrefixes() const {
1096 if (type() == CodecType::LZMA2_VARINT_SIZE) {
1099 return {prefixToStringLE(kLZMA2MagicLE, kLZMA2MagicBytes)};
// Format sniffing: data is accepted when its first 6 bytes match the LZMA2
// magic. The LZMA2_VARINT_SIZE variant takes a separate branch, and the
// length hint is ignored.
// NOTE(review): what the LZMA2_VARINT_SIZE branch returns is not visible in
// this view.
1102 bool LZMA2Codec::canUncompress(const IOBuf* data, Optional<uint64_t>) const {
1103 if (type() == CodecType::LZMA2_VARINT_SIZE) {
1106 // Returns false for all inputs less than 8 bytes.
1107 // This is okay, because no valid LZMA2 streams are less than 8 bytes.
1108 return dataStartsWithLE(data, kLZMA2MagicLE, kLZMA2MagicBytes);
// Factory: builds an LZMA2Codec for the given level and type.
1111 std::unique_ptr<Codec> LZMA2Codec::create(int level, CodecType type) {
1112 return std::make_unique<LZMA2Codec>(level, type);
// Constructor. Debug-asserts the codec type, maps the symbolic levels onto
// liblzma presets (DEFAULT becomes LZMA_PRESET_DEFAULT), and requires the
// resulting level to be within 0..9, throwing std::invalid_argument
// otherwise.
// NOTE(review): the switch header, the values for FASTEST/BEST, and the store
// of the final level are not visible in this view.
1115 LZMA2Codec::LZMA2Codec(int level, CodecType type) : Codec(type) {
1116 DCHECK(type == CodecType::LZMA2 || type == CodecType::LZMA2_VARINT_SIZE);
1118 case COMPRESSION_LEVEL_FASTEST:
1121 case COMPRESSION_LEVEL_DEFAULT:
1122 level = LZMA_PRESET_DEFAULT;
1124 case COMPRESSION_LEVEL_BEST:
1128 if (level < 0 || level > 9) {
1129 throw std::invalid_argument(to<std::string>(
1130 "LZMA2Codec: invalid level: ", level));
// Override of the "is the uncompressed length required?" hook.
// NOTE(review): the returned value is not visible in this view.
1135 bool LZMA2Codec::doNeedsUncompressedLength() const {
// Largest input this codec accepts: 2^63 bytes.
1139 uint64_t LZMA2Codec::doMaxUncompressedLength() const {
1140 // From lzma/base.h: "Stream is roughly 8 EiB (2^63 bytes)"
1141 return uint64_t(1) << 63;
// Allocates a fresh output buffer and points the lzma_stream's output at it.
// Precondition (CHECK-ed): the stream has no output space left. The buffer is
// marked as fully used (length == capacity), so the caller trims the unused
// tail once the stream has finished.
// NOTE(review): the `length` parameter line and the return are not visible in
// this view.
1144 std::unique_ptr<IOBuf> LZMA2Codec::addOutputBuffer(
1145 lzma_stream* stream,
1148 CHECK_EQ(stream->avail_out, 0);
1150 auto buf = IOBuf::create(length);
1151 buf->append(buf->capacity());
1153 stream->next_out = buf->writableData();
1154 stream->avail_out = buf->length();
// Compresses an IOBuf chain with liblzma's easy encoder (no integrity check).
// Steps:
//  1. initialise the encoder at level_, throwing std::runtime_error on
//     failure, and arrange for lzma_end() on scope exit;
//  2. size the first output buffer from lzma_stream_buffer_bound(), capped at
//     64 MiB (falling back to 4 MiB buffers beyond that);
//  3. on one path, prefix the output chain with a small buffer holding the
//     varint-encoded uncompressed length;
//  4. feed each non-empty input range to lzma_code(LZMA_RUN), chaining more
//     4 MiB output buffers as needed, then finish with LZMA_FINISH and trim
//     the unused tail of the last output buffer.
// Any lzma_code() failure raises std::runtime_error.
// NOTE(review): the `rc` declaration, the condition guarding step 3
// (presumably encodeSize()), the do-loop header, and the return are not
// visible in this view.
1159 std::unique_ptr<IOBuf> LZMA2Codec::doCompress(const IOBuf* data) {
1161 lzma_stream stream = LZMA_STREAM_INIT;
1163 rc = lzma_easy_encoder(&stream, level_, LZMA_CHECK_NONE);
1164 if (rc != LZMA_OK) {
1165 throw std::runtime_error(folly::to<std::string>(
1166 "LZMA2Codec: lzma_easy_encoder error: ", rc));
1169 SCOPE_EXIT { lzma_end(&stream); };
1171 uint64_t uncompressedLength = data->computeChainDataLength();
1172 uint64_t maxCompressedLength = lzma_stream_buffer_bound(uncompressedLength);
1174 // Max 64MiB in one go
1175 constexpr uint32_t maxSingleStepLength = uint32_t(64) << 20; // 64MiB
1176 constexpr uint32_t defaultBufferLength = uint32_t(4) << 20; // 4MiB
1178 auto out = addOutputBuffer(
1180 (maxCompressedLength <= maxSingleStepLength ?
1181 maxCompressedLength :
1182 defaultBufferLength));
// Put the varint-encoded length in its own small buffer at the head of the
// chain, followed by the compressed output.
1185 auto size = IOBuf::createCombined(kMaxVarintLength64);
1186 encodeVarintToIOBuf(uncompressedLength, size.get());
1187 size->appendChain(std::move(out));
1188 out = std::move(size);
1191 for (auto& range : *data) {
1192 if (range.empty()) {
1196 stream.next_in = const_cast<uint8_t*>(range.data());
1197 stream.avail_in = range.size();
1199 while (stream.avail_in != 0) {
1200 if (stream.avail_out == 0) {
1201 out->prependChain(addOutputBuffer(&stream, defaultBufferLength));
1204 rc = lzma_code(&stream, LZMA_RUN);
1206 if (rc != LZMA_OK) {
1207 throw std::runtime_error(folly::to<std::string>(
1208 "LZMA2Codec: lzma_code error: ", rc));
// Flush: keep calling lzma_code(LZMA_FINISH), adding output space as needed,
// until it stops returning LZMA_OK.
1214 if (stream.avail_out == 0) {
1215 out->prependChain(addOutputBuffer(&stream, defaultBufferLength));
1218 rc = lzma_code(&stream, LZMA_FINISH);
1219 } while (rc == LZMA_OK);
1221 if (rc != LZMA_STREAM_END) {
1222 throw std::runtime_error(folly::to<std::string>(
1223 "LZMA2Codec: lzma_code ended with error: ", rc));
1226 out->prev()->trimEnd(stream.avail_out);
// Runs one lzma_code(LZMA_RUN) step, first chaining a new output buffer onto
// `head` when the stream has no output space left. Unhandled return codes are
// reported as std::runtime_error.
// NOTE(review): the switch header, the other case labels, and the boolean
// returned for each outcome are not visible in this view — callers treat the
// result as "stream ended".
1231 bool LZMA2Codec::doInflate(lzma_stream* stream,
1233 size_t bufferLength) {
1234 if (stream->avail_out == 0) {
1235 head->prependChain(addOutputBuffer(stream, bufferLength));
1238 lzma_ret rc = lzma_code(stream, LZMA_RUN);
1243 case LZMA_STREAM_END:
1246 throw std::runtime_error(to<std::string>(
1247 "LZMA2Codec: lzma_code error: ", rc));
// Decompresses an LZMA stream from an IOBuf chain using lzma_auto_decoder()
// with the memory limit set to the maximum uint64_t value.
// Steps:
//  1. initialise the decoder, throwing std::runtime_error on failure, and
//     arrange for lzma_end() on scope exit;
//  2. on one path, decode a varint length from the front of the input; it must
//     agree with any caller-supplied length and then becomes the expected
//     length;
//  3. size the first output buffer from the expected length when it is at
//     most 64 MiB, otherwise use 256 KiB buffers;
//  4. walk the remaining input one contiguous span at a time through
//     doInflate(), rejecting input that remains after the stream has ended;
//  5. drain until the stream ends, trim the unused tail of the last output
//     buffer, and verify the total output against the expected length.
// NOTE(review): the `rc` declaration, the condition guarding step 2
// (presumably encodeSize()), the `streamEnd` test preceding the junk error,
// and the return are not visible in this view.
1253 std::unique_ptr<IOBuf> LZMA2Codec::doUncompress(
1255 Optional<uint64_t> uncompressedLength) {
1257 lzma_stream stream = LZMA_STREAM_INIT;
1259 rc = lzma_auto_decoder(&stream, std::numeric_limits<uint64_t>::max(), 0);
1260 if (rc != LZMA_OK) {
1261 throw std::runtime_error(folly::to<std::string>(
1262 "LZMA2Codec: lzma_auto_decoder error: ", rc));
1265 SCOPE_EXIT { lzma_end(&stream); };
1267 // Max 64MiB in one go
1268 constexpr uint32_t maxSingleStepLength = uint32_t(64) << 20; // 64MiB
1269 constexpr uint32_t defaultBufferLength = uint32_t(256) << 10; // 256 KiB
1271 folly::io::Cursor cursor(data);
1273 const uint64_t actualUncompressedLength = decodeVarintFromCursor(cursor);
1274 if (uncompressedLength && *uncompressedLength != actualUncompressedLength) {
1275 throw std::runtime_error("LZMA2Codec: invalid uncompressed length");
1277 uncompressedLength = actualUncompressedLength;
1280 auto out = addOutputBuffer(
1282 ((uncompressedLength && *uncompressedLength <= maxSingleStepLength)
1283 ? *uncompressedLength
1284 : defaultBufferLength));
1286 bool streamEnd = false;
// peekBytes() yields the contiguous bytes at the cursor; skip past each span
// once it has been fully consumed by the decoder.
1287 auto buf = cursor.peekBytes();
1288 while (!buf.empty()) {
1289 stream.next_in = const_cast<uint8_t*>(buf.data());
1290 stream.avail_in = buf.size();
1292 while (stream.avail_in != 0) {
1294 throw std::runtime_error(to<std::string>(
1295 "LZMA2Codec: junk after end of data"));
1298 streamEnd = doInflate(&stream, out.get(), defaultBufferLength);
1301 cursor.skip(buf.size());
1302 buf = cursor.peekBytes();
// All input has been handed over; keep draining until the stream reports end.
1305 while (!streamEnd) {
1306 streamEnd = doInflate(&stream, out.get(), defaultBufferLength);
1309 out->prev()->trimEnd(stream.avail_out);
1311 if (uncompressedLength && *uncompressedLength != stream.total_out) {
1312 throw std::runtime_error(
1313 to<std::string>("LZMA2Codec: invalid uncompressed length"));
1319 #endif // FOLLY_HAVE_LIBLZMA
1321 #ifdef FOLLY_HAVE_LIBZSTD
1326 class ZSTDCodec final : public Codec {
1328 static std::unique_ptr<Codec> create(int level, CodecType);
1329 explicit ZSTDCodec(int level, CodecType type);
1331 std::vector<std::string> validPrefixes() const override;
1332 bool canUncompress(const IOBuf* data, Optional<uint64_t> uncompressedLength)
1336 bool doNeedsUncompressedLength() const override;
1337 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
1338 std::unique_ptr<IOBuf> doUncompress(
1340 Optional<uint64_t> uncompressedLength) override;
// Magic number used to recognise ZSTD data. It is handled in little-endian
// byte order by prefixToStringLE() and dataStartsWithLE().
static constexpr uint32_t kZSTDMagicLE = 0xFD2FB528;
1347 std::vector<std::string> ZSTDCodec::validPrefixes() const {
1348 return {prefixToStringLE(kZSTDMagicLE)};
1351 bool ZSTDCodec::canUncompress(const IOBuf* data, Optional<uint64_t>) const {
1352 return dataStartsWithLE(data, kZSTDMagicLE);
1355 std::unique_ptr<Codec> ZSTDCodec::create(int level, CodecType type) {
1356 return std::make_unique<ZSTDCodec>(level, type);
1359 ZSTDCodec::ZSTDCodec(int level, CodecType type) : Codec(type) {
1360 DCHECK(type == CodecType::ZSTD);
1362 case COMPRESSION_LEVEL_FASTEST:
1365 case COMPRESSION_LEVEL_DEFAULT:
1368 case COMPRESSION_LEVEL_BEST:
1372 if (level < 1 || level > ZSTD_maxCLevel()) {
1373 throw std::invalid_argument(
1374 to<std::string>("ZSTD: invalid level: ", level));
1379 bool ZSTDCodec::doNeedsUncompressedLength() const {
1383 void zstdThrowIfError(size_t rc) {
1384 if (!ZSTD_isError(rc)) {
1387 throw std::runtime_error(
1388 to<std::string>("ZSTD returned an error: ", ZSTD_getErrorName(rc)));
1391 std::unique_ptr<IOBuf> ZSTDCodec::doCompress(const IOBuf* data) {
1392 // Support earlier versions of the codec (working with a single IOBuf,
1393 // and using ZSTD_decompress which requires ZSTD frame to contain size,
1394 // which isn't populated by streaming API).
1395 if (!data->isChained()) {
1396 auto out = IOBuf::createCombined(ZSTD_compressBound(data->length()));
1397 const auto rc = ZSTD_compress(
1398 out->writableData(),
1403 zstdThrowIfError(rc);
1408 auto zcs = ZSTD_createCStream();
1410 ZSTD_freeCStream(zcs);
1413 auto rc = ZSTD_initCStream(zcs, level_);
1414 zstdThrowIfError(rc);
1416 Cursor cursor(data);
1417 auto result = IOBuf::createCombined(ZSTD_compressBound(cursor.totalLength()));
1420 out.dst = result->writableTail();
1421 out.size = result->capacity();
1424 for (auto buffer = cursor.peekBytes(); !buffer.empty();) {
1426 in.src = buffer.data();
1427 in.size = buffer.size();
1428 for (in.pos = 0; in.pos != in.size;) {
1429 rc = ZSTD_compressStream(zcs, &out, &in);
1430 zstdThrowIfError(rc);
1432 cursor.skip(in.size);
1433 buffer = cursor.peekBytes();
1436 rc = ZSTD_endStream(zcs, &out);
1437 zstdThrowIfError(rc);
1440 result->append(out.pos);
1444 static std::unique_ptr<IOBuf> zstdUncompressBuffer(
1446 Optional<uint64_t> uncompressedLength) {
1447 // Check preconditions
1448 DCHECK(!data->isChained());
1449 DCHECK(uncompressedLength.hasValue());
1451 auto uncompressed = IOBuf::create(*uncompressedLength);
1452 const auto decompressedSize = ZSTD_decompress(
1453 uncompressed->writableTail(),
1454 uncompressed->tailroom(),
1457 zstdThrowIfError(decompressedSize);
1458 if (decompressedSize != uncompressedLength) {
1459 throw std::runtime_error("ZSTD: invalid uncompressed length");
1461 uncompressed->append(decompressedSize);
1462 return uncompressed;
1465 static std::unique_ptr<IOBuf> zstdUncompressStream(
1467 Optional<uint64_t> uncompressedLength) {
1468 auto zds = ZSTD_createDStream();
1470 ZSTD_freeDStream(zds);
1473 auto rc = ZSTD_initDStream(zds);
1474 zstdThrowIfError(rc);
1476 ZSTD_outBuffer out{};
1479 auto outputSize = uncompressedLength.value_or(ZSTD_DStreamOutSize());
1481 IOBufQueue queue(IOBufQueue::cacheChainLength());
1483 Cursor cursor(data);
1485 if (in.pos == in.size) {
1486 auto buffer = cursor.peekBytes();
1487 in.src = buffer.data();
1488 in.size = buffer.size();
1490 cursor.skip(in.size);
1491 if (rc > 1 && in.size == 0) {
1492 throw std::runtime_error(to<std::string>("ZSTD: incomplete input"));
1495 if (out.pos == out.size) {
1497 queue.postallocate(out.pos);
1499 auto buffer = queue.preallocate(outputSize, outputSize);
1500 out.dst = buffer.first;
1501 out.size = buffer.second;
1503 outputSize = ZSTD_DStreamOutSize();
1505 rc = ZSTD_decompressStream(zds, &out, &in);
1506 zstdThrowIfError(rc);
1512 queue.postallocate(out.pos);
1514 if (in.pos != in.size || !cursor.isAtEnd()) {
1515 throw std::runtime_error("ZSTD: junk after end of data");
1517 if (uncompressedLength && queue.chainLength() != *uncompressedLength) {
1518 throw std::runtime_error("ZSTD: invalid uncompressed length");
1521 return queue.move();
1524 std::unique_ptr<IOBuf> ZSTDCodec::doUncompress(
1526 Optional<uint64_t> uncompressedLength) {
1528 // Read decompressed size from frame if available in first IOBuf.
1529 const auto decompressedSize =
1530 ZSTD_getDecompressedSize(data->data(), data->length());
1531 if (decompressedSize != 0) {
1532 if (uncompressedLength && *uncompressedLength != decompressedSize) {
1533 throw std::runtime_error("ZSTD: invalid uncompressed length");
1535 uncompressedLength = decompressedSize;
1538 // Faster to decompress using ZSTD_decompress() if we can.
1539 if (uncompressedLength && !data->isChained()) {
1540 return zstdUncompressBuffer(data, uncompressedLength);
1542 // Fall back to slower streaming decompression.
1543 return zstdUncompressStream(data, uncompressedLength);
1546 #endif // FOLLY_HAVE_LIBZSTD
1548 #if FOLLY_HAVE_LIBBZ2
1550 class Bzip2Codec final : public Codec {
1552 static std::unique_ptr<Codec> create(int level, CodecType type);
1553 explicit Bzip2Codec(int level, CodecType type);
1555 std::vector<std::string> validPrefixes() const override;
1556 bool canUncompress(IOBuf const* data, Optional<uint64_t> uncompressedLength)
1560 std::unique_ptr<IOBuf> doCompress(IOBuf const* data) override;
1561 std::unique_ptr<IOBuf> doUncompress(
1563 Optional<uint64_t> uncompressedLength) override;
1568 /* static */ std::unique_ptr<Codec> Bzip2Codec::create(
1571 return std::make_unique<Bzip2Codec>(level, type);
1574 Bzip2Codec::Bzip2Codec(int level, CodecType type) : Codec(type) {
1575 DCHECK(type == CodecType::BZIP2);
1577 case COMPRESSION_LEVEL_FASTEST:
1580 case COMPRESSION_LEVEL_DEFAULT:
1583 case COMPRESSION_LEVEL_BEST:
1587 if (level < 1 || level > 9) {
1588 throw std::invalid_argument(
1589 to<std::string>("Bzip2: invalid level: ", level));
// Leading bytes of bzip2 data ("BZh": 0x42 0x5a 0x68), packed little-endian.
static constexpr uint32_t kBzip2MagicLE = 0x685a42;
// Number of significant bytes in kBzip2MagicLE.
static constexpr uint64_t kBzip2MagicBytes = 3;
1597 std::vector<std::string> Bzip2Codec::validPrefixes() const {
1598 return {prefixToStringLE(kBzip2MagicLE, kBzip2MagicBytes)};
1601 bool Bzip2Codec::canUncompress(IOBuf const* data, Optional<uint64_t>) const {
1602 return dataStartsWithLE(data, kBzip2MagicLE, kBzip2MagicBytes);
1605 static bz_stream createBzStream() {
1607 stream.bzalloc = nullptr;
1608 stream.bzfree = nullptr;
1609 stream.opaque = nullptr;
1610 stream.next_in = stream.next_out = nullptr;
1611 stream.avail_in = stream.avail_out = 0;
1615 // Throws on error condition, otherwise returns the code.
1616 static int bzCheck(int const rc) {
1625 throw std::runtime_error(to<std::string>("Bzip2 error: ", rc));
// Upper bound on the compressed size of `uncompressedLength` input bytes.
//
// http://www.bzip.org/1.0.5/bzip2-manual-1.0.5.html#bzbufftobuffcompress
// To guarantee that the compressed data will fit in its buffer, allocate an
// output buffer of size 1% larger than the uncompressed data, plus six
// hundred extra bytes.
static uint64_t bzCompressBound(uint64_t const uncompressedLength) {
  uint64_t constexpr kFixedOverhead = 600;
  uint64_t const onePercent = uncompressedLength / 100;
  return uncompressedLength + onePercent + kFixedOverhead;
}
1637 static std::unique_ptr<IOBuf> addOutputBuffer(
1639 uint64_t const bufferLength) {
1640 DCHECK_LE(bufferLength, std::numeric_limits<unsigned>::max());
1641 DCHECK_EQ(stream->avail_out, 0);
1643 auto buf = IOBuf::create(bufferLength);
1644 buf->append(buf->capacity());
1646 stream->next_out = reinterpret_cast<char*>(buf->writableData());
1647 stream->avail_out = buf->length();
1652 std::unique_ptr<IOBuf> Bzip2Codec::doCompress(IOBuf const* data) {
1653 bz_stream stream = createBzStream();
1654 bzCheck(BZ2_bzCompressInit(&stream, level_, 0, 0));
1656 bzCheck(BZ2_bzCompressEnd(&stream));
1659 uint64_t const uncompressedLength = data->computeChainDataLength();
1660 uint64_t const maxCompressedLength = bzCompressBound(uncompressedLength);
1661 uint64_t constexpr kMaxSingleStepLength = uint64_t(64) << 20; // 64 MiB
1662 uint64_t constexpr kDefaultBufferLength = uint64_t(4) << 20;
1664 auto out = addOutputBuffer(
1666 maxCompressedLength <= kMaxSingleStepLength ? maxCompressedLength
1667 : kDefaultBufferLength);
1669 for (auto range : *data) {
1670 while (!range.empty()) {
1671 auto const inSize = std::min<size_t>(range.size(), kMaxSingleStepLength);
1673 const_cast<char*>(reinterpret_cast<char const*>(range.data()));
1674 stream.avail_in = inSize;
1676 if (stream.avail_out == 0) {
1677 out->prependChain(addOutputBuffer(&stream, kDefaultBufferLength));
1680 bzCheck(BZ2_bzCompress(&stream, BZ_RUN));
1681 range.uncheckedAdvance(inSize - stream.avail_in);
1685 if (stream.avail_out == 0) {
1686 out->prependChain(addOutputBuffer(&stream, kDefaultBufferLength));
1688 } while (bzCheck(BZ2_bzCompress(&stream, BZ_FINISH)) != BZ_STREAM_END);
1690 out->prev()->trimEnd(stream.avail_out);
1695 std::unique_ptr<IOBuf> Bzip2Codec::doUncompress(
1697 Optional<uint64_t> uncompressedLength) {
1698 bz_stream stream = createBzStream();
1699 bzCheck(BZ2_bzDecompressInit(&stream, 0, 0));
1701 bzCheck(BZ2_bzDecompressEnd(&stream));
1704 uint64_t constexpr kMaxSingleStepLength = uint64_t(64) << 20; // 64 MiB
1705 uint64_t const kBlockSize = uint64_t(100) << 10; // 100 KiB
1706 uint64_t const kDefaultBufferLength =
1707 computeBufferLength(data->computeChainDataLength(), kBlockSize);
1709 auto out = addOutputBuffer(
1711 ((uncompressedLength && *uncompressedLength <= kMaxSingleStepLength)
1712 ? *uncompressedLength
1713 : kDefaultBufferLength));
1716 for (auto range : *data) {
1717 while (!range.empty()) {
1718 auto const inSize = std::min<size_t>(range.size(), kMaxSingleStepLength);
1720 const_cast<char*>(reinterpret_cast<char const*>(range.data()));
1721 stream.avail_in = inSize;
1723 if (stream.avail_out == 0) {
1724 out->prependChain(addOutputBuffer(&stream, kDefaultBufferLength));
1727 rc = bzCheck(BZ2_bzDecompress(&stream));
1728 range.uncheckedAdvance(inSize - stream.avail_in);
1731 while (rc != BZ_STREAM_END) {
1732 if (stream.avail_out == 0) {
1733 out->prependChain(addOutputBuffer(&stream, kDefaultBufferLength));
1736 rc = bzCheck(BZ2_bzDecompress(&stream));
1739 out->prev()->trimEnd(stream.avail_out);
1741 uint64_t const totalOut =
1742 (uint64_t(stream.total_out_hi32) << 32) + stream.total_out_lo32;
1743 if (uncompressedLength && uncompressedLength != totalOut) {
1744 throw std::runtime_error("Bzip2 error: Invalid uncompressed length");
1750 #endif // FOLLY_HAVE_LIBBZ2
1753 * Automatic decompression
1755 class AutomaticCodec final : public Codec {
1757 static std::unique_ptr<Codec> create(
1758 std::vector<std::unique_ptr<Codec>> customCodecs);
1759 explicit AutomaticCodec(std::vector<std::unique_ptr<Codec>> customCodecs);
1761 std::vector<std::string> validPrefixes() const override;
1762 bool canUncompress(const IOBuf* data, Optional<uint64_t> uncompressedLength)
1766 bool doNeedsUncompressedLength() const override;
1767 uint64_t doMaxUncompressedLength() const override;
1769 std::unique_ptr<IOBuf> doCompress(const IOBuf*) override {
1770 throw std::runtime_error("AutomaticCodec error: compress() not supported.");
1772 std::unique_ptr<IOBuf> doUncompress(
1774 Optional<uint64_t> uncompressedLength) override;
1776 void addCodecIfSupported(CodecType type);
1778 // Throws iff the codecs aren't compatible (very slow)
1779 void checkCompatibleCodecs() const;
1781 std::vector<std::unique_ptr<Codec>> codecs_;
1782 bool needsUncompressedLength_;
1783 uint64_t maxUncompressedLength_;
1786 std::vector<std::string> AutomaticCodec::validPrefixes() const {
1787 std::unordered_set<std::string> prefixes;
1788 for (const auto& codec : codecs_) {
1789 const auto codecPrefixes = codec->validPrefixes();
1790 prefixes.insert(codecPrefixes.begin(), codecPrefixes.end());
1792 return std::vector<std::string>{prefixes.begin(), prefixes.end()};
1795 bool AutomaticCodec::canUncompress(
1797 Optional<uint64_t> uncompressedLength) const {
1801 [data, uncompressedLength](std::unique_ptr<Codec> const& codec) {
1802 return codec->canUncompress(data, uncompressedLength);
1806 void AutomaticCodec::addCodecIfSupported(CodecType type) {
1807 const bool present = std::any_of(
1810 [&type](std::unique_ptr<Codec> const& codec) {
1811 return codec->type() == type;
1813 if (hasCodec(type) && !present) {
1814 codecs_.push_back(getCodec(type));
1818 /* static */ std::unique_ptr<Codec> AutomaticCodec::create(
1819 std::vector<std::unique_ptr<Codec>> customCodecs) {
1820 return std::make_unique<AutomaticCodec>(std::move(customCodecs));
1823 AutomaticCodec::AutomaticCodec(std::vector<std::unique_ptr<Codec>> customCodecs)
1824 : Codec(CodecType::USER_DEFINED), codecs_(std::move(customCodecs)) {
1825 // Fastest -> slowest
1826 addCodecIfSupported(CodecType::LZ4_FRAME);
1827 addCodecIfSupported(CodecType::ZSTD);
1828 addCodecIfSupported(CodecType::ZLIB);
1829 addCodecIfSupported(CodecType::GZIP);
1830 addCodecIfSupported(CodecType::LZMA2);
1831 addCodecIfSupported(CodecType::BZIP2);
1833 checkCompatibleCodecs();
1835 // Check that none of the codes are are null
1836 DCHECK(std::none_of(
1837 codecs_.begin(), codecs_.end(), [](std::unique_ptr<Codec> const& codec) {
1838 return codec == nullptr;
1841 needsUncompressedLength_ = std::any_of(
1842 codecs_.begin(), codecs_.end(), [](std::unique_ptr<Codec> const& codec) {
1843 return codec->needsUncompressedLength();
1846 const auto it = std::max_element(
1849 [](std::unique_ptr<Codec> const& lhs, std::unique_ptr<Codec> const& rhs) {
1850 return lhs->maxUncompressedLength() < rhs->maxUncompressedLength();
1852 DCHECK(it != codecs_.end());
1853 maxUncompressedLength_ = (*it)->maxUncompressedLength();
1856 void AutomaticCodec::checkCompatibleCodecs() const {
1857 // Keep track of all the possible headers.
1858 std::unordered_set<std::string> headers;
1859 // The empty header is not allowed.
1862 // Construct a set of headers and check that none of the headers occur twice.
1863 // Eliminate edge cases.
1864 for (auto&& codec : codecs_) {
1865 const auto codecHeaders = codec->validPrefixes();
1866 // Codecs without any valid headers are not allowed.
1867 if (codecHeaders.empty()) {
1868 throw std::invalid_argument{
1869 "AutomaticCodec: validPrefixes() must not be empty."};
1871 // Insert all the headers for the current codec.
1872 const size_t beforeSize = headers.size();
1873 headers.insert(codecHeaders.begin(), codecHeaders.end());
1874 // Codecs are not compatible if any header occurred twice.
1875 if (beforeSize + codecHeaders.size() != headers.size()) {
1876 throw std::invalid_argument{
1877 "AutomaticCodec: Two valid prefixes collide."};
1881 // Check if any strict non-empty prefix of any header is a header.
1882 for (const auto& header : headers) {
1883 for (size_t i = 1; i < header.size(); ++i) {
1884 if (headers.count(header.substr(0, i))) {
1885 throw std::invalid_argument{
1886 "AutomaticCodec: One valid prefix is a prefix of another valid "
1893 bool AutomaticCodec::doNeedsUncompressedLength() const {
1894 return needsUncompressedLength_;
1897 uint64_t AutomaticCodec::doMaxUncompressedLength() const {
1898 return maxUncompressedLength_;
1901 std::unique_ptr<IOBuf> AutomaticCodec::doUncompress(
1903 Optional<uint64_t> uncompressedLength) {
1904 for (auto&& codec : codecs_) {
1905 if (codec->canUncompress(data, uncompressedLength)) {
1906 return codec->uncompress(data, uncompressedLength);
1909 throw std::runtime_error("AutomaticCodec error: Unknown compressed data");
1914 typedef std::unique_ptr<Codec> (*CodecFactory)(int, CodecType);
1915 static constexpr CodecFactory
1916 codecFactories[static_cast<size_t>(CodecType::NUM_CODEC_TYPES)] = {
1917 nullptr, // USER_DEFINED
1918 NoCompressionCodec::create,
1920 #if FOLLY_HAVE_LIBLZ4
1926 #if FOLLY_HAVE_LIBSNAPPY
1927 SnappyCodec::create,
1938 #if FOLLY_HAVE_LIBLZ4
1944 #if FOLLY_HAVE_LIBLZMA
1952 #if FOLLY_HAVE_LIBZSTD
1964 #if (FOLLY_HAVE_LIBLZ4 && LZ4_VERSION_NUMBER >= 10301)
1965 LZ4FrameCodec::create,
1970 #if FOLLY_HAVE_LIBBZ2
1977 bool hasCodec(CodecType type) {
1978 size_t idx = static_cast<size_t>(type);
1979 if (idx >= static_cast<size_t>(CodecType::NUM_CODEC_TYPES)) {
1980 throw std::invalid_argument(
1981 to<std::string>("Compression type ", idx, " invalid"));
1983 return codecFactories[idx] != nullptr;
1986 std::unique_ptr<Codec> getCodec(CodecType type, int level) {
1987 size_t idx = static_cast<size_t>(type);
1988 if (idx >= static_cast<size_t>(CodecType::NUM_CODEC_TYPES)) {
1989 throw std::invalid_argument(
1990 to<std::string>("Compression type ", idx, " invalid"));
1992 auto factory = codecFactories[idx];
1994 throw std::invalid_argument(to<std::string>(
1995 "Compression type ", idx, " not supported"));
1997 auto codec = (*factory)(level, type);
1998 DCHECK_EQ(static_cast<size_t>(codec->type()), idx);
2002 std::unique_ptr<Codec> getAutoUncompressionCodec(
2003 std::vector<std::unique_ptr<Codec>> customCodecs) {
2004 return AutomaticCodec::create(std::move(customCodecs));