2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/io/Compression.h>
22 #if LZ4_VERSION_NUMBER >= 10301
27 #include <glog/logging.h>
29 #if FOLLY_HAVE_LIBSNAPPY
31 #include <snappy-sinksource.h>
38 #if FOLLY_HAVE_LIBLZMA
42 #if FOLLY_HAVE_LIBZSTD
43 #define ZSTD_STATIC_LINKING_ONLY
51 #include <folly/Bits.h>
52 #include <folly/Conv.h>
53 #include <folly/Memory.h>
54 #include <folly/Portability.h>
55 #include <folly/ScopeGuard.h>
56 #include <folly/Varint.h>
57 #include <folly/io/Cursor.h>
59 #include <unordered_set>
61 namespace folly { namespace io {
// Base-class constructor: stores the codec's type tag in type_.
63 Codec::Codec(CodecType type) : type_(type) { }
// Codec::compress (IOBuf overload).
// Computes the total length of the input chain, throws std::runtime_error if
// it exceeds maxUncompressedLength(), and otherwise forwards to doCompress().
// NOTE(review): the condition guarding `return IOBuf::create(0);` is not
// visible in this extract (presumably the zero-length case -- confirm). Also,
// `data` is dereferenced by the first statement with no null check in view.
65 // Ensure consistent behavior in the nullptr case
66 std::unique_ptr<IOBuf> Codec::compress(const IOBuf* data) {
67 uint64_t len = data->computeChainDataLength();
69 return IOBuf::create(0);
71 if (len > maxUncompressedLength()) {
72 throw std::runtime_error("Codec: uncompressed length too large");
75 return doCompress(data);
// Codec::compress (string overload).
// Rejects inputs longer than maxUncompressedLength() with std::runtime_error,
// then forwards to doCompressString().
// NOTE(review): the lines between the length computation and the size check
// are not visible in this extract.
78 std::string Codec::compress(const StringPiece data) {
79 const uint64_t len = data.size();
83 if (len > maxUncompressedLength()) {
84 throw std::runtime_error("Codec: uncompressed length too large");
87 return doCompressString(data);
// Codec::uncompress (IOBuf overload).
// Validates the optional uncompressedLength before forwarding to
// doUncompress():
//  - absent while needsUncompressedLength() is true -> std::invalid_argument;
//  - present and larger than maxUncompressedLength() -> std::runtime_error.
// A further branch throws std::runtime_error when a non-zero length was
// supplied and otherwise returns an empty IOBuf.
// NOTE(review): the condition guarding that branch and the `data` parameter
// line are not visible in this extract (presumably the empty-input case --
// confirm).
90 std::unique_ptr<IOBuf> Codec::uncompress(
92 Optional<uint64_t> uncompressedLength) {
93 if (!uncompressedLength) {
94 if (needsUncompressedLength()) {
95 throw std::invalid_argument("Codec: uncompressed length required");
97 } else if (*uncompressedLength > maxUncompressedLength()) {
98 throw std::runtime_error("Codec: uncompressed length too large");
102 if (uncompressedLength.value_or(0) != 0) {
103 throw std::runtime_error("Codec: invalid uncompressed length");
105 return IOBuf::create(0);
108 return doUncompress(data, uncompressedLength);
// Codec::uncompress (string overload).
// Applies the length validation visible below before forwarding to
// doUncompressString():
//  - absent while needsUncompressedLength() is true -> std::invalid_argument;
//  - present and larger than maxUncompressedLength() -> std::runtime_error.
// A further branch throws std::runtime_error when a non-zero length was
// supplied.
// NOTE(review): the condition guarding that branch and the value it returns
// are not visible in this extract.
111 std::string Codec::uncompress(
112 const StringPiece data,
113 Optional<uint64_t> uncompressedLength) {
114 if (!uncompressedLength) {
115 if (needsUncompressedLength()) {
116 throw std::invalid_argument("Codec: uncompressed length required");
118 } else if (*uncompressedLength > maxUncompressedLength()) {
119 throw std::runtime_error("Codec: uncompressed length too large");
123 if (uncompressedLength.value_or(0) != 0) {
124 throw std::runtime_error("Codec: invalid uncompressed length");
129 return doUncompressString(data, uncompressedLength);
// Public query: forwards to the doNeedsUncompressedLength() hook.
132 bool Codec::needsUncompressedLength() const {
133 return doNeedsUncompressedLength();
// Public query: forwards to the doMaxUncompressedLength() hook.
136 uint64_t Codec::maxUncompressedLength() const {
137 return doMaxUncompressedLength();
// Base-class implementation of the doNeedsUncompressedLength() hook.
// NOTE(review): the body is not visible in this extract.
140 bool Codec::doNeedsUncompressedLength() const {
// Base-class implementation of the doMaxUncompressedLength() hook: returns
// UNLIMITED_UNCOMPRESSED_LENGTH.
144 uint64_t Codec::doMaxUncompressedLength() const {
145 return UNLIMITED_UNCOMPRESSED_LENGTH;
// Base-class implementation of validPrefixes().
// NOTE(review): the body is not visible in this extract.
148 std::vector<std::string> Codec::validPrefixes() const {
// Base-class implementation of canUncompress(); both parameters are unnamed,
// so neither is used.
// NOTE(review): the body is not visible in this extract.
152 bool Codec::canUncompress(const IOBuf*, Optional<uint64_t>) const {
// Base-class string compression built on the IOBuf path: wraps `data` in an
// IOBuf via IOBuf::WRAP_BUFFER, calls doCompress(), then appends every buffer
// of the resulting chain to `output` after reserving the total chain length.
// NOTE(review): the declaration and the return of `output` are not visible in
// this extract.
156 std::string Codec::doCompressString(const StringPiece data) {
157 const IOBuf inputBuffer{IOBuf::WRAP_BUFFER, data};
158 auto outputBuffer = doCompress(&inputBuffer);
160 output.reserve(outputBuffer->computeChainDataLength());
161 for (auto range : *outputBuffer) {
162 output.append(reinterpret_cast<const char*>(range.data()), range.size());
// Base-class string decompression built on the IOBuf path: wraps `data` in an
// IOBuf via IOBuf::WRAP_BUFFER, calls doUncompress() with the optional
// expected length, then appends every buffer of the resulting chain to
// `output` after reserving the total chain length.
// NOTE(review): the declaration and the return of `output` are not visible in
// this extract.
167 std::string Codec::doUncompressString(
168 const StringPiece data,
169 Optional<uint64_t> uncompressedLength) {
170 const IOBuf inputBuffer{IOBuf::WRAP_BUFFER, data};
171 auto outputBuffer = doUncompress(&inputBuffer, uncompressedLength);
173 output.reserve(outputBuffer->computeChainDataLength());
174 for (auto range : *outputBuffer) {
175 output.append(reinterpret_cast<const char*>(range.data()), range.size());
// Public query for an upper bound on the compressed size. A zero
// uncompressedLength is special-cased; any other value is forwarded to the
// doMaxCompressedLength() hook.
// NOTE(review): the value returned for the zero-length case is not visible in
// this extract.
180 uint64_t Codec::maxCompressedLength(uint64_t uncompressedLength) const {
181 if (uncompressedLength == 0) {
184 return doMaxCompressedLength(uncompressedLength);
// Public query for the uncompressed length of `data`.
// When the caller-supplied length is 0 or the compressed data is empty, the
// two must agree: a non-zero supplied length, or non-empty data, throws
// std::runtime_error. All other inputs are forwarded to the
// doGetUncompressedLength() hook.
// NOTE(review): the value returned when the consistency check passes is not
// visible in this extract.
187 Optional<uint64_t> Codec::getUncompressedLength(
188 const folly::IOBuf* data,
189 Optional<uint64_t> uncompressedLength) const {
190 auto const compressedLength = data->computeChainDataLength();
191 if (uncompressedLength == uint64_t(0) || compressedLength == 0) {
192 if (uncompressedLength.value_or(0) != 0 || compressedLength != 0) {
193 throw std::runtime_error("Invalid uncompressed length");
197 return doGetUncompressedLength(data, uncompressedLength);
// Base-class implementation of the doGetUncompressedLength() hook: returns the
// caller-supplied uncompressedLength unchanged.
// NOTE(review): the first parameter line is not visible in this extract.
200 Optional<uint64_t> Codec::doGetUncompressedLength(
202 Optional<uint64_t> uncompressedLength) const {
203 return uncompressedLength;
// Public query: forwards to the doNeedsDataLength() hook.
206 bool StreamCodec::needsDataLength() const {
207 return doNeedsDataLength();
// Base-class implementation of the doNeedsDataLength() hook.
// NOTE(review): the body is not visible in this extract.
210 bool StreamCodec::doNeedsDataLength() const {
// Throws std::logic_error, with a message naming both the current and the
// expected state, when state_ differs from `expected`.
214 void StreamCodec::assertStateIs(State expected) const {
215 if (state_ != expected) {
216 throw std::logic_error(folly::to<std::string>(
217 "Codec: state is ", state_, "; expected state ", expected));
// Puts the stream back into State::RESET and records the (optional) expected
// uncompressed length for the next stream.
// NOTE(review): any statements after these two assignments are not visible in
// this extract.
221 void StreamCodec::resetStream(Optional<uint64_t> uncompressedLength) {
222 state_ = State::RESET;
223 uncompressedLength_ = uncompressedLength;
// StreamCodec::compressStream: validates the call against the stream state,
// invokes doCompressStream(), and updates state_ afterwards.
//  - In State::RESET with empty input, FlushOp::NONE and FlushOp::END are
//    special-cased; END throws std::runtime_error when a non-zero
//    uncompressed length was declared.
//  - In State::RESET with non-empty input, a declared uncompressed length of
//    0 throws std::runtime_error.
//  - NONE enters COMPRESS from RESET; FLUSH and END enter COMPRESS_FLUSH /
//    COMPRESS_END from RESET or COMPRESS. In each case assertStateIs() then
//    enforces the resulting state, so any other starting state throws.
//  - After the call, COMPRESS_FLUSH is moved back to COMPRESS, and the DCHECKs
//    assert that all input was consumed and that the op was not NONE.
// NOTE(review): the `input` parameter line, the returns of the early
// branches, the switch header and breaks, the guard around the output
// transitions, the COMPRESS_END transition target and the final return are
// not visible in this extract.
227 bool StreamCodec::compressStream(
229 MutableByteRange& output,
230 StreamCodec::FlushOp flushOp) {
231 if (state_ == State::RESET && input.empty()) {
232 if (flushOp == StreamCodec::FlushOp::NONE) {
235 if (flushOp == StreamCodec::FlushOp::END &&
236 uncompressedLength().value_or(0) != 0) {
237 throw std::runtime_error("Codec: invalid uncompressed length");
241 if (state_ == State::RESET && !input.empty() &&
242 uncompressedLength() == uint64_t(0)) {
243 throw std::runtime_error("Codec: invalid uncompressed length");
245 // Handle input state transitions
247 case StreamCodec::FlushOp::NONE:
248 if (state_ == State::RESET) {
249 state_ = State::COMPRESS;
251 assertStateIs(State::COMPRESS);
253 case StreamCodec::FlushOp::FLUSH:
254 if (state_ == State::RESET || state_ == State::COMPRESS) {
255 state_ = State::COMPRESS_FLUSH;
257 assertStateIs(State::COMPRESS_FLUSH);
259 case StreamCodec::FlushOp::END:
260 if (state_ == State::RESET || state_ == State::COMPRESS) {
261 state_ = State::COMPRESS_END;
263 assertStateIs(State::COMPRESS_END);
266 bool const done = doCompressStream(input, output, flushOp);
267 // Handle output state transitions
269 if (state_ == State::COMPRESS_FLUSH) {
270 state_ = State::COMPRESS;
271 } else if (state_ == State::COMPRESS_END) {
274 // Check internal invariants
275 DCHECK(input.empty());
276 DCHECK(flushOp != StreamCodec::FlushOp::NONE);
// StreamCodec::uncompressStream: validates the call against the stream state
// and invokes doUncompressStream().
//  - In State::RESET with empty input, the declared uncompressed length is
//    compared against 0 and the call returns early.
//  - From RESET the state becomes UNCOMPRESS; assertStateIs(UNCOMPRESS) then
//    guards the call to doUncompressStream().
// NOTE(review): the `input` parameter line, the early return values, and the
// output state transition / final return are not visible in this extract.
281 bool StreamCodec::uncompressStream(
283 MutableByteRange& output,
284 StreamCodec::FlushOp flushOp) {
285 if (state_ == State::RESET && input.empty()) {
286 if (uncompressedLength().value_or(0) == 0) {
291 // Handle input state transitions
292 if (state_ == State::RESET) {
293 state_ = State::UNCOMPRESS;
295 assertStateIs(State::UNCOMPRESS);
296 bool const done = doUncompressStream(input, output, flushOp);
297 // Handle output state transitions
// Allocates a new IOBuf of `size` bytes, extends its length to its full
// capacity, and points `output` at that writable region. `output` must be
// empty on entry (DCHECK).
// NOTE(review): the `size` parameter line and the return statement are not
// visible in this extract.
304 static std::unique_ptr<IOBuf> addOutputBuffer(
305 MutableByteRange& output,
307 DCHECK(output.empty());
308 auto buffer = IOBuf::create(size);
309 buffer->append(buffer->capacity());
310 output = {buffer->writableData(), buffer->length()};
// One-shot compression implemented on top of the streaming interface.
// Resets the stream with the total input length, sizes the first output
// buffer to maxCompressedLength() when that is at most kMaxSingleStepLength
// (otherwise kDefaultBufferLength), then feeds the input chain to
// compressStream(): empty input ranges advance to the next IOBuf, the last
// IOBuf switches the flush op to END, and a fresh kDefaultBufferLength output
// buffer is chained on whenever `output` is exhausted. Finally the unused
// tail of the last output buffer is trimmed.
// NOTE(review): the enclosing loop, its exit on `done`, one argument line of
// the first addOutputBuffer() call and the return statement are not visible
// in this extract.
314 std::unique_ptr<IOBuf> StreamCodec::doCompress(IOBuf const* data) {
315 uint64_t const uncompressedLength = data->computeChainDataLength();
316 resetStream(uncompressedLength);
317 uint64_t const maxCompressedLen = maxCompressedLength(uncompressedLength);
319 auto constexpr kMaxSingleStepLength = uint64_t(64) << 20; // 64 MB
320 auto constexpr kDefaultBufferLength = uint64_t(4) << 20; // 4 MB
322 MutableByteRange output;
323 auto buffer = addOutputBuffer(
325 maxCompressedLen <= kMaxSingleStepLength ? maxCompressedLen
326 : kDefaultBufferLength);
328 // Compress the entire IOBuf chain into the IOBuf chain pointed to by buffer
329 IOBuf const* current = data;
330 ByteRange input{current->data(), current->length()};
331 StreamCodec::FlushOp flushOp = StreamCodec::FlushOp::NONE;
333 while (input.empty() && current->next() != data) {
334 current = current->next();
335 input = {current->data(), current->length()};
337 if (current->next() == data) {
338 // This is the last input buffer so end the stream
339 flushOp = StreamCodec::FlushOp::END;
341 if (output.empty()) {
342 buffer->prependChain(addOutputBuffer(output, kDefaultBufferLength));
344 bool const done = compressStream(input, output, flushOp);
346 DCHECK(input.empty());
347 DCHECK(flushOp == StreamCodec::FlushOp::END);
348 DCHECK_EQ(current->next(), data);
352 buffer->prev()->trimEnd(output.size());
/**
 * Picks the size of the output buffers allocated while uncompressing.
 *
 * The result is four times the larger of `compressedLength` and `blockSize`,
 * capped at 4 MiB.
 *
 * @param compressedLength  Total size of the compressed input, in bytes.
 * @param blockSize         Minimum unit the codec decompresses at a time.
 * @return                  Buffer length in bytes, never more than 4 MiB.
 */
static uint64_t computeBufferLength(
    uint64_t const compressedLength,
    uint64_t const blockSize) {
  uint64_t constexpr kMaxBufferLength = uint64_t(4) << 20; // 4 MiB
  uint64_t constexpr kGrowthFactor = 4;
  // Clamp *before* multiplying: `4 * max(...)` wraps around for lengths of
  // 2^62 and above, which could produce a zero or tiny buffer length. Because
  // kMaxBufferLength is a multiple of kGrowthFactor, clamping the operand
  // yields exactly the same result for every input that did not overflow.
  uint64_t const clampedBase = std::min(
      std::max(blockSize, compressedLength), kMaxBufferLength / kGrowthFactor);
  return kGrowthFactor * clampedBase;
}
// One-shot decompression implemented on top of the streaming interface.
// Resolves the expected length through getUncompressedLength(), resets the
// stream with it, and sizes the first output buffer to that length when it is
// known and at most kMaxSingleStepLength; otherwise the size comes from
// computeBufferLength(compressed size, kBlockSize). The input chain is then
// fed to uncompressStream(): empty ranges advance to the next IOBuf, the last
// IOBuf switches the flush op to END, and a new output buffer is chained on
// whenever `output` is exhausted. Leftover input throws std::runtime_error
// ("Junk after end of data"); the unused output tail is trimmed; and a
// mismatch between the produced size and the expected length throws
// std::runtime_error.
// NOTE(review): the `data` parameter line, the enclosing loop and its exit
// condition, one argument line of the first addOutputBuffer() call and the
// return statement are not visible in this extract.
364 std::unique_ptr<IOBuf> StreamCodec::doUncompress(
366 Optional<uint64_t> uncompressedLength) {
367 auto constexpr kMaxSingleStepLength = uint64_t(64) << 20; // 64 MB
368 auto constexpr kBlockSize = uint64_t(128) << 10;
369 auto const defaultBufferLength =
370 computeBufferLength(data->computeChainDataLength(), kBlockSize);
372 uncompressedLength = getUncompressedLength(data, uncompressedLength);
373 resetStream(uncompressedLength);
375 MutableByteRange output;
376 auto buffer = addOutputBuffer(
378 (uncompressedLength && *uncompressedLength <= kMaxSingleStepLength
379 ? *uncompressedLength
380 : defaultBufferLength));
382 // Uncompress the entire IOBuf chain into the IOBuf chain pointed to by buffer
383 IOBuf const* current = data;
384 ByteRange input{current->data(), current->length()};
385 StreamCodec::FlushOp flushOp = StreamCodec::FlushOp::NONE;
387 while (input.empty() && current->next() != data) {
388 current = current->next();
389 input = {current->data(), current->length()};
391 if (current->next() == data) {
392 // Tell the uncompressor there is no more input (it may optimize)
393 flushOp = StreamCodec::FlushOp::END;
395 if (output.empty()) {
396 buffer->prependChain(addOutputBuffer(output, defaultBufferLength));
398 bool const done = uncompressStream(input, output, flushOp);
403 if (!input.empty()) {
404 throw std::runtime_error("Codec: Junk after end of data");
407 buffer->prev()->trimEnd(output.size());
408 if (uncompressedLength &&
409 *uncompressedLength != buffer->computeChainDataLength()) {
410 throw std::runtime_error("Codec: invalid uncompressed length");
// Codec subclass (final) declaring a static create() factory, an explicit
// (level, type) constructor, and overrides of doMaxCompressedLength(),
// doCompress() and doUncompress().
// NOTE(review): access specifiers, one parameter line and the end of the class
// are not visible in this extract.
421 class NoCompressionCodec final : public Codec {
423 static std::unique_ptr<Codec> create(int level, CodecType type);
424 explicit NoCompressionCodec(int level, CodecType type);
427 uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const override;
428 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
429 std::unique_ptr<IOBuf> doUncompress(
431 Optional<uint64_t> uncompressedLength) override;
// Factory: builds a NoCompressionCodec with std::make_unique and returns it as
// a std::unique_ptr<Codec>.
434 std::unique_ptr<Codec> NoCompressionCodec::create(int level, CodecType type) {
435 return std::make_unique<NoCompressionCodec>(level, type);
// Constructor. DCHECKs that `type` is CodecType::NO_COMPRESSION, lists the
// three symbolic compression levels as switch cases, and throws
// std::invalid_argument ("invalid level") for a rejected level.
// NOTE(review): the initializer list, the switch header, what the symbolic
// levels map to, and the condition guarding the throw are not visible in this
// extract.
438 NoCompressionCodec::NoCompressionCodec(int level, CodecType type)
440 DCHECK(type == CodecType::NO_COMPRESSION);
442 case COMPRESSION_LEVEL_DEFAULT:
443 case COMPRESSION_LEVEL_FASTEST:
444 case COMPRESSION_LEVEL_BEST:
448 throw std::invalid_argument(to<std::string>(
449 "NoCompressionCodec: invalid level ", level));
// The "compressed" bound equals the input size: returns uncompressedLength
// unchanged.
453 uint64_t NoCompressionCodec::doMaxCompressedLength(
454 uint64_t uncompressedLength) const {
455 return uncompressedLength;
// "Compresses" by returning data->clone().
// NOTE(review): the parameter line is not visible in this extract.
458 std::unique_ptr<IOBuf> NoCompressionCodec::doCompress(
460 return data->clone();
// "Uncompresses" by returning data->clone(). When an expected length is
// supplied it must equal the chain's data length, otherwise
// std::runtime_error is thrown.
// NOTE(review): the `data` parameter line is not visible in this extract.
463 std::unique_ptr<IOBuf> NoCompressionCodec::doUncompress(
465 Optional<uint64_t> uncompressedLength) {
466 if (uncompressedLength &&
467 data->computeChainDataLength() != *uncompressedLength) {
468 throw std::runtime_error(
469 to<std::string>("NoCompressionCodec: invalid uncompressed length"));
471 return data->clone();
474 #if (FOLLY_HAVE_LIBLZ4 || FOLLY_HAVE_LIBLZMA)
// Writes `val` as a varint at the writable tail of `out` and extends the
// buffer's length by the value encodeVarint() returns. The caller must have
// left at least kMaxVarintLength64 bytes of tailroom (DCHECK).
478 void encodeVarintToIOBuf(uint64_t val, folly::IOBuf* out) {
479 DCHECK_GE(out->tailroom(), kMaxVarintLength64);
480 out->append(encodeVarint(val, out->writableTail()));
// Decodes a varint from `cursor`: each byte read contributes its low 7 bits to
// `val`, at shifts of 0, 7, ... up to 63. Throws std::invalid_argument
// ("Too big") for an over-long encoding.
// NOTE(review): the declarations of `val` and `b`, the loop's exit condition,
// the condition guarding the throw and the return are not visible in this
// extract.
483 inline uint64_t decodeVarintFromCursor(folly::io::Cursor& cursor) {
486 for (int shift = 0; shift <= 63; shift += 7) {
487 b = cursor.read<int8_t>();
488 val |= static_cast<uint64_t>(b & 0x7f) << shift;
494 throw std::invalid_argument("Invalid varint value. Too big.");
501 #endif // FOLLY_HAVE_LIBLZ4 || FOLLY_HAVE_LIBLZMA
// dataStartsWithLE: enabled only for unsigned T. Reads a little-endian T from
// the start of `data` (via cursor.tryReadLE), masks it down to its low `n`
// bytes, and compares the result with `prefix`. `n` must not exceed sizeof(T)
// (DCHECK); when n == sizeof(T) the mask is all ones, which avoids shifting by
// the full width of T.
// NOTE(review): the comment delimiters, the declarations of `value` and
// `cursor`, and the early return taken when the read fails are not visible in
// this extract.
505 * Reads sizeof(T) bytes, and returns false if not enough bytes are available.
506 * Returns true if the first n bytes are equal to prefix when interpreted as
509 template <typename T>
510 typename std::enable_if<std::is_unsigned<T>::value, bool>::type
511 dataStartsWithLE(const IOBuf* data, T prefix, uint64_t n = sizeof(T)) {
513 DCHECK_LE(n, sizeof(T));
516 if (!cursor.tryReadLE(value)) {
519 const T mask = n == sizeof(T) ? T(-1) : (T(1) << (8 * n)) - 1;
520 return prefix == (value & mask);
// prefixToStringLE: enabled for arithmetic T. Converts `prefix` with
// Endian::little() and copies its first `n` bytes into `result`. `n` must not
// exceed sizeof(T) (DCHECK).
// NOTE(review): the declaration/sizing of `result` and the return statement
// are not visible in this extract.
523 template <typename T>
524 typename std::enable_if<std::is_arithmetic<T>::value, std::string>::type
525 prefixToStringLE(T prefix, uint64_t n = sizeof(T)) {
527 DCHECK_LE(n, sizeof(T));
528 prefix = Endian::little(prefix);
531 memcpy(&result[0], &prefix, n);
536 #if FOLLY_HAVE_LIBLZ4
// Codec subclass (final) for the LZ4 block format. encodeSize() reports
// whether the codec type is CodecType::LZ4_VARINT_SIZE; highCompression_
// stores the compression mode chosen by the constructor.
// NOTE(review): access specifiers, one parameter line and the end of the class
// are not visible in this extract.
541 class LZ4Codec final : public Codec {
543 static std::unique_ptr<Codec> create(int level, CodecType type);
544 explicit LZ4Codec(int level, CodecType type);
547 bool doNeedsUncompressedLength() const override;
548 uint64_t doMaxUncompressedLength() const override;
549 uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const override;
551 bool encodeSize() const { return type() == CodecType::LZ4_VARINT_SIZE; }
553 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
554 std::unique_ptr<IOBuf> doUncompress(
556 Optional<uint64_t> uncompressedLength) override;
558 bool highCompression_;
// Factory: builds an LZ4Codec with std::make_unique and returns it as a
// std::unique_ptr<Codec>.
561 std::unique_ptr<Codec> LZ4Codec::create(int level, CodecType type) {
562 return std::make_unique<LZ4Codec>(level, type);
// Constructor. DCHECKs that `type` is LZ4 or LZ4_VARINT_SIZE, handles the
// symbolic compression levels in a switch, rejects any resulting level
// outside [1, 2] with std::invalid_argument, and enables high-compression mode
// when the level is greater than 1.
// NOTE(review): the switch header and the numeric values assigned for the
// symbolic levels are not visible in this extract.
565 LZ4Codec::LZ4Codec(int level, CodecType type) : Codec(type) {
566 DCHECK(type == CodecType::LZ4 || type == CodecType::LZ4_VARINT_SIZE);
569 case COMPRESSION_LEVEL_FASTEST:
570 case COMPRESSION_LEVEL_DEFAULT:
573 case COMPRESSION_LEVEL_BEST:
577 if (level < 1 || level > 2) {
578 throw std::invalid_argument(to<std::string>(
579 "LZ4Codec: invalid level: ", level));
581 highCompression_ = (level > 1);
// The caller must supply the uncompressed length unless this codec encodes
// the size itself (encodeSize()).
584 bool LZ4Codec::doNeedsUncompressedLength() const {
585 return !encodeSize();
// Fallback definition, used only when the lz4 headers do not already provide
// LZ4_MAX_INPUT_SIZE.
588 // The value comes from lz4.h in lz4-r117, but older versions of lz4 don't
589 // define LZ4_MAX_INPUT_SIZE (even though the max size is the same), so do it
591 #ifndef LZ4_MAX_INPUT_SIZE
592 # define LZ4_MAX_INPUT_SIZE 0x7E000000
// Largest input this codec accepts: LZ4_MAX_INPUT_SIZE.
595 uint64_t LZ4Codec::doMaxUncompressedLength() const {
596 return LZ4_MAX_INPUT_SIZE;
// Upper bound on the compressed size: LZ4_compressBound() for the payload,
// plus kMaxVarintLength64 bytes when the size is encoded as a varint prefix.
// NOTE(review): LZ4_compressBound() takes an `int` in lz4.h, so the uint64_t
// argument is narrowed implicitly -- confirm that callers never pass values
// above INT_MAX.
599 uint64_t LZ4Codec::doMaxCompressedLength(uint64_t uncompressedLength) const {
600 return LZ4_compressBound(uncompressedLength) +
601 (encodeSize() ? kMaxVarintLength64 : 0);
// Compresses `data` with the LZ4 block API.
// A chained input is first coalesced into `clone`. The output IOBuf is sized
// with maxCompressedLength(); a varint with the input length is written first
// (presumably only when encodeSize() is true -- confirm). With lz4 >= 1.7.0
// (LZ4_VERSION_NUMBER >= 10700) the bounded LZ4_compress_HC /
// LZ4_compress_default entry points are used, otherwise the legacy
// LZ4_compressHC / LZ4_compress; highCompression_ selects the HC variant.
// The result size is CHECKed against the output capacity.
// NOTE(review): the declarations of `clone` and `n`, the re-pointing of
// `data` after coalescing, the guard around the varint write, the
// `else`/`#else`/`#endif` lines, and the final append/return are not visible
// in this extract.
604 std::unique_ptr<IOBuf> LZ4Codec::doCompress(const IOBuf* data) {
606 if (data->isChained()) {
607 // LZ4 doesn't support streaming, so we have to coalesce
608 clone = data->cloneCoalescedAsValue();
612 auto out = IOBuf::create(maxCompressedLength(data->length()));
614 encodeVarintToIOBuf(data->length(), out.get());
618 auto input = reinterpret_cast<const char*>(data->data());
619 auto output = reinterpret_cast<char*>(out->writableTail());
620 const auto inputLength = data->length();
621 #if LZ4_VERSION_NUMBER >= 10700
622 if (highCompression_) {
623 n = LZ4_compress_HC(input, output, inputLength, out->tailroom(), 0);
625 n = LZ4_compress_default(input, output, inputLength, out->tailroom());
628 if (highCompression_) {
629 n = LZ4_compressHC(input, output, inputLength);
631 n = LZ4_compress(input, output, inputLength);
636 CHECK_LE(n, out->capacity());
// Uncompresses `data` with LZ4_decompress_safe().
// A chained input is first coalesced into `clone`. The expected output size
// comes either from a varint read off the front of the data (it must match
// any caller-supplied length, else std::runtime_error) or from the
// caller-supplied length, which the DCHECKs require to be present and within
// maxUncompressedLength(). The remaining bytes at the cursor are decompressed
// into a freshly allocated IOBuf of exactly that size; a negative return or a
// size mismatch throws std::runtime_error.
// NOTE(review): the `data` parameter line, the declaration of `clone`, the
// branch selecting between the two length sources (presumably encodeSize() --
// confirm), two LZ4_decompress_safe() argument lines and the return are not
// visible in this extract.
642 std::unique_ptr<IOBuf> LZ4Codec::doUncompress(
644 Optional<uint64_t> uncompressedLength) {
646 if (data->isChained()) {
647 // LZ4 doesn't support streaming, so we have to coalesce
648 clone = data->cloneCoalescedAsValue();
652 folly::io::Cursor cursor(data);
653 uint64_t actualUncompressedLength;
655 actualUncompressedLength = decodeVarintFromCursor(cursor);
656 if (uncompressedLength && *uncompressedLength != actualUncompressedLength) {
657 throw std::runtime_error("LZ4Codec: invalid uncompressed length");
661 DCHECK(uncompressedLength.hasValue());
662 DCHECK(*uncompressedLength <= maxUncompressedLength());
663 actualUncompressedLength = *uncompressedLength;
666 auto sp = StringPiece{cursor.peekBytes()};
667 auto out = IOBuf::create(actualUncompressedLength);
668 int n = LZ4_decompress_safe(
670 reinterpret_cast<char*>(out->writableTail()),
672 actualUncompressedLength);
674 if (n < 0 || uint64_t(n) != actualUncompressedLength) {
675 throw std::runtime_error(to<std::string>(
676 "LZ4 decompression returned invalid value ", n));
678 out->append(actualUncompressedLength);
682 #if LZ4_VERSION_NUMBER >= 10301
// Codec subclass (final) for the LZ4 frame format. Holds an
// LZ4F_decompressionContext_t (dctx_, initially nullptr); the non-default
// destructor and the reset helper manage that context.
// NOTE(review): access specifiers, the resetDCtx() declaration, the remaining
// data members (level_ and dirty_ are used by the member definitions) and the
// end of the class are not visible in this extract.
684 class LZ4FrameCodec final : public Codec {
686 static std::unique_ptr<Codec> create(int level, CodecType type);
687 explicit LZ4FrameCodec(int level, CodecType type);
688 ~LZ4FrameCodec() override;
690 std::vector<std::string> validPrefixes() const override;
691 bool canUncompress(const IOBuf* data, Optional<uint64_t> uncompressedLength)
695 uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const override;
697 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
698 std::unique_ptr<IOBuf> doUncompress(
700 Optional<uint64_t> uncompressedLength) override;
702 // Reset the dctx_ if it is dirty or null.
706 LZ4F_decompressionContext_t dctx_{nullptr};
// Factory: builds an LZ4FrameCodec with std::make_unique and returns it as a
// std::unique_ptr<Codec>.
// NOTE(review): the parameter lines are not visible in this extract.
710 /* static */ std::unique_ptr<Codec> LZ4FrameCodec::create(
713 return std::make_unique<LZ4FrameCodec>(level, type);
// Magic number identifying an LZ4 frame; validPrefixes() and canUncompress()
// pass it to the little-endian prefix helpers.
716 static constexpr uint32_t kLZ4FrameMagicLE = 0x184D2204;
// The only valid prefix is the frame magic, serialized by prefixToStringLE().
718 std::vector<std::string> LZ4FrameCodec::validPrefixes() const {
719 return {prefixToStringLE(kLZ4FrameMagicLE)};
// Reports whether `data` begins with the LZ4 frame magic. The optional length
// argument is unnamed and unused.
722 bool LZ4FrameCodec::canUncompress(const IOBuf* data, Optional<uint64_t>) const {
723 return dataStartsWithLE(data, kLZ4FrameMagicLE);
// Upper bound on the compressed frame size. Asks LZ4F_compressFrameBound()
// using value-initialized preferences with this codec's compression level and
// the content size filled in -- the same two fields doCompress() sets.
726 uint64_t LZ4FrameCodec::doMaxCompressedLength(
727 uint64_t uncompressedLength) const {
728 LZ4F_preferences_t prefs{};
729 prefs.compressionLevel = level_;
730 prefs.frameInfo.contentSize = uncompressedLength;
731 return LZ4F_compressFrameBound(uncompressedLength, &prefs);
// Converts an LZ4F error code into a std::runtime_error whose message carries
// LZ4F_getErrorName(code).
// NOTE(review): the non-error return is not visible in this extract
// (callers use the returned value, so presumably `code` -- confirm).
734 static size_t lz4FrameThrowOnError(size_t code) {
735 if (LZ4F_isError(code)) {
736 throw std::runtime_error(
737 to<std::string>("LZ4Frame error: ", LZ4F_getErrorName(code)));
// Ensures dctx_ is a usable decompression context. An existing, clean
// context is special-cased (presumably kept as-is -- confirm); otherwise the
// old context is freed and a new one is created, with creation errors
// surfaced as exceptions by lz4FrameThrowOnError().
// NOTE(review): the early return, the guard around the free, and any update
// of dirty_ are not visible in this extract. The literal 100 is the version
// argument of LZ4F_createDecompressionContext -- presumably LZ4F_VERSION;
// confirm.
742 void LZ4FrameCodec::resetDCtx() {
743 if (dctx_ && !dirty_) {
747 LZ4F_freeDecompressionContext(dctx_);
749 lz4FrameThrowOnError(LZ4F_createDecompressionContext(&dctx_, 100));
// Constructor. DCHECKs that `type` is CodecType::LZ4_FRAME and handles the
// symbolic compression levels in a switch.
// NOTE(review): the switch header, the values chosen for each level and the
// rest of the body are not visible in this extract.
753 LZ4FrameCodec::LZ4FrameCodec(int level, CodecType type) : Codec(type) {
754 DCHECK(type == CodecType::LZ4_FRAME);
756 case COMPRESSION_LEVEL_FASTEST:
757 case COMPRESSION_LEVEL_DEFAULT:
760 case COMPRESSION_LEVEL_BEST:
// Destructor: frees the decompression context.
// NOTE(review): any guard around the free is not visible in this extract.
769 LZ4FrameCodec::~LZ4FrameCodec() {
771 LZ4F_freeDecompressionContext(dctx_);
// Compresses `data` into a single LZ4 frame with LZ4F_compressFrame().
// A chained input is coalesced first. Preferences are value-initialized and
// then given this codec's compression level and the content size. The output
// IOBuf is sized with maxCompressedLength(), and its length is extended by
// the number of bytes LZ4F_compressFrame() reports as written (errors are
// thrown by lz4FrameThrowOnError()).
// NOTE(review): the declaration of `clone`, the re-pointing of `data`, the
// LZ4F_compressFrame() argument lines and the return are not visible in this
// extract.
775 std::unique_ptr<IOBuf> LZ4FrameCodec::doCompress(const IOBuf* data) {
776 // LZ4 Frame compression doesn't support streaming so we have to coalesce
778 if (data->isChained()) {
779 clone = data->cloneCoalescedAsValue();
783 const auto uncompressedLength = data->length();
784 LZ4F_preferences_t prefs{};
785 prefs.compressionLevel = level_;
786 prefs.frameInfo.contentSize = uncompressedLength;
788 auto buf = IOBuf::create(maxCompressedLength(uncompressedLength));
789 const size_t written = lz4FrameThrowOnError(LZ4F_compressFrame(
795 buf->append(written);
// Uncompresses one LZ4 frame with LZ4F_decompress(), collecting the output in
// an IOBufQueue.
// The input is taken from the first buffer, or from a coalesced clone when
// `data` is chained. With a known uncompressedLength the queue preallocates
// that size in one chunk (capped at 64 MB) and block/growth sizes are capped
// by it; otherwise growthSize is reduced to 4 * max(blockSize, input size).
// Each LZ4F_decompress() call is given at least blockSize bytes of output
// space; an "Incomplete frame" std::runtime_error is thrown when no input is
// left, no output was produced, and the frame is still unfinished
// (code != 0). After decompression the total output size is checked against
// uncompressedLength when one was supplied.
// NOTE(review): `options` is default-initialized and only stableDst is
// assigned, so its remaining fields are indeterminate; lz4frame.h documents
// the reserved fields as "must be set to zero" -- consider `options{}`.
// NOTE(review): the `data` parameter line, the context reset, the declaration
// of `clone`, the declarations of `code`/`out`/`outSize`, the loop header and
// condition, the dirty-flag updates and the return are not visible in this
// extract.
799 std::unique_ptr<IOBuf> LZ4FrameCodec::doUncompress(
801 Optional<uint64_t> uncompressedLength) {
802 // Reset the dctx if any errors have occurred
805 ByteRange in = *data->begin();
807 if (data->isChained()) {
808 clone = data->cloneCoalescedAsValue();
809 in = clone.coalesce();
812 // Select decompression options
813 LZ4F_decompressOptions_t options;
814 options.stableDst = 1;
815 // Select blockSize and growthSize for the IOBufQueue
816 IOBufQueue queue(IOBufQueue::cacheChainLength());
817 auto blockSize = uint64_t{64} << 10;
818 auto growthSize = uint64_t{4} << 20;
819 if (uncompressedLength) {
820 // Allocate uncompressedLength in one chunk (up to 64 MB)
821 const auto allocateSize = std::min(*uncompressedLength, uint64_t{64} << 20);
822 queue.preallocate(allocateSize, allocateSize);
823 blockSize = std::min(*uncompressedLength, blockSize);
824 growthSize = std::min(*uncompressedLength, growthSize);
826 // Reduce growthSize for small data
827 const auto guessUncompressedLen =
828 4 * std::max<uint64_t>(blockSize, in.size());
829 growthSize = std::min(guessUncompressedLen, growthSize);
831 // Once LZ4_decompress() is called, the dctx_ cannot be reused until it
834 // Decompress until the frame is over
837 // Allocate enough space to decompress at least a block
840 std::tie(out, outSize) = queue.preallocate(blockSize, growthSize);
842 size_t inSize = in.size();
843 code = lz4FrameThrowOnError(
844 LZ4F_decompress(dctx_, out, &outSize, in.data(), &inSize, &options));
845 if (in.empty() && outSize == 0 && code != 0) {
846 // We passed no input, no output was produced, and the frame isn't over
847 // No more forward progress is possible
848 throw std::runtime_error("LZ4Frame error: Incomplete frame");
850 in.uncheckedAdvance(inSize);
851 queue.postallocate(outSize);
853 // At this point the decompression context can be reused
855 if (uncompressedLength && queue.chainLength() != *uncompressedLength) {
856 throw std::runtime_error("LZ4Frame error: Invalid uncompressedLength");
861 #endif // LZ4_VERSION_NUMBER >= 10301
862 #endif // FOLLY_HAVE_LIBLZ4
864 #if FOLLY_HAVE_LIBSNAPPY
// IOBufSnappySource: snappy::Source subclass (final) constructed from an
// IOBuf pointer; overrides Available(), Peek() and Skip().
// NOTE(review): the comment delimiters, access specifiers, data members
// (available_ and cursor_ are used by the member definitions) and the end of
// the class are not visible in this extract.
871 * Implementation of snappy::Source that reads from a IOBuf chain.
873 class IOBufSnappySource final : public snappy::Source {
875 explicit IOBufSnappySource(const IOBuf* data);
876 size_t Available() const override;
877 const char* Peek(size_t* len) override;
878 void Skip(size_t n) override;
// Constructor: initializes available_ with the total data length of the
// chain.
// NOTE(review): the rest of the initializer list and the body are not visible
// in this extract.
884 IOBufSnappySource::IOBufSnappySource(const IOBuf* data)
885 : available_(data->computeChainDataLength()),
// snappy::Source::Available() override.
// NOTE(review): the body is not visible in this extract (presumably returns
// available_ -- confirm).
889 size_t IOBufSnappySource::Available() const {
// snappy::Source::Peek() override: views the bytes currently available at the
// cursor as a StringPiece.
// NOTE(review): how `*len` is set and what is returned are not visible in
// this extract.
893 const char* IOBufSnappySource::Peek(size_t* len) {
894 auto sp = StringPiece{cursor_.peekBytes()};
// snappy::Source::Skip() override: CHECKs that no more than available_ bytes
// are skipped.
// NOTE(review): the statements that advance the source are not visible in
// this extract.
899 void IOBufSnappySource::Skip(size_t n) {
900 CHECK_LE(n, available_);
// Codec subclass (final) for Snappy. Declares a static create() factory, an
// explicit (level, type) constructor, and overrides of
// doMaxUncompressedLength(), doMaxCompressedLength(), doCompress() and
// doUncompress().
// NOTE(review): access specifiers, one parameter line and the end of the class
// are not visible in this extract.
905 class SnappyCodec final : public Codec {
907 static std::unique_ptr<Codec> create(int level, CodecType type);
908 explicit SnappyCodec(int level, CodecType type);
911 uint64_t doMaxUncompressedLength() const override;
912 uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const override;
913 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
914 std::unique_ptr<IOBuf> doUncompress(
916 Optional<uint64_t> uncompressedLength) override;
// Factory: builds a SnappyCodec with std::make_unique and returns it as a
// std::unique_ptr<Codec>.
919 std::unique_ptr<Codec> SnappyCodec::create(int level, CodecType type) {
920 return std::make_unique<SnappyCodec>(level, type);
// Constructor. DCHECKs that `type` is CodecType::SNAPPY, lists the three
// symbolic compression levels as switch cases, and throws
// std::invalid_argument ("invalid level") for a rejected level.
// NOTE(review): the switch header, what the symbolic levels map to, and the
// condition guarding the throw are not visible in this extract.
923 SnappyCodec::SnappyCodec(int level, CodecType type) : Codec(type) {
924 DCHECK(type == CodecType::SNAPPY);
926 case COMPRESSION_LEVEL_FASTEST:
927 case COMPRESSION_LEVEL_DEFAULT:
928 case COMPRESSION_LEVEL_BEST:
932 throw std::invalid_argument(to<std::string>(
933 "SnappyCodec: invalid level: ", level));
// Largest input this codec accepts: the maximum value of uint32_t.
937 uint64_t SnappyCodec::doMaxUncompressedLength() const {
938 // snappy.h uses uint32_t for lengths, so there's that.
939 return std::numeric_limits<uint32_t>::max();
// Upper bound on the compressed size, as reported by
// snappy::MaxCompressedLength().
942 uint64_t SnappyCodec::doMaxCompressedLength(uint64_t uncompressedLength) const {
943 return snappy::MaxCompressedLength(uncompressedLength);
// Compresses the IOBuf chain with snappy::Compress(), reading through an
// IOBufSnappySource and writing straight into the tail of an output IOBuf
// sized with maxCompressedLength(). The sink is an UncheckedByteArraySink;
// the number of bytes produced is CHECKed against the output capacity
// afterwards.
// NOTE(review): the final append/return are not visible in this extract.
946 std::unique_ptr<IOBuf> SnappyCodec::doCompress(const IOBuf* data) {
947 IOBufSnappySource source(data);
948 auto out = IOBuf::create(maxCompressedLength(source.Available()));
950 snappy::UncheckedByteArraySink sink(reinterpret_cast<char*>(
951 out->writableTail()));
953 size_t n = snappy::Compress(&source, &sink);
955 CHECK_LE(n, out->capacity());
// Uncompresses the IOBuf chain with snappy.
// First reads the uncompressed length from the data with
// snappy::GetUncompressedLength() (std::runtime_error on failure, or when it
// disagrees with a caller-supplied length). Then allocates an IOBuf of that
// size and runs snappy::RawUncompress() into its tail (std::runtime_error on
// failure) before extending the buffer's length. A separate
// IOBufSnappySource is constructed for each of the two passes.
// NOTE(review): the `data` parameter line, the scoping braces around the two
// sources and the return are not visible in this extract.
960 std::unique_ptr<IOBuf> SnappyCodec::doUncompress(
962 Optional<uint64_t> uncompressedLength) {
963 uint32_t actualUncompressedLength = 0;
966 IOBufSnappySource source(data);
967 if (!snappy::GetUncompressedLength(&source, &actualUncompressedLength)) {
968 throw std::runtime_error("snappy::GetUncompressedLength failed");
970 if (uncompressedLength && *uncompressedLength != actualUncompressedLength) {
971 throw std::runtime_error("snappy: invalid uncompressed length");
975 auto out = IOBuf::create(actualUncompressedLength);
978 IOBufSnappySource source(data);
979 if (!snappy::RawUncompress(&source,
980 reinterpret_cast<char*>(out->writableTail()))) {
981 throw std::runtime_error("snappy::RawUncompress failed");
985 out->append(actualUncompressedLength);
989 #endif // FOLLY_HAVE_LIBSNAPPY
// Codec subclass (final) for the zlib and gzip formats. Besides the Codec
// overrides it declares two helpers that operate on a z_stream:
// addOutputBuffer() and doInflate().
// NOTE(review): access specifiers, one parameter line, the data members
// and the end of the class are not visible in this extract.
995 class ZlibCodec final : public Codec {
997 static std::unique_ptr<Codec> create(int level, CodecType type);
998 explicit ZlibCodec(int level, CodecType type);
1000 std::vector<std::string> validPrefixes() const override;
1001 bool canUncompress(const IOBuf* data, Optional<uint64_t> uncompressedLength)
1005 uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const override;
1006 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
1007 std::unique_ptr<IOBuf> doUncompress(
1009 Optional<uint64_t> uncompressedLength) override;
1011 std::unique_ptr<IOBuf> addOutputBuffer(z_stream* stream, uint32_t length);
1012 bool doInflate(z_stream* stream, IOBuf* head, uint32_t bufferLength);
// Two-byte gzip magic number; validPrefixes() and canUncompress() pass it to
// the little-endian prefix helpers.
1017 static constexpr uint16_t kGZIPMagicLE = 0x8B1F;
// Returns the byte prefixes that identify data this codec can decode.
// For CodecType::ZLIB every valid 2-byte zlib header is enumerated: 16 CMF
// values with CM == 0x8, times 8 settings of the upper three FLG bits, each
// completed with the FCHECK value that makes the big-endian 16-bit header a
// multiple of 31. When the computed FCHECK is 31, the header with FCHECK 0 is
// also valid and is added as well. For the other codec type the single gzip
// magic prefix is returned.
// NOTE(review): a few comment lines, the statement adjusting `prefix` inside
// the FCHECK == 31 branch, and the return of `result` are not visible in this
// extract.
1019 std::vector<std::string> ZlibCodec::validPrefixes() const {
1020 if (type() == CodecType::ZLIB) {
1021 // Zlib streams start with a 2 byte header.
1028 // We won't restrict the values of any sub-fields except as described below.
1030 // The lowest 4 bits of CMF is the compression method (CM).
1031 // CM == 0x8 is the deflate compression method, which is currently the only
1032 // supported compression method, so any valid prefix must have CM == 0x8.
1034 // The lowest 5 bits of FLG is FCHECK.
1035 // FCHECK must be such that the two header bytes are a multiple of 31 when
1036 // interpreted as a big endian 16-bit number.
1037 std::vector<std::string> result;
1038 // 16 values for the first byte, 8 values for the second byte.
1039 // There are also 4 combinations where both 0x00 and 0x1F work as FCHECK.
1040 result.reserve(132);
1041 // Select all values for the CMF byte that use the deflate algorithm 0x8.
1042 for (uint32_t first = 0x0800; first <= 0xF800; first += 0x1000) {
1043 // Select all values for the FLG, but leave FCHECK as 0 since it's fixed.
1044 for (uint32_t second = 0x00; second <= 0xE0; second += 0x20) {
1045 uint16_t prefix = first | second;
1047 prefix += 31 - (prefix % 31);
1048 result.push_back(prefixToStringLE(Endian::big(prefix)));
1049 // zlib won't produce this, but it is a valid prefix.
1050 if ((prefix & 0x1F) == 31) {
1052 result.push_back(prefixToStringLE(Endian::big(prefix)));
1058 // The gzip frame starts with 2 magic bytes.
1059 return {prefixToStringLE(kGZIPMagicLE)};
// Reports whether `data` looks like something this codec can decode.
// For CodecType::ZLIB the first two bytes are read big-endian and must have
// CM == 8 (deflate) and be a multiple of 31; otherwise the data must start
// with the gzip magic. The optional length argument is unnamed and unused.
// NOTE(review): the declaration of `value` and the return taken when fewer
// than two bytes are available are not visible in this extract.
1063 bool ZlibCodec::canUncompress(const IOBuf* data, Optional<uint64_t>) const {
1064 if (type() == CodecType::ZLIB) {
1066 Cursor cursor{data};
1067 if (!cursor.tryReadBE(value)) {
1070 // zlib compressed if using deflate and is a multiple of 31.
1071 return (value & 0x0F00) == 0x0800 && value % 31 == 0;
1073 return dataStartsWithLE(data, kGZIPMagicLE);
// Upper bound on the compressed size, from deflateBound() called with a null
// stream (no stream is initialized at this point).
1077 uint64_t ZlibCodec::doMaxCompressedLength(uint64_t uncompressedLength) const {
1078 return deflateBound(nullptr, uncompressedLength);
// Factory: builds a ZlibCodec with std::make_unique and returns it as a
// std::unique_ptr<Codec>.
1081 std::unique_ptr<Codec> ZlibCodec::create(int level, CodecType type) {
1082 return std::make_unique<ZlibCodec>(level, type);
// Constructor. DCHECKs that `type` is ZLIB or GZIP, translates the symbolic
// compression levels (COMPRESSION_LEVEL_DEFAULT becomes
// Z_DEFAULT_COMPRESSION), and rejects any level that is neither
// Z_DEFAULT_COMPRESSION nor within [0, 9] with std::invalid_argument.
// NOTE(review): the switch header, the values chosen for FASTEST and BEST, and
// the statements after validation are not visible in this extract.
1085 ZlibCodec::ZlibCodec(int level, CodecType type) : Codec(type) {
1086 DCHECK(type == CodecType::ZLIB || type == CodecType::GZIP);
1088 case COMPRESSION_LEVEL_FASTEST:
1091 case COMPRESSION_LEVEL_DEFAULT:
1092 level = Z_DEFAULT_COMPRESSION;
1094 case COMPRESSION_LEVEL_BEST:
1098 if (level != Z_DEFAULT_COMPRESSION && (level < 0 || level > 9)) {
1099 throw std::invalid_argument(to<std::string>(
1100 "ZlibCodec: invalid level: ", level));
// Allocates a new output IOBuf of `length` bytes, extends its length to its
// full capacity, and points the stream's next_out / avail_out at it. Must
// only be called when the stream has no output space left (CHECK).
// NOTE(review): the `length` parameter line and the return statement are not
// visible in this extract.
1105 std::unique_ptr<IOBuf> ZlibCodec::addOutputBuffer(z_stream* stream,
1107 CHECK_EQ(stream->avail_out, 0);
1109 auto buf = IOBuf::create(length);
1110 buf->append(buf->capacity());
1112 stream->next_out = buf->writableData();
1113 stream->avail_out = buf->length();
// Runs one inflate(Z_NO_FLUSH) step, first chaining a new output buffer of
// `bufferLength` bytes onto `head` when the stream has no output space left.
// Some return codes are reported as std::runtime_error with zlib's message;
// a remaining case hits CHECK(false).
// NOTE(review): the `head` parameter line, the switch over `rc` (which codes
// map to which outcome) and the return values are not visible in this
// extract.
1118 bool ZlibCodec::doInflate(z_stream* stream,
1120 uint32_t bufferLength) {
1121 if (stream->avail_out == 0) {
1122 head->prependChain(addOutputBuffer(stream, bufferLength));
1125 int rc = inflate(stream, Z_NO_FLUSH);
1136 throw std::runtime_error(to<std::string>(
1137 "ZlibCodec: inflate error: ", rc, ": ", stream->msg));
1139 CHECK(false) << rc << ": " << stream->msg;
// Compresses the IOBuf chain with zlib's deflate.
// The stream is initialized through deflateInit2() with windowBits 15, plus
// 16 for CodecType::GZIP so that a gzip wrapper is written; initialization
// failure throws std::runtime_error. deflateEnd() is invoked during cleanup
// and its result CHECKed, tolerating Z_DATA_ERROR only when `success` was
// never set. The first output buffer is sized with deflateBound() when that
// is at most 64 MiB, otherwise 4 MiB. Each input range is fed in steps of at
// most 64 MiB with Z_NO_FLUSH, adding 4 MiB output buffers as needed; the
// stream is then finished with Z_FINISH until deflate() stops returning Z_OK,
// and the unused tail of the last output buffer is trimmed.
// NOTE(review): the z_stream declaration, several deflateInit2() arguments,
// the cleanup construct wrapping deflateEnd(), the inner-loop bookkeeping of
// `remaining`/`written` and the return are not visible in this extract.
1145 std::unique_ptr<IOBuf> ZlibCodec::doCompress(const IOBuf* data) {
1147 stream.zalloc = nullptr;
1148 stream.zfree = nullptr;
1149 stream.opaque = nullptr;
1151 // Using deflateInit2() to support gzip. "The windowBits parameter is the
1152 // base two logarithm of the maximum window size (...) The default value is
1153 // 15 (...) Add 16 to windowBits to write a simple gzip header and trailer
1154 // around the compressed data instead of a zlib wrapper. The gzip header
1155 // will have no file name, no extra data, no comment, no modification time
1156 // (set to zero), no header crc, and the operating system will be set to 255
1158 int windowBits = 15 + (type() == CodecType::GZIP ? 16 : 0);
1159 // All other parameters (method, memLevel, strategy) get default values from
1161 int rc = deflateInit2(&stream,
1166 Z_DEFAULT_STRATEGY);
1168 throw std::runtime_error(to<std::string>(
1169 "ZlibCodec: deflateInit error: ", rc, ": ", stream.msg));
1172 stream.next_in = stream.next_out = nullptr;
1173 stream.avail_in = stream.avail_out = 0;
1174 stream.total_in = stream.total_out = 0;
1176 bool success = false;
1179 rc = deflateEnd(&stream);
1180 // If we're here because of an exception, it's okay if some data
1182 CHECK(rc == Z_OK || (!success && rc == Z_DATA_ERROR))
1183 << rc << ": " << stream.msg;
1186 uint64_t uncompressedLength = data->computeChainDataLength();
1187 uint64_t maxCompressedLength = deflateBound(&stream, uncompressedLength);
1189 // Max 64MiB in one go
1190 constexpr uint32_t maxSingleStepLength = uint32_t(64) << 20; // 64MiB
1191 constexpr uint32_t defaultBufferLength = uint32_t(4) << 20; // 4MiB
1193 auto out = addOutputBuffer(
1195 (maxCompressedLength <= maxSingleStepLength ?
1196 maxCompressedLength :
1197 defaultBufferLength));
1199 for (auto& range : *data) {
1200 uint64_t remaining = range.size();
1201 uint64_t written = 0;
1203 uint32_t step = (remaining > maxSingleStepLength ?
1204 maxSingleStepLength : remaining);
1205 stream.next_in = const_cast<uint8_t*>(range.data() + written);
1206 stream.avail_in = step;
1210 while (stream.avail_in != 0) {
1211 if (stream.avail_out == 0) {
1212 out->prependChain(addOutputBuffer(&stream, defaultBufferLength));
1215 rc = deflate(&stream, Z_NO_FLUSH);
1217 CHECK_EQ(rc, Z_OK) << stream.msg;
1223 if (stream.avail_out == 0) {
1224 out->prependChain(addOutputBuffer(&stream, defaultBufferLength));
1227 rc = deflate(&stream, Z_FINISH);
1228 } while (rc == Z_OK);
1230 CHECK_EQ(rc, Z_STREAM_END) << stream.msg;
1232 out->prev()->trimEnd(stream.avail_out);
1234 success = true; // we survived
// Uncompresses the IOBuf chain with zlib's inflate.
// The stream is initialized through inflateInit2() with windowBits 15, plus
// 16 for CodecType::GZIP so that only the gzip format is accepted;
// initialization failure throws std::runtime_error. inflateEnd() is invoked
// during cleanup and its result CHECKed, tolerating Z_DATA_ERROR only when
// `success` was never set. The first output buffer uses the expected length
// when it is known and at most 64 MiB, otherwise computeBufferLength() with a
// 32 KiB block size. Each non-empty input range is passed to doInflate();
// input remaining after the stream ended throws "junk after end of data".
// doInflate() keeps being called until the stream reports its end, the unused
// output tail is trimmed, and a mismatch between total_out and the expected
// length throws std::runtime_error.
// NOTE(review): the `data` parameter line, the z_stream declaration, the
// cleanup construct wrapping inflateEnd(), the handling of empty ranges, the
// condition guarding the "junk" throw and the return are not visible in this
// extract.
1239 std::unique_ptr<IOBuf> ZlibCodec::doUncompress(
1241 Optional<uint64_t> uncompressedLength) {
1243 stream.zalloc = nullptr;
1244 stream.zfree = nullptr;
1245 stream.opaque = nullptr;
1247 // "The windowBits parameter is the base two logarithm of the maximum window
1248 // size (...) The default value is 15 (...) add 16 to decode only the gzip
1249 // format (the zlib format will return a Z_DATA_ERROR)."
1250 int windowBits = 15 + (type() == CodecType::GZIP ? 16 : 0);
1251 int rc = inflateInit2(&stream, windowBits);
1253 throw std::runtime_error(to<std::string>(
1254 "ZlibCodec: inflateInit error: ", rc, ": ", stream.msg));
1257 stream.next_in = stream.next_out = nullptr;
1258 stream.avail_in = stream.avail_out = 0;
1259 stream.total_in = stream.total_out = 0;
1261 bool success = false;
1264 rc = inflateEnd(&stream);
1265 // If we're here because of an exception, it's okay if some data
1267 CHECK(rc == Z_OK || (!success && rc == Z_DATA_ERROR))
1268 << rc << ": " << stream.msg;
1271 // Max 64MiB in one go
1272 constexpr uint64_t maxSingleStepLength = uint64_t(64) << 20; // 64MiB
1273 constexpr uint64_t kBlockSize = uint64_t(32) << 10; // 32 KiB
1274 const uint64_t defaultBufferLength =
1275 computeBufferLength(data->computeChainDataLength(), kBlockSize);
1277 auto out = addOutputBuffer(
1279 ((uncompressedLength && *uncompressedLength <= maxSingleStepLength)
1280 ? *uncompressedLength
1281 : defaultBufferLength));
1283 bool streamEnd = false;
1284 for (auto& range : *data) {
1285 if (range.empty()) {
1289 stream.next_in = const_cast<uint8_t*>(range.data());
1290 stream.avail_in = range.size();
1292 while (stream.avail_in != 0) {
1294 throw std::runtime_error(to<std::string>(
1295 "ZlibCodec: junk after end of data"));
1298 streamEnd = doInflate(&stream, out.get(), defaultBufferLength);
1302 while (!streamEnd) {
1303 streamEnd = doInflate(&stream, out.get(), defaultBufferLength);
1306 out->prev()->trimEnd(stream.avail_out);
1308 if (uncompressedLength && *uncompressedLength != stream.total_out) {
1309 throw std::runtime_error(
1310 to<std::string>("ZlibCodec: invalid uncompressed length"));
1313 success = true; // we survived
1318 #endif // FOLLY_HAVE_LIBZ
1320 #if FOLLY_HAVE_LIBLZMA
// Codec backed by liblzma. Handles two CodecTypes: LZMA2 and
// LZMA2_VARINT_SIZE; the latter additionally stores the uncompressed length
// as a varint (see encodeSize()).
// NOTE(review): access specifiers, some continuation lines and the data
// members are not visible in this listing.
1325 class LZMA2Codec final : public Codec {
// Factory with the (level, type) signature used by the codec table.
1327 static std::unique_ptr<Codec> create(int level, CodecType type);
1328 explicit LZMA2Codec(int level, CodecType type);
// Prefix-based detection hooks.
1330 std::vector<std::string> validPrefixes() const override;
1331 bool canUncompress(const IOBuf* data, Optional<uint64_t> uncompressedLength)
// Codec capability hooks.
1335 bool doNeedsUncompressedLength() const override;
1336 uint64_t doMaxUncompressedLength() const override;
1337 uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const override;
// True when the uncompressed size is encoded as a varint prefix.
1339 bool encodeSize() const { return type() == CodecType::LZMA2_VARINT_SIZE; }
1341 std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override;
1342 std::unique_ptr<IOBuf> doUncompress(
1344 Optional<uint64_t> uncompressedLength) override;
// Allocates an output IOBuf of `length` bytes and attaches it to `stream`.
1346 std::unique_ptr<IOBuf> addOutputBuffer(lzma_stream* stream, size_t length);
// Runs one decode step; appends output buffers of `bufferLength` to `head`
// when the stream has no output space left.
1347 bool doInflate(lzma_stream* stream, IOBuf* head, size_t bufferLength);
// Stream magic used for prefix detection, written as a little-endian integer:
// the value 0x005A587A37FD corresponds to the byte sequence
// FD 37 7A 58 5A 00, of which kLZMA2MagicBytes (6) bytes are significant.
1352 static constexpr uint64_t kLZMA2MagicLE = 0x005A587A37FD;
1353 static constexpr unsigned kLZMA2MagicBytes = 6;
// Returns the byte prefixes that identify data this codec can decode.
// For the plain LZMA2 type this is the 6-byte stream magic.
// NOTE(review): the statement inside the LZMA2_VARINT_SIZE branch is not
// visible in this listing; presumably it returns an empty list because the
// varint size prefix precedes the magic — confirm.
1355 std::vector<std::string> LZMA2Codec::validPrefixes() const {
1356 if (type() == CodecType::LZMA2_VARINT_SIZE) {
1359 return {prefixToStringLE(kLZMA2MagicLE, kLZMA2MagicBytes)};
// Reports whether `data` starts with the LZMA2 stream magic. The optional
// uncompressed length is ignored.
// NOTE(review): the statement inside the LZMA2_VARINT_SIZE branch is not
// visible in this listing; presumably it returns false — confirm.
1362 bool LZMA2Codec::canUncompress(const IOBuf* data, Optional<uint64_t>) const {
1363 if (type() == CodecType::LZMA2_VARINT_SIZE) {
1366 // Returns false for all inputs less than 8 bytes.
1367 // This is okay, because no valid LZMA2 streams are less than 8 bytes.
1368 return dataStartsWithLE(data, kLZMA2MagicLE, kLZMA2MagicBytes);
1371 std::unique_ptr<Codec> LZMA2Codec::create(int level, CodecType type) {
1372 return std::make_unique<LZMA2Codec>(level, type);
// Constructs the codec for CodecType::LZMA2 or CodecType::LZMA2_VARINT_SIZE
// (debug-checked). Symbolic levels are translated first; the resulting level
// must lie in [0, 9], otherwise std::invalid_argument is thrown.
// NOTE(review): the switch header, the values assigned for
// COMPRESSION_LEVEL_FASTEST / COMPRESSION_LEVEL_BEST, the `break`s and the
// final store of the level are not visible in this listing.
1375 LZMA2Codec::LZMA2Codec(int level, CodecType type) : Codec(type) {
1376 DCHECK(type == CodecType::LZMA2 || type == CodecType::LZMA2_VARINT_SIZE);
1378 case COMPRESSION_LEVEL_FASTEST:
1381 case COMPRESSION_LEVEL_DEFAULT:
1382 level = LZMA_PRESET_DEFAULT;
1384 case COMPRESSION_LEVEL_BEST:
// Range check applied after the symbolic levels have been mapped.
1388 if (level < 0 || level > 9) {
1389 throw std::invalid_argument(to<std::string>(
1390 "LZMA2Codec: invalid level: ", level));
// Codec hook reporting whether callers must supply the uncompressed length
// to uncompress().
// NOTE(review): the body is not visible in this listing; doUncompress()
// accepts a missing length, so this presumably returns false — confirm.
1395 bool LZMA2Codec::doNeedsUncompressedLength() const {
1399 uint64_t LZMA2Codec::doMaxUncompressedLength() const {
1400 // From lzma/base.h: "Stream is roughly 8 EiB (2^63 bytes)"
1401 return uint64_t(1) << 63;
1404 uint64_t LZMA2Codec::doMaxCompressedLength(uint64_t uncompressedLength) const {
1405 return lzma_stream_buffer_bound(uncompressedLength) +
1406 (encodeSize() ? kMaxVarintLength64 : 0);
1409 std::unique_ptr<IOBuf> LZMA2Codec::addOutputBuffer(
1410 lzma_stream* stream,
1413 CHECK_EQ(stream->avail_out, 0);
1415 auto buf = IOBuf::create(length);
1416 buf->append(buf->capacity());
1418 stream->next_out = buf->writableData();
1419 stream->avail_out = buf->length();
// Compresses an IOBuf chain with liblzma's easy encoder at level_, using
// LZMA_CHECK_NONE as the integrity-check setting. Output is produced into a
// chain of IOBufs; the unused tail of the last buffer is trimmed.
// Throws std::runtime_error if the encoder cannot be initialised or if
// lzma_code reports an error.
// NOTE(review): the shortest lines (declaration of `rc`, the guard around the
// varint prefix, `continue`, `do {`, closing braces, the return) are not
// visible in this listing.
1424 std::unique_ptr<IOBuf> LZMA2Codec::doCompress(const IOBuf* data) {
1426 lzma_stream stream = LZMA_STREAM_INIT;
1428 rc = lzma_easy_encoder(&stream, level_, LZMA_CHECK_NONE);
1429 if (rc != LZMA_OK) {
1430 throw std::runtime_error(folly::to<std::string>(
1431 "LZMA2Codec: lzma_easy_encoder error: ", rc));
// Release encoder state on every exit path.
1434 SCOPE_EXIT { lzma_end(&stream); };
1436 uint64_t uncompressedLength = data->computeChainDataLength();
1437 uint64_t maxCompressedLength = lzma_stream_buffer_bound(uncompressedLength);
1439 // Max 64MiB in one go
1440 constexpr uint32_t maxSingleStepLength = uint32_t(64) << 20; // 64MiB
1441 constexpr uint32_t defaultBufferLength = uint32_t(4) << 20; // 4MiB
// First output buffer: the full worst-case size when it is at most 64MiB,
// otherwise a 4MiB buffer that is extended on demand.
1443 auto out = addOutputBuffer(
1445 (maxCompressedLength <= maxSingleStepLength ?
1446 maxCompressedLength :
1447 defaultBufferLength));
// Varint size prefix: encode the uncompressed length into its own IOBuf and
// make it the head of the output chain. Presumably only done when
// encodeSize() is true (the guard line is not visible here).
1450 auto size = IOBuf::createCombined(kMaxVarintLength64);
1451 encodeVarintToIOBuf(uncompressedLength, size.get());
1452 size->appendChain(std::move(out));
1453 out = std::move(size);
// Feed each non-empty element of the input chain to the encoder.
1456 for (auto& range : *data) {
1457 if (range.empty()) {
1461 stream.next_in = const_cast<uint8_t*>(range.data());
1462 stream.avail_in = range.size();
1464 while (stream.avail_in != 0) {
// Out of output space: chain another buffer at the end.
1465 if (stream.avail_out == 0) {
1466 out->prependChain(addOutputBuffer(&stream, defaultBufferLength));
1469 rc = lzma_code(&stream, LZMA_RUN);
1471 if (rc != LZMA_OK) {
1472 throw std::runtime_error(folly::to<std::string>(
1473 "LZMA2Codec: lzma_code error: ", rc));
// Finish the stream: keep calling lzma_code(LZMA_FINISH) while it returns
// LZMA_OK, adding output buffers as needed.
1479 if (stream.avail_out == 0) {
1480 out->prependChain(addOutputBuffer(&stream, defaultBufferLength));
1483 rc = lzma_code(&stream, LZMA_FINISH);
1484 } while (rc == LZMA_OK);
1486 if (rc != LZMA_STREAM_END) {
1487 throw std::runtime_error(folly::to<std::string>(
1488 "LZMA2Codec: lzma_code ended with error: ", rc));
// Drop the unused tail of the last output buffer.
1491 out->prev()->trimEnd(stream.avail_out);
// Performs one lzma_code(LZMA_RUN) step on `stream`. If the stream has no
// output space, a new buffer of `bufferLength` bytes is first chained onto
// `head`. Throws std::runtime_error for lzma_code results treated as errors.
// NOTE(review): the switch header, the non-error cases and the return
// statements are not visible in this listing; callers treat the result as
// "end of stream reached", so LZMA_STREAM_END presumably returns true.
1496 bool LZMA2Codec::doInflate(lzma_stream* stream,
1498 size_t bufferLength) {
1499 if (stream->avail_out == 0) {
1500 head->prependChain(addOutputBuffer(stream, bufferLength));
1503 lzma_ret rc = lzma_code(stream, LZMA_RUN);
1508 case LZMA_STREAM_END:
1511 throw std::runtime_error(to<std::string>(
1512 "LZMA2Codec: lzma_code error: ", rc));
// Decompresses an IOBuf chain with liblzma's auto decoder (memory limit set
// to the maximum uint64_t value, no flags). Output is produced into a chain
// of IOBufs; the unused tail of the last buffer is trimmed.
// Throws std::runtime_error on decoder initialisation failure, on input
// remaining after the end of the stream, and on uncompressed-length mismatch.
// NOTE(review): the shortest lines (declaration of `rc`, the guard around the
// varint decoding, the `streamEnd` test, closing braces, the return) are not
// visible in this listing.
1518 std::unique_ptr<IOBuf> LZMA2Codec::doUncompress(
1520 Optional<uint64_t> uncompressedLength) {
1522 lzma_stream stream = LZMA_STREAM_INIT;
1524 rc = lzma_auto_decoder(&stream, std::numeric_limits<uint64_t>::max(), 0);
1525 if (rc != LZMA_OK) {
1526 throw std::runtime_error(folly::to<std::string>(
1527 "LZMA2Codec: lzma_auto_decoder error: ", rc));
// Release decoder state on every exit path.
1530 SCOPE_EXIT { lzma_end(&stream); };
1532 // Max 64MiB in one go
1533 constexpr uint32_t maxSingleStepLength = uint32_t(64) << 20; // 64MiB
1534 constexpr uint32_t defaultBufferLength = uint32_t(256) << 10; // 256 KiB
1536 folly::io::Cursor cursor(data);
// Varint size prefix: read the stored length, verify it against the caller's
// value if one was given, and use it from here on. Presumably only done when
// encodeSize() is true (the guard line is not visible here).
1538 const uint64_t actualUncompressedLength = decodeVarintFromCursor(cursor);
1539 if (uncompressedLength && *uncompressedLength != actualUncompressedLength) {
1540 throw std::runtime_error("LZMA2Codec: invalid uncompressed length");
1542 uncompressedLength = actualUncompressedLength;
// First output buffer: the exact expected size when known and at most 64MiB,
// otherwise the 256 KiB default.
1545 auto out = addOutputBuffer(
1547 ((uncompressedLength && *uncompressedLength <= maxSingleStepLength)
1548 ? *uncompressedLength
1549 : defaultBufferLength));
1551 bool streamEnd = false;
// Walk the remaining input one contiguous span at a time.
1552 auto buf = cursor.peekBytes();
1553 while (!buf.empty()) {
1554 stream.next_in = const_cast<uint8_t*>(buf.data());
1555 stream.avail_in = buf.size();
1557 while (stream.avail_in != 0) {
// Input left over after the stream ended is rejected.
1559 throw std::runtime_error(to<std::string>(
1560 "LZMA2Codec: junk after end of data"));
1563 streamEnd = doInflate(&stream, out.get(), defaultBufferLength);
1566 cursor.skip(buf.size());
1567 buf = cursor.peekBytes();
// All input consumed: keep decoding until the decoder reports stream end.
1570 while (!streamEnd) {
1571 streamEnd = doInflate(&stream, out.get(), defaultBufferLength);
// Drop the unused tail of the last output buffer.
1574 out->prev()->trimEnd(stream.avail_out);
1576 if (uncompressedLength && *uncompressedLength != stream.total_out) {
1577 throw std::runtime_error(
1578 to<std::string>("LZMA2Codec: invalid uncompressed length"));
1584 #endif // FOLLY_HAVE_LIBLZMA
1586 #ifdef FOLLY_HAVE_LIBZSTD
1589 void zstdFreeCStream(ZSTD_CStream* zcs) {
1590 ZSTD_freeCStream(zcs);
1593 void zstdFreeDStream(ZSTD_DStream* zds) {
1594 ZSTD_freeDStream(zds);
// Streaming codec backed by zstd. The same class serves as both a one-shot
// Codec and a StreamCodec (see the two factories below).
// NOTE(review): access specifiers, some parameter lines and parts of the
// member declarations are not visible in this listing.
1601 class ZSTDStreamCodec final : public StreamCodec {
// Factories with the (level, type) signature used by the codec table.
1603 static std::unique_ptr<Codec> createCodec(int level, CodecType);
1604 static std::unique_ptr<StreamCodec> createStream(int level, CodecType);
1605 explicit ZSTDStreamCodec(int level, CodecType type);
// Prefix-based detection hooks.
1607 std::vector<std::string> validPrefixes() const override;
1608 bool canUncompress(const IOBuf* data, Optional<uint64_t> uncompressedLength)
// Codec capability hooks.
1612 bool doNeedsUncompressedLength() const override;
1613 uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const override;
1614 Optional<uint64_t> doGetUncompressedLength(
1616 Optional<uint64_t> uncompressedLength) const override;
// StreamCodec hooks.
1618 void doResetStream() override;
1619 bool doCompressStream(
1621 MutableByteRange& output,
1622 StreamCodec::FlushOp flushOp) override;
1623 bool doUncompressStream(
1625 MutableByteRange& output,
1626 StreamCodec::FlushOp flushOp) override;
// Create (if needed) and initialise the zstd compression / decompression
// stream objects.
1628 void resetCStream();
1629 void resetDStream();
// One-shot fast paths; return whether the whole operation was handled.
1631 bool tryBlockCompress(ByteRange& input, MutableByteRange& output) const;
1632 bool tryBlockUncompress(ByteRange& input, MutableByteRange& output) const;
// Starts out true.
// NOTE(review): presumably means the stream must be reset before its next
// use — the code that reads and clears this flag is not visible here; confirm.
1635 bool needReset_{true};
// Deleter types for the owned ZSTD_CStream / ZSTD_DStream handles (the
// surrounding smart-pointer declarations are not visible in this listing).
1638 folly::static_function_deleter<ZSTD_CStream, &zstdFreeCStream>>
1642 folly::static_function_deleter<ZSTD_DStream, &zstdFreeDStream>>
// Frame magic used for prefix detection, written as a little-endian integer:
// 0xFD2FB528 corresponds to the byte sequence 28 B5 2F FD.
1646 static constexpr uint32_t kZSTDMagicLE = 0xFD2FB528;
1648 std::vector<std::string> ZSTDStreamCodec::validPrefixes() const {
1649 return {prefixToStringLE(kZSTDMagicLE)};
1652 bool ZSTDStreamCodec::canUncompress(const IOBuf* data, Optional<uint64_t>)
1654 return dataStartsWithLE(data, kZSTDMagicLE);
1657 std::unique_ptr<Codec> ZSTDStreamCodec::createCodec(int level, CodecType type) {
1658 return make_unique<ZSTDStreamCodec>(level, type);
1661 std::unique_ptr<StreamCodec> ZSTDStreamCodec::createStream(
1664 return make_unique<ZSTDStreamCodec>(level, type);
// Constructs the codec for CodecType::ZSTD (debug-checked). Symbolic levels
// are translated first; the resulting level must lie in
// [1, ZSTD_maxCLevel()], otherwise std::invalid_argument is thrown.
// NOTE(review): the switch header, the concrete values assigned for the
// symbolic levels, the `break`s and the final store of the level are not
// visible in this listing.
1667 ZSTDStreamCodec::ZSTDStreamCodec(int level, CodecType type)
1668 : StreamCodec(type) {
1669 DCHECK(type == CodecType::ZSTD);
1671 case COMPRESSION_LEVEL_FASTEST:
1674 case COMPRESSION_LEVEL_DEFAULT:
1677 case COMPRESSION_LEVEL_BEST:
// Range check applied after the symbolic levels have been mapped.
1681 if (level < 1 || level > ZSTD_maxCLevel()) {
1682 throw std::invalid_argument(
1683 to<std::string>("ZSTD: invalid level: ", level));
// Codec hook reporting whether callers must supply the uncompressed length
// to uncompress().
// NOTE(review): the body is not visible in this listing; presumably returns
// false — confirm.
1688 bool ZSTDStreamCodec::doNeedsUncompressedLength() const {
1692 uint64_t ZSTDStreamCodec::doMaxCompressedLength(
1693 uint64_t uncompressedLength) const {
1694 return ZSTD_compressBound(uncompressedLength);
1697 void zstdThrowIfError(size_t rc) {
1698 if (!ZSTD_isError(rc)) {
1701 throw std::runtime_error(
1702 to<std::string>("ZSTD returned an error: ", ZSTD_getErrorName(rc)));
1705 Optional<uint64_t> ZSTDStreamCodec::doGetUncompressedLength(
1707 Optional<uint64_t> uncompressedLength) const {
1708 // Read decompressed size from frame if available in first IOBuf.
1709 auto const decompressedSize =
1710 ZSTD_getDecompressedSize(data->data(), data->length());
1711 if (decompressedSize != 0) {
1712 if (uncompressedLength && *uncompressedLength != decompressedSize) {
1713 throw std::runtime_error("ZSTD: invalid uncompressed length");
1715 uncompressedLength = decompressedSize;
1717 return uncompressedLength;
// StreamCodec hook invoked when the stream is reset.
// NOTE(review): the body is not visible in this listing; presumably it sets
// needReset_ so the next (un)compress call re-initialises — confirm.
1720 void ZSTDStreamCodec::doResetStream() {
// One-shot fast path: compresses all of `input` into `output` with a single
// ZSTD_compress call at level_, provided `output` is at least
// ZSTD_compressBound(input.size()) bytes. On success both ranges are advanced
// past the consumed / produced bytes. Throws (via zstdThrowIfError) if zstd
// reports an error.
// NOTE(review): the first parameter line, any precondition check and the
// return statements are not visible in this listing; presumably returns
// false when the output is too small and true after compressing.
1724 bool ZSTDStreamCodec::tryBlockCompress(
1726 MutableByteRange& output) const {
1728 // We need to know that we have enough output space to use block compression
1729 if (output.size() < ZSTD_compressBound(input.size())) {
1732 size_t const length = ZSTD_compress(
1733 output.data(), output.size(), input.data(), input.size(), level_);
1734 zstdThrowIfError(length);
// Everything was consumed; `length` bytes were written.
1735 input.uncheckedAdvance(input.size());
1736 output.uncheckedAdvance(length);
// Prepares cstream_ for a new compression run: (re)creates the ZSTD_CStream
// and initialises it with parameters for level_. The frame's content-size
// flag is set exactly when the uncompressed length is known. Throws
// std::bad_alloc on allocation failure and (via zstdThrowIfError) on zstd
// initialisation errors.
// NOTE(review): the conditions guarding the allocation and the bad_alloc
// throw are not visible in this listing; presumably the stream is only
// created when cstream_ is still null.
1740 void ZSTDStreamCodec::resetCStream() {
1742 cstream_.reset(ZSTD_createCStream());
1744 throw std::bad_alloc{};
1747 // Advanced API usage works for all supported versions of zstd.
1748 // Required to set contentSizeFlag.
// An unknown uncompressed length is passed to zstd as 0.
1749 auto params = ZSTD_getParams(level_, uncompressedLength().value_or(0), 0);
1750 params.fParams.contentSizeFlag = uncompressedLength().hasValue();
1751 zstdThrowIfError(ZSTD_initCStream_advanced(
1752 cstream_.get(), nullptr, 0, params, uncompressedLength().value_or(0)));
// StreamCodec compression step. Consumes from `input`, produces into
// `output`, and advances both ranges by the amounts zstd reports. For
// FlushOp::END with a fresh stream the one-shot tryBlockCompress path is
// attempted first. Throws std::invalid_argument for an unknown FlushOp and
// (via zstdThrowIfError) on zstd errors.
// NOTE(review): the first parameter line, the needReset_ handling, the scope
// guard around the two uncheckedAdvance calls, the switch header and all
// return statements are not visible in this listing.
1755 bool ZSTDStreamCodec::doCompressStream(
1757 MutableByteRange& output,
1758 StreamCodec::FlushOp flushOp) {
1760 // If we are given all the input in one chunk try to use block compression
1761 if (flushOp == StreamCodec::FlushOp::END &&
1762 tryBlockCompress(input, output)) {
// zstd's in/out descriptors over the caller's ranges; `pos` starts at 0.
1768 ZSTD_inBuffer in = {input.data(), input.size(), 0};
1769 ZSTD_outBuffer out = {output.data(), output.size(), 0};
// Publish zstd's progress back to the caller's ranges (presumably executed
// at scope exit, after the calls below — confirm).
1771 input.uncheckedAdvance(in.pos);
1772 output.uncheckedAdvance(out.pos);
// Compress when not flushing, or when there is still input to consume.
1774 if (flushOp == StreamCodec::FlushOp::NONE || !input.empty()) {
1775 zstdThrowIfError(ZSTD_compressStream(cstream_.get(), &out, &in));
// Only flush/end once all input has been consumed.
1777 if (in.pos == in.size && flushOp != StreamCodec::FlushOp::NONE) {
1780 case StreamCodec::FlushOp::FLUSH:
1781 rc = ZSTD_flushStream(cstream_.get(), &out);
1783 case StreamCodec::FlushOp::END:
1784 rc = ZSTD_endStream(cstream_.get(), &out);
1787 throw std::invalid_argument("ZSTD: invalid FlushOp");
1789 zstdThrowIfError(rc);
// One-shot fast path: decompresses exactly one frame from `input` into
// `output` with a single ZSTD_decompress call. Requires a known uncompressed
// length and an output range at least that large. On success both ranges are
// advanced. Throws (via zstdThrowIfError) on zstd errors.
// NOTE(review): the first parameter line, the return statements and the
// #else/#endif of the version check are not visible in this listing;
// presumably the function returns false for zstd versions older than 1.1.4
// and when the preconditions are not met.
1797 bool ZSTDStreamCodec::tryBlockUncompress(
1799 MutableByteRange& output) const {
1801 #if ZSTD_VERSION_NUMBER < 10104
1802 // We require ZSTD_findFrameCompressedSize() to perform this optimization.
1805 // We need to know the uncompressed length and have enough output space.
1806 if (!uncompressedLength() || output.size() < *uncompressedLength()) {
// Size of the first frame in `input`; only that much is handed to zstd.
1809 size_t const compressedLength =
1810 ZSTD_findFrameCompressedSize(input.data(), input.size());
1811 zstdThrowIfError(compressedLength);
1812 size_t const length = ZSTD_decompress(
1813 output.data(), *uncompressedLength(), input.data(), compressedLength);
1814 zstdThrowIfError(length);
1815 DCHECK_EQ(length, *uncompressedLength());
1816 input.uncheckedAdvance(compressedLength);
1817 output.uncheckedAdvance(length);
// Prepares dstream_ for a new decompression run: (re)creates the
// ZSTD_DStream and initialises it. Throws std::bad_alloc on allocation
// failure and (via zstdThrowIfError) on zstd initialisation errors.
// NOTE(review): the conditions guarding the allocation and the bad_alloc
// throw are not visible in this listing; presumably the stream is only
// created when dstream_ is still null.
1822 void ZSTDStreamCodec::resetDStream() {
1824 dstream_.reset(ZSTD_createDStream());
1826 throw std::bad_alloc{};
1829 zstdThrowIfError(ZSTD_initDStream(dstream_.get()));
// StreamCodec decompression step. Consumes from `input`, produces into
// `output`, and advances both ranges by the amounts zstd reports. For
// FlushOp::END with a fresh stream the one-shot tryBlockUncompress path is
// attempted first. Throws (via zstdThrowIfError) on zstd errors.
// NOTE(review): the first parameter line, the needReset_ handling, the scope
// guard around the two uncheckedAdvance calls and the return statements are
// not visible in this listing.
1832 bool ZSTDStreamCodec::doUncompressStream(
1834 MutableByteRange& output,
1835 StreamCodec::FlushOp flushOp) {
1837 // If we are given all the input in one chunk try to use block uncompression
1838 if (flushOp == StreamCodec::FlushOp::END &&
1839 tryBlockUncompress(input, output)) {
// zstd's in/out descriptors over the caller's ranges; `pos` starts at 0.
1845 ZSTD_inBuffer in = {input.data(), input.size(), 0};
1846 ZSTD_outBuffer out = {output.data(), output.size(), 0};
// Publish zstd's progress back to the caller's ranges (presumably executed
// at scope exit, after the call below — confirm).
1848 input.uncheckedAdvance(in.pos);
1849 output.uncheckedAdvance(out.pos);
1851 size_t const rc = ZSTD_decompressStream(dstream_.get(), &out, &in);
1852 zstdThrowIfError(rc);
1856 #endif // FOLLY_HAVE_LIBZSTD
1858 #if FOLLY_HAVE_LIBBZ2
// Codec backed by libbz2, registered for CodecType::BZIP2.
// NOTE(review): access specifiers, some parameter lines and the data members
// are not visible in this listing.
1860 class Bzip2Codec final : public Codec {
// Factory with the (level, type) signature used by the codec table.
1862 static std::unique_ptr<Codec> create(int level, CodecType type);
1863 explicit Bzip2Codec(int level, CodecType type);
// Prefix-based detection hooks.
1865 std::vector<std::string> validPrefixes() const override;
1866 bool canUncompress(IOBuf const* data, Optional<uint64_t> uncompressedLength)
// Codec implementation hooks.
1870 uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const override;
1871 std::unique_ptr<IOBuf> doCompress(IOBuf const* data) override;
1872 std::unique_ptr<IOBuf> doUncompress(
1874 Optional<uint64_t> uncompressedLength) override;
1879 /* static */ std::unique_ptr<Codec> Bzip2Codec::create(
1882 return std::make_unique<Bzip2Codec>(level, type);
// Constructs the codec for CodecType::BZIP2 (debug-checked). Symbolic levels
// are translated first; the resulting level must lie in [1, 9], otherwise
// std::invalid_argument is thrown.
// NOTE(review): the switch header, the concrete values assigned for the
// symbolic levels, the `break`s and the final store of the level are not
// visible in this listing.
1885 Bzip2Codec::Bzip2Codec(int level, CodecType type) : Codec(type) {
1886 DCHECK(type == CodecType::BZIP2);
1888 case COMPRESSION_LEVEL_FASTEST:
1891 case COMPRESSION_LEVEL_DEFAULT:
1894 case COMPRESSION_LEVEL_BEST:
// Range check applied after the symbolic levels have been mapped.
1898 if (level < 1 || level > 9) {
1899 throw std::invalid_argument(
1900 to<std::string>("Bzip2: invalid level: ", level));
// Stream magic used for prefix detection, written as a little-endian integer:
// 0x685a42 corresponds to the three bytes 42 5A 68 ("BZh").
1905 static uint32_t constexpr kBzip2MagicLE = 0x685a42;
1906 static uint64_t constexpr kBzip2MagicBytes = 3;
1908 std::vector<std::string> Bzip2Codec::validPrefixes() const {
1909 return {prefixToStringLE(kBzip2MagicLE, kBzip2MagicBytes)};
1912 bool Bzip2Codec::canUncompress(IOBuf const* data, Optional<uint64_t>) const {
1913 return dataStartsWithLE(data, kBzip2MagicLE, kBzip2MagicBytes);
1916 uint64_t Bzip2Codec::doMaxCompressedLength(uint64_t uncompressedLength) const {
1917 // http://www.bzip.org/1.0.5/bzip2-manual-1.0.5.html#bzbufftobuffcompress
1918 // To guarantee that the compressed data will fit in its buffer, allocate an
1919 // output buffer of size 1% larger than the uncompressed data, plus six
1920 // hundred extra bytes.
1921 return uncompressedLength + uncompressedLength / 100 + 600;
1924 static bz_stream createBzStream() {
1926 stream.bzalloc = nullptr;
1927 stream.bzfree = nullptr;
1928 stream.opaque = nullptr;
1929 stream.next_in = stream.next_out = nullptr;
1930 stream.avail_in = stream.avail_out = 0;
1934 // Throws on error condition, otherwise returns the code.
// Callers wrap every libbz2 call in this so that they can both detect
// failures (std::runtime_error carrying the numeric code) and inspect the
// returned status, e.g. to compare it with BZ_STREAM_END.
// NOTE(review): the switch listing which codes count as non-errors is not
// visible in this listing.
1935 static int bzCheck(int const rc) {
1944 throw std::runtime_error(to<std::string>("Bzip2 error: ", rc));
1948 static std::unique_ptr<IOBuf> addOutputBuffer(
1950 uint64_t const bufferLength) {
1951 DCHECK_LE(bufferLength, std::numeric_limits<unsigned>::max());
1952 DCHECK_EQ(stream->avail_out, 0);
1954 auto buf = IOBuf::create(bufferLength);
1955 buf->append(buf->capacity());
1957 stream->next_out = reinterpret_cast<char*>(buf->writableData());
1958 stream->avail_out = buf->length();
// Compresses an IOBuf chain with libbz2 at level_. Output is produced into a
// chain of IOBufs; the unused tail of the last buffer is trimmed. Any libbz2
// failure surfaces as std::runtime_error via bzCheck.
// NOTE(review): the shortest lines (scope-guard header, `&stream,` argument,
// `stream.next_in =`, `do {`, closing braces, the return) are not visible in
// this listing.
1963 std::unique_ptr<IOBuf> Bzip2Codec::doCompress(IOBuf const* data) {
1964 bz_stream stream = createBzStream();
1965 bzCheck(BZ2_bzCompressInit(&stream, level_, 0, 0));
// Cleanup of the compressor state (presumably inside a SCOPE_EXIT whose
// header is not visible here).
1967 bzCheck(BZ2_bzCompressEnd(&stream));
1970 uint64_t const uncompressedLength = data->computeChainDataLength();
1971 uint64_t const maxCompressedLen = maxCompressedLength(uncompressedLength);
1972 uint64_t constexpr kMaxSingleStepLength = uint64_t(64) << 20; // 64 MiB
1973 uint64_t constexpr kDefaultBufferLength = uint64_t(4) << 20;
// First output buffer: the full worst-case size when it is at most 64 MiB,
// otherwise a 4 MiB buffer that is extended on demand.
1975 auto out = addOutputBuffer(
1977 maxCompressedLen <= kMaxSingleStepLength ? maxCompressedLen
1978 : kDefaultBufferLength);
// `range` is taken by value on purpose: it is advanced as input is consumed.
1980 for (auto range : *data) {
1981 while (!range.empty()) {
// Feed at most 64 MiB per call.
1982 auto const inSize = std::min<size_t>(range.size(), kMaxSingleStepLength);
1984 const_cast<char*>(reinterpret_cast<char const*>(range.data()));
1985 stream.avail_in = inSize;
1987 if (stream.avail_out == 0) {
1988 out->prependChain(addOutputBuffer(&stream, kDefaultBufferLength));
1991 bzCheck(BZ2_bzCompress(&stream, BZ_RUN));
// Skip exactly the bytes libbz2 consumed in this call.
1992 range.uncheckedAdvance(inSize - stream.avail_in);
// Finish the stream: keep calling BZ2_bzCompress(BZ_FINISH) until it reports
// BZ_STREAM_END, adding output buffers as needed.
1996 if (stream.avail_out == 0) {
1997 out->prependChain(addOutputBuffer(&stream, kDefaultBufferLength));
1999 } while (bzCheck(BZ2_bzCompress(&stream, BZ_FINISH)) != BZ_STREAM_END);
// Drop the unused tail of the last output buffer.
2001 out->prev()->trimEnd(stream.avail_out);
2006 std::unique_ptr<IOBuf> Bzip2Codec::doUncompress(
2008 Optional<uint64_t> uncompressedLength) {
2009 bz_stream stream = createBzStream();
2010 bzCheck(BZ2_bzDecompressInit(&stream, 0, 0));
2012 bzCheck(BZ2_bzDecompressEnd(&stream));
2015 uint64_t constexpr kMaxSingleStepLength = uint64_t(64) << 20; // 64 MiB
2016 uint64_t const kBlockSize = uint64_t(100) << 10; // 100 KiB
2017 uint64_t const kDefaultBufferLength =
2018 computeBufferLength(data->computeChainDataLength(), kBlockSize);
2020 auto out = addOutputBuffer(
2022 ((uncompressedLength && *uncompressedLength <= kMaxSingleStepLength)
2023 ? *uncompressedLength
2024 : kDefaultBufferLength));
2027 for (auto range : *data) {
2028 while (!range.empty()) {
2029 auto const inSize = std::min<size_t>(range.size(), kMaxSingleStepLength);
2031 const_cast<char*>(reinterpret_cast<char const*>(range.data()));
2032 stream.avail_in = inSize;
2034 if (stream.avail_out == 0) {
2035 out->prependChain(addOutputBuffer(&stream, kDefaultBufferLength));
2038 rc = bzCheck(BZ2_bzDecompress(&stream));
2039 range.uncheckedAdvance(inSize - stream.avail_in);
2042 while (rc != BZ_STREAM_END) {
2043 if (stream.avail_out == 0) {
2044 out->prependChain(addOutputBuffer(&stream, kDefaultBufferLength));
2047 rc = bzCheck(BZ2_bzDecompress(&stream));
2050 out->prev()->trimEnd(stream.avail_out);
2052 uint64_t const totalOut =
2053 (uint64_t(stream.total_out_hi32) << 32) + stream.total_out_lo32;
2054 if (uncompressedLength && uncompressedLength != totalOut) {
2055 throw std::runtime_error("Bzip2 error: Invalid uncompressed length");
2061 #endif // FOLLY_HAVE_LIBBZ2
2064 * Automatic decompression
// Decompression-only codec that dispatches to whichever of its member codecs
// recognises the input. The member list consists of caller-supplied custom
// codecs plus the built-in codecs that are available in this build.
// Compression-side operations are not supported and throw.
// NOTE(review): access specifiers and some parameter lines are not visible in
// this listing.
2066 class AutomaticCodec final : public Codec {
2068 static std::unique_ptr<Codec> create(
2069 std::vector<std::unique_ptr<Codec>> customCodecs);
2070 explicit AutomaticCodec(std::vector<std::unique_ptr<Codec>> customCodecs);
// Prefix-based detection hooks, aggregated over all member codecs.
2072 std::vector<std::string> validPrefixes() const override;
2073 bool canUncompress(const IOBuf* data, Optional<uint64_t> uncompressedLength)
// Values computed once in the constructor from the member codecs.
2077 bool doNeedsUncompressedLength() const override;
2078 uint64_t doMaxUncompressedLength() const override;
// Not supported: always throws std::runtime_error.
2080 uint64_t doMaxCompressedLength(uint64_t) const override {
2081 throw std::runtime_error(
2082 "AutomaticCodec error: maxCompressedLength() not supported.");
// Not supported: always throws std::runtime_error.
2084 std::unique_ptr<IOBuf> doCompress(const IOBuf*) override {
2085 throw std::runtime_error("AutomaticCodec error: compress() not supported.");
2087 std::unique_ptr<IOBuf> doUncompress(
2089 Optional<uint64_t> uncompressedLength) override;
// Appends the built-in codec for `type` unless the build lacks it or a codec
// of that type is already in the list.
2091 void addCodecIfSupported(CodecType type);
2093 // Throws iff the codecs aren't compatible (very slow)
2094 void checkCompatibleCodecs() const;
// Member codecs in lookup order: custom codecs first, then built-ins.
2096 std::vector<std::unique_ptr<Codec>> codecs_;
// True if any member codec needs the uncompressed length.
2097 bool needsUncompressedLength_;
// Largest maxUncompressedLength() among the member codecs.
2098 uint64_t maxUncompressedLength_;
2101 std::vector<std::string> AutomaticCodec::validPrefixes() const {
2102 std::unordered_set<std::string> prefixes;
2103 for (const auto& codec : codecs_) {
2104 const auto codecPrefixes = codec->validPrefixes();
2105 prefixes.insert(codecPrefixes.begin(), codecPrefixes.end());
2107 return std::vector<std::string>{prefixes.begin(), prefixes.end()};
2110 bool AutomaticCodec::canUncompress(
2112 Optional<uint64_t> uncompressedLength) const {
2116 [data, uncompressedLength](std::unique_ptr<Codec> const& codec) {
2117 return codec->canUncompress(data, uncompressedLength);
2121 void AutomaticCodec::addCodecIfSupported(CodecType type) {
2122 const bool present = std::any_of(
2125 [&type](std::unique_ptr<Codec> const& codec) {
2126 return codec->type() == type;
2128 if (hasCodec(type) && !present) {
2129 codecs_.push_back(getCodec(type));
2133 /* static */ std::unique_ptr<Codec> AutomaticCodec::create(
2134 std::vector<std::unique_ptr<Codec>> customCodecs) {
2135 return std::make_unique<AutomaticCodec>(std::move(customCodecs));
2138 AutomaticCodec::AutomaticCodec(std::vector<std::unique_ptr<Codec>> customCodecs)
2139 : Codec(CodecType::USER_DEFINED), codecs_(std::move(customCodecs)) {
2140 // Fastest -> slowest
2141 addCodecIfSupported(CodecType::LZ4_FRAME);
2142 addCodecIfSupported(CodecType::ZSTD);
2143 addCodecIfSupported(CodecType::ZLIB);
2144 addCodecIfSupported(CodecType::GZIP);
2145 addCodecIfSupported(CodecType::LZMA2);
2146 addCodecIfSupported(CodecType::BZIP2);
2148 checkCompatibleCodecs();
2150 // Check that none of the codes are are null
2151 DCHECK(std::none_of(
2152 codecs_.begin(), codecs_.end(), [](std::unique_ptr<Codec> const& codec) {
2153 return codec == nullptr;
2156 needsUncompressedLength_ = std::any_of(
2157 codecs_.begin(), codecs_.end(), [](std::unique_ptr<Codec> const& codec) {
2158 return codec->needsUncompressedLength();
2161 const auto it = std::max_element(
2164 [](std::unique_ptr<Codec> const& lhs, std::unique_ptr<Codec> const& rhs) {
2165 return lhs->maxUncompressedLength() < rhs->maxUncompressedLength();
2167 DCHECK(it != codecs_.end());
2168 maxUncompressedLength_ = (*it)->maxUncompressedLength();
// Verifies that the member codecs can be told apart by their prefixes.
// Throws std::invalid_argument if a codec reports no valid prefixes, if the
// same prefix is reported twice, or if one prefix is a strict non-empty
// prefix of another.
// NOTE(review): the shortest lines (the statement that enforces "the empty
// header is not allowed", closing braces and the tail of the last error
// message) are not visible in this listing.
2171 void AutomaticCodec::checkCompatibleCodecs() const {
2172 // Keep track of all the possible headers.
2173 std::unordered_set<std::string> headers;
2174 // The empty header is not allowed.
2177 // Construct a set of headers and check that none of the headers occur twice.
2178 // Eliminate edge cases.
2179 for (auto&& codec : codecs_) {
2180 const auto codecHeaders = codec->validPrefixes();
2181 // Codecs without any valid headers are not allowed.
2182 if (codecHeaders.empty()) {
2183 throw std::invalid_argument{
2184 "AutomaticCodec: validPrefixes() must not be empty."};
2186 // Insert all the headers for the current codec.
// A duplicate is detected by the set growing by fewer elements than were
// inserted.
2187 const size_t beforeSize = headers.size();
2188 headers.insert(codecHeaders.begin(), codecHeaders.end());
2189 // Codecs are not compatible if any header occurred twice.
2190 if (beforeSize + codecHeaders.size() != headers.size()) {
2191 throw std::invalid_argument{
2192 "AutomaticCodec: Two valid prefixes collide."};
2196 // Check if any strict non-empty prefix of any header is a header.
2197 for (const auto& header : headers) {
// i ranges over the lengths 1 .. size-1, i.e. every strict non-empty prefix.
2198 for (size_t i = 1; i < header.size(); ++i) {
2199 if (headers.count(header.substr(0, i))) {
2200 throw std::invalid_argument{
2201 "AutomaticCodec: One valid prefix is a prefix of another valid "
2208 bool AutomaticCodec::doNeedsUncompressedLength() const {
2209 return needsUncompressedLength_;
2212 uint64_t AutomaticCodec::doMaxUncompressedLength() const {
2213 return maxUncompressedLength_;
2216 std::unique_ptr<IOBuf> AutomaticCodec::doUncompress(
2218 Optional<uint64_t> uncompressedLength) {
2219 for (auto&& codec : codecs_) {
2220 if (codec->canUncompress(data, uncompressedLength)) {
2221 return codec->uncompress(data, uncompressedLength);
2224 throw std::runtime_error("AutomaticCodec error: Unknown compressed data");
// Function-pointer types for the codec table: each factory takes the
// compression level and the codec type and returns a newly created codec.
2227 using CodecFactory = std::unique_ptr<Codec> (*)(int, CodecType);
2228 using StreamCodecFactory = std::unique_ptr<StreamCodec> (*)(int, CodecType);
// Table of factories with one slot per CodecType value (the array has
// CodecType::NUM_CODEC_TYPES entries and is indexed by the enum's integer
// value). Each entry pairs a one-shot codec factory with a stream codec
// factory; among the entries visible here only ZSTD provides a stream
// factory, the others use nullptr.
// NOTE(review): the struct header, its `codec` member, the per-entry
// comments, and the #else / #endif lines (presumably supplying empty entries
// when a library is not compiled in) are not visible in this listing. Entry
// order must match the CodecType enum — confirm against the header before
// editing.
2231 StreamCodecFactory stream;
2235 codecFactories[static_cast<size_t>(CodecType::NUM_CODEC_TYPES)] = {
2237 {NoCompressionCodec::create, nullptr},
2239 #if FOLLY_HAVE_LIBLZ4
2240 {LZ4Codec::create, nullptr},
2245 #if FOLLY_HAVE_LIBSNAPPY
2246 {SnappyCodec::create, nullptr},
2252 {ZlibCodec::create, nullptr},
2257 #if FOLLY_HAVE_LIBLZ4
2258 {LZ4Codec::create, nullptr},
2263 #if FOLLY_HAVE_LIBLZMA
2264 {LZMA2Codec::create, nullptr},
2265 {LZMA2Codec::create, nullptr},
2271 #if FOLLY_HAVE_LIBZSTD
2272 {ZSTDStreamCodec::createCodec, ZSTDStreamCodec::createStream},
2278 {ZlibCodec::create, nullptr},
2283 #if (FOLLY_HAVE_LIBLZ4 && LZ4_VERSION_NUMBER >= 10301)
2284 {LZ4FrameCodec::create, nullptr},
2289 #if FOLLY_HAVE_LIBBZ2
2290 {Bzip2Codec::create, nullptr},
2296 Factory const& getFactory(CodecType type) {
2297 size_t const idx = static_cast<size_t>(type);
2298 if (idx >= static_cast<size_t>(CodecType::NUM_CODEC_TYPES)) {
2299 throw std::invalid_argument(
2300 to<std::string>("Compression type ", idx, " invalid"));
2302 return codecFactories[idx];
2306 bool hasCodec(CodecType type) {
2307 return getFactory(type).codec != nullptr;
2310 std::unique_ptr<Codec> getCodec(CodecType type, int level) {
2311 auto const factory = getFactory(type).codec;
2313 throw std::invalid_argument(
2314 to<std::string>("Compression type ", type, " not supported"));
2316 auto codec = (*factory)(level, type);
2317 DCHECK(codec->type() == type);
2321 bool hasStreamCodec(CodecType type) {
2322 return getFactory(type).stream != nullptr;
2325 std::unique_ptr<StreamCodec> getStreamCodec(CodecType type, int level) {
2326 auto const factory = getFactory(type).stream;
2328 throw std::invalid_argument(
2329 to<std::string>("Compression type ", type, " not supported"));
2331 auto codec = (*factory)(level, type);
2332 DCHECK(codec->type() == type);
2336 std::unique_ptr<Codec> getAutoUncompressionCodec(
2337 std::vector<std::unique_ptr<Codec>> customCodecs) {
2338 return AutomaticCodec::create(std::move(customCodecs));