From 049fb12ea9c00ba1c6463501e2e66b21188cca1f Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Tue, 5 Sep 2017 15:15:13 -0700 Subject: [PATCH] Compress empty strings with underlying compressor Summary: - Current behavior compresses empty strings to empty strings. This is undesirable as decompression using underlying decompressor (side-stepping the codec) will fail. This change passes empty strings to the underlying compressor - Decompressing empty string -> empty string was kept for backwards compatibility - Fix `getUncompressedLength` for zlib Reviewed By: terrelln, yfeldblum Differential Revision: D5740034 fbshipit-source-id: 5a747ea4963dad872103209aa4410197f6c605db --- folly/io/Compression.cpp | 53 +++++++++++---------------- folly/io/Compression.h | 3 -- folly/io/compression/Zlib.cpp | 5 ++- folly/io/test/CompressionTest.cpp | 61 ++++++++++++++++++++----------- 4 files changed, 66 insertions(+), 56 deletions(-) diff --git a/folly/io/Compression.cpp b/folly/io/Compression.cpp index b41af0b9..b3abc1cf 100644 --- a/folly/io/Compression.cpp +++ b/folly/io/Compression.cpp @@ -75,9 +75,6 @@ std::unique_ptr Codec::compress(const IOBuf* data) { throw std::invalid_argument("Codec: data must not be nullptr"); } uint64_t len = data->computeChainDataLength(); - if (len == 0) { - return IOBuf::create(0); - } if (len > maxUncompressedLength()) { throw std::runtime_error("Codec: uncompressed length too large"); } @@ -87,9 +84,6 @@ std::unique_ptr Codec::compress(const IOBuf* data) { std::string Codec::compress(const StringPiece data) { const uint64_t len = data.size(); - if (len == 0) { - return ""; - } if (len > maxUncompressedLength()) { throw std::runtime_error("Codec: uncompressed length too large"); } @@ -191,9 +185,6 @@ std::string Codec::doUncompressString( } uint64_t Codec::maxCompressedLength(uint64_t uncompressedLength) const { - if (uncompressedLength == 0) { - return 0; - } return doMaxCompressedLength(uncompressedLength); } @@ -201,8 +192,8 @@ Optional Codec::getUncompressedLength( const folly::IOBuf* data, Optional uncompressedLength) const { auto const compressedLength = data->computeChainDataLength(); - if (uncompressedLength == uint64_t(0) || compressedLength == 0) { - if (uncompressedLength.value_or(0) != 0 || compressedLength != 0) { + if (compressedLength == 0) { + if (uncompressedLength.value_or(0) != 0) { throw std::runtime_error("Invalid uncompressed length"); } return 0; @@ -242,16 +233,12 @@ bool StreamCodec::compressStream( ByteRange& input, MutableByteRange& output, StreamCodec::FlushOp flushOp) { - if (state_ == State::RESET && input.empty()) { - if (flushOp == StreamCodec::FlushOp::NONE) { - return false; - } - if (flushOp == StreamCodec::FlushOp::END && - uncompressedLength().value_or(0) != 0) { - throw std::runtime_error("Codec: invalid uncompressed length"); - } - return true; + if (state_ == State::RESET && input.empty() && + flushOp == StreamCodec::FlushOp::END && + uncompressedLength().value_or(0) != 0) { + throw std::runtime_error("Codec: invalid uncompressed length"); } + if (!uncompressedLength() && needsDataLength()) { throw std::runtime_error("Codec: uncompressed length required"); } @@ -1030,7 +1017,7 @@ class LZMA2StreamCodec final : public StreamCodec { void resetCStream(); void resetDStream(); - size_t decodeVarint(ByteRange& input); + bool decodeAndCheckVarint(ByteRange& input); bool flushVarintBuffer(MutableByteRange& output); void resetVarintBuffer(); @@ -1243,11 +1230,16 @@ bool LZMA2StreamCodec::doCompressStream( * * If there are too many bytes and the varint is not valid, throw a * runtime_error. - * Returns the decoded size or 0 if more bytes are needed. + * + * If the uncompressed length was provided and a decoded varint does not match + * the provided length, throw a runtime_error. + * + * Returns true if the varint was successfully decoded and matches the + * uncompressed length if provided, and false if more bytes are needed. */ -size_t LZMA2StreamCodec::decodeVarint(ByteRange& input) { +bool LZMA2StreamCodec::decodeAndCheckVarint(ByteRange& input) { if (input.empty()) { - return 0; + return false; } size_t const numBytesToCopy = std::min(kMaxVarintLength64 - varintBufferPos_, input.size()); @@ -1260,14 +1252,17 @@ size_t LZMA2StreamCodec::decodeVarint(ByteRange& input) { if (ret.hasValue()) { size_t const varintSize = rangeSize - range.size(); input.advance(varintSize - varintBufferPos_); - return ret.value(); + if (uncompressedLength() && *uncompressedLength() != ret.value()) { + throw std::runtime_error("LZMA2StreamCodec: invalid uncompressed length"); + } + return true; } else if (ret.error() == DecodeVarintError::TooManyBytes) { throw std::runtime_error("LZMA2StreamCodec: invalid uncompressed length"); } else { // Too few bytes input.advance(numBytesToCopy); varintBufferPos_ += numBytesToCopy; - return 0; + return false; } } @@ -1288,13 +1283,9 @@ bool LZMA2StreamCodec::doUncompressStream( if (needDecodeSize_) { // Try decoding the varint. If the input does not contain the entire varint, // buffer the input. If the varint can not be decoded, fail. - size_t const size = decodeVarint(input); - if (!size) { + if (!decodeAndCheckVarint(input)) { return false; } - if (uncompressedLength() && *uncompressedLength() != size) { - throw std::runtime_error("LZMA2StreamCodec: invalid uncompressed length"); - } needDecodeSize_ = false; } diff --git a/folly/io/Compression.h b/folly/io/Compression.h index 462bb5d0..4013e0a2 100644 --- a/folly/io/Compression.h +++ b/folly/io/Compression.h @@ -131,9 +131,6 @@ class Codec { * Compress data, returning an IOBuf (which may share storage with data). * Throws std::invalid_argument if data is larger than * maxUncompressedLength(). - * - * Regardless of the behavior of the underlying compressor, compressing - * an empty IOBuf chain will return an empty IOBuf chain. */ std::unique_ptr compress(const folly::IOBuf* data); diff --git a/folly/io/compression/Zlib.cpp b/folly/io/compression/Zlib.cpp index 5cb5dcbf..38cfa1ad 100644 --- a/folly/io/compression/Zlib.cpp +++ b/folly/io/compression/Zlib.cpp @@ -176,7 +176,10 @@ bool ZlibStreamCodec::canUncompress(const IOBuf* data, Optional) uint64_t ZlibStreamCodec::doMaxCompressedLength( uint64_t uncompressedLength) const { - return deflateBound(nullptr, uncompressedLength); + // When passed a nullptr, deflateBound() adds 6 bytes for a zlib wrapper. A + // gzip wrapper is 18 bytes, so we add the 12 byte difference. + return deflateBound(nullptr, uncompressedLength) + + (options_.format == Options::Format::GZIP ? 12 : 0); } std::unique_ptr ZlibStreamCodec::createCodec( diff --git a/folly/io/test/CompressionTest.cpp b/folly/io/test/CompressionTest.cpp index 6a1ff71f..b9db1da0 100644 --- a/folly/io/test/CompressionTest.cpp +++ b/folly/io/test/CompressionTest.cpp @@ -205,7 +205,10 @@ class CompressionTest protected: void SetUp() override { auto tup = GetParam(); - uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup); + int lengthLog = std::tr1::get<0>(tup); + // Small hack to test empty data + uncompressedLength_ = + (lengthLog < 0) ? 0 : uint64_t(1) << std::tr1::get<0>(tup); chunks_ = std::tr1::get<1>(tup); codec_ = getCodec(std::tr1::get<2>(tup)); } @@ -225,6 +228,9 @@ class CompressionTest void CompressionTest::runSimpleIOBufTest(const DataHolder& dh) { const auto original = split(IOBuf::wrapBuffer(dh.data(uncompressedLength_))); const auto compressed = split(codec_->compress(original.get())); + EXPECT_LE( + compressed->computeChainDataLength(), + codec_->maxCompressedLength(uncompressedLength_)); if (!codec_->needsUncompressedLength()) { auto uncompressed = codec_->uncompress(compressed.get()); EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength()); @@ -243,6 +249,9 @@ void CompressionTest::runSimpleStringTest(const DataHolder& dh) { reinterpret_cast(dh.data(uncompressedLength_).data()), uncompressedLength_); const auto compressed = codec_->compress(original); + EXPECT_LE( + compressed.length(), codec_->maxCompressedLength(uncompressedLength_)); + if (!codec_->needsUncompressedLength()) { auto uncompressed = codec_->uncompress(compressed); EXPECT_EQ(uncompressedLength_, uncompressed.length()); @@ -301,7 +310,7 @@ INSTANTIATE_TEST_CASE_P( CompressionTest, CompressionTest, testing::Combine( - testing::Values(0, 1, 12, 22, 25, 27), + testing::Values(-1, 0, 1, 12, 22, 25, 27), testing::Values(1, 2, 3, 8, 65), testing::ValuesIn(availableCodecs()))); @@ -484,7 +493,6 @@ TEST(StreamingUnitTest, needsDataLength) { } TEST_P(StreamingUnitTest, maxCompressedLength) { - EXPECT_EQ(0, codec_->maxCompressedLength(0)); for (uint64_t const length : {1, 10, 100, 1000, 10000, 100000, 1000000}) { EXPECT_GE(codec_->maxCompressedLength(length), length); } @@ -494,11 +502,11 @@ TEST_P(StreamingUnitTest, getUncompressedLength) { auto const empty = IOBuf::create(0); EXPECT_EQ(uint64_t(0), codec_->getUncompressedLength(empty.get())); EXPECT_EQ(uint64_t(0), codec_->getUncompressedLength(empty.get(), 0)); + EXPECT_ANY_THROW(codec_->getUncompressedLength(empty.get(), 1)); auto const data = IOBuf::wrapBuffer(randomDataHolder.data(100)); auto const compressed = codec_->compress(data.get()); - EXPECT_ANY_THROW(codec_->getUncompressedLength(data.get(), 0)); if (auto const length = codec_->getUncompressedLength(data.get())) { EXPECT_EQ(100, *length); } @@ -512,35 +520,46 @@ TEST_P(StreamingUnitTest, getUncompressedLength) { TEST_P(StreamingUnitTest, emptyData) { ByteRange input{}; - auto buffer = IOBuf::create(1); + auto buffer = IOBuf::create(codec_->maxCompressedLength(0)); buffer->append(buffer->capacity()); - MutableByteRange output{}; + MutableByteRange output; // Test compressing empty data in one pass if (!codec_->needsDataLength()) { + output = {buffer->writableData(), buffer->length()}; EXPECT_TRUE( codec_->compressStream(input, output, StreamCodec::FlushOp::END)); } codec_->resetStream(0); - EXPECT_TRUE(codec_->compressStream(input, output, StreamCodec::FlushOp::END)); output = {buffer->writableData(), buffer->length()}; EXPECT_TRUE(codec_->compressStream(input, output, StreamCodec::FlushOp::END)); - EXPECT_EQ(buffer->length(), output.size()); + + // Test uncompressing the compressed empty data is equivalent to the empty + // string + { + size_t compressedSize = buffer->length() - output.size(); + auto const compressed = + IOBuf::copyBuffer(buffer->writableData(), compressedSize); + auto inputRange = compressed->coalesce(); + codec_->resetStream(0); + output = {buffer->writableData(), buffer->length()}; + EXPECT_TRUE(codec_->uncompressStream( + inputRange, output, StreamCodec::FlushOp::END)); + EXPECT_EQ(output.size(), buffer->length()); + } // Test compressing empty data with multiple calls to compressStream() - codec_->resetStream(0); - output = {}; - EXPECT_FALSE(codec_->compressStream(input, output)); - EXPECT_TRUE( - codec_->compressStream(input, output, StreamCodec::FlushOp::FLUSH)); - EXPECT_TRUE(codec_->compressStream(input, output, StreamCodec::FlushOp::END)); - codec_->resetStream(0); - output = {buffer->writableData(), buffer->length()}; - EXPECT_FALSE(codec_->compressStream(input, output)); - EXPECT_TRUE( - codec_->compressStream(input, output, StreamCodec::FlushOp::FLUSH)); - EXPECT_TRUE(codec_->compressStream(input, output, StreamCodec::FlushOp::END)); - EXPECT_EQ(buffer->length(), output.size()); + { + auto largeBuffer = IOBuf::create(codec_->maxCompressedLength(0) * 2); + largeBuffer->append(largeBuffer->capacity()); + codec_->resetStream(0); + output = {largeBuffer->writableData(), largeBuffer->length()}; + EXPECT_FALSE(codec_->compressStream(input, output)); + EXPECT_TRUE( + codec_->compressStream(input, output, StreamCodec::FlushOp::FLUSH)); + EXPECT_TRUE( + codec_->compressStream(input, output, StreamCodec::FlushOp::END)); + } // Test uncompressing empty data output = {}; -- 2.34.1