folly/io:compression: add LZMA2 support
authorAndrew Gallagher <agallagher@fb.com>
Tue, 10 Dec 2013 23:32:46 +0000 (15:32 -0800)
committerJordan DeLong <jdelong@fb.com>
Fri, 20 Dec 2013 21:06:33 +0000 (13:06 -0800)
Summary:
Adds LZMA2 and LZMA2_VARINT_SIZE compression support for
folly::io::Compression.  This format shows some big wins for
compressing ELF object files and is useful in our modified
ccache client.

Test Plan:
Compression unittests.  Also, tested compressing object files built
in fbcode.  On average, the compression percentage improved from
~16.5% to ~12%.  But we save a lot more as object files get bigger,
which can help make bigger object files fit over fewer memcache
keys.

Reviewed By: njormrod@fb.com

FB internal diff: D1092576

folly/io/Compression.cpp
folly/io/Compression.h
folly/io/test/CompressionTest.cpp

index 951619a17debccd79c0b2edfc0a5d1909e084dc2..29796a25df0de4478293e78dcee31e627759a298 100644 (file)
@@ -22,6 +22,7 @@
 #include <snappy.h>
 #include <snappy-sinksource.h>
 #include <zlib.h>
+#include <lzma.h>
 
 #include "folly/Conv.h"
 #include "folly/Memory.h"
@@ -624,6 +625,243 @@ std::unique_ptr<IOBuf> ZlibCodec::doUncompress(const IOBuf* data,
   return out;
 }
 
+/**
+ * LZMA2 compression
+ */
+class LZMA2Codec FOLLY_FINAL : public Codec {
+ public:
+  static std::unique_ptr<Codec> create(int level, CodecType type);
+  explicit LZMA2Codec(int level, CodecType type);
+
+ private:
+  bool doNeedsUncompressedLength() const FOLLY_OVERRIDE;
+  uint64_t doMaxUncompressedLength() const FOLLY_OVERRIDE;
+
+  bool encodeSize() const { return type() == CodecType::LZMA2_VARINT_SIZE; }
+
+  std::unique_ptr<IOBuf> doCompress(const IOBuf* data) FOLLY_OVERRIDE;
+  std::unique_ptr<IOBuf> doUncompress(
+      const IOBuf* data,
+      uint64_t uncompressedLength) FOLLY_OVERRIDE;
+
+  std::unique_ptr<IOBuf> addOutputBuffer(lzma_stream* stream, size_t length);
+  bool doInflate(lzma_stream* stream, IOBuf* head, size_t bufferLength);
+
+  int level_;
+};
+
+std::unique_ptr<Codec> LZMA2Codec::create(int level, CodecType type) {
+  return make_unique<LZMA2Codec>(level, type);
+}
+
+LZMA2Codec::LZMA2Codec(int level, CodecType type) : Codec(type) {
+  DCHECK(type == CodecType::LZMA2 || type == CodecType::LZMA2_VARINT_SIZE);
+  switch (level) {
+  case COMPRESSION_LEVEL_FASTEST:
+    level = 0;
+    break;
+  case COMPRESSION_LEVEL_DEFAULT:
+    level = LZMA_PRESET_DEFAULT;
+    break;
+  case COMPRESSION_LEVEL_BEST:
+    level = 9;
+    break;
+  }
+  if (level < 0 || level > 9) {
+    throw std::invalid_argument(to<std::string>(
+        "LZMA2Codec: invalid level: ", level));
+  }
+  level_ = level;
+}
+
+bool LZMA2Codec::doNeedsUncompressedLength() const {
+  return !encodeSize();
+}
+
+uint64_t LZMA2Codec::doMaxUncompressedLength() const {
+  // From lzma/base.h: "Stream is roughly 8 EiB (2^63 bytes)"
+  return uint64_t(1) << 63;
+}
+
+std::unique_ptr<IOBuf> LZMA2Codec::addOutputBuffer(
+    lzma_stream* stream,
+    size_t length) {
+
+  CHECK_EQ(stream->avail_out, 0);
+
+  auto buf = IOBuf::create(length);
+  buf->append(length);
+
+  stream->next_out = buf->writableData();
+  stream->avail_out = buf->length();
+
+  return buf;
+}
+
+std::unique_ptr<IOBuf> LZMA2Codec::doCompress(const IOBuf* data) {
+  lzma_ret rc;
+  lzma_stream stream = LZMA_STREAM_INIT;
+
+  rc = lzma_easy_encoder(&stream, level_, LZMA_CHECK_NONE);
+  if (rc != LZMA_OK) {
+    throw std::runtime_error(folly::to<std::string>(
+      "LZMA2Codec: lzma_easy_encoder error: ", rc));
+  }
+
+  SCOPE_EXIT { lzma_end(&stream); };
+
+  uint64_t uncompressedLength = data->computeChainDataLength();
+  uint64_t maxCompressedLength = lzma_stream_buffer_bound(uncompressedLength);
+
+  // Max 64MiB in one go
+  constexpr uint32_t maxSingleStepLength = uint32_t(64) << 20;    // 64MiB
+  constexpr uint32_t defaultBufferLength = uint32_t(4) << 20;     // 4MiB
+
+  auto out = addOutputBuffer(
+    &stream,
+    (maxCompressedLength <= maxSingleStepLength ?
+     maxCompressedLength :
+     defaultBufferLength));
+
+  if (encodeSize()) {
+    auto size = IOBuf::createCombined(kMaxVarintLength64);
+    encodeVarintToIOBuf(uncompressedLength, size.get());
+    size->appendChain(std::move(out));
+    out = std::move(size);
+  }
+
+  for (auto& range : *data) {
+    if (range.empty()) {
+      continue;
+    }
+
+    stream.next_in = const_cast<uint8_t*>(range.data());
+    stream.avail_in = range.size();
+
+    while (stream.avail_in != 0) {
+      if (stream.avail_out == 0) {
+        out->prependChain(addOutputBuffer(&stream, defaultBufferLength));
+      }
+
+      rc = lzma_code(&stream, LZMA_RUN);
+
+      if (rc != LZMA_OK) {
+        throw std::runtime_error(folly::to<std::string>(
+          "LZMA2Codec: lzma_code error: ", rc));
+      }
+    }
+  }
+
+  do {
+    if (stream.avail_out == 0) {
+      out->prependChain(addOutputBuffer(&stream, defaultBufferLength));
+    }
+
+    rc = lzma_code(&stream, LZMA_FINISH);
+  } while (rc == LZMA_OK);
+
+  if (rc != LZMA_STREAM_END) {
+    throw std::runtime_error(folly::to<std::string>(
+      "LZMA2Codec: lzma_code ended with error: ", rc));
+  }
+
+  out->prev()->trimEnd(stream.avail_out);
+
+  return out;
+}
+
+bool LZMA2Codec::doInflate(lzma_stream* stream,
+                          IOBuf* head,
+                          size_t bufferLength) {
+  if (stream->avail_out == 0) {
+    head->prependChain(addOutputBuffer(stream, bufferLength));
+  }
+
+  lzma_ret rc = lzma_code(stream, LZMA_RUN);
+
+  switch (rc) {
+  case LZMA_OK:
+    break;
+  case LZMA_STREAM_END:
+    return true;
+  default:
+    throw std::runtime_error(to<std::string>(
+        "LZMA2Codec: lzma_code error: ", rc));
+  }
+
+  return false;
+}
+
+std::unique_ptr<IOBuf> LZMA2Codec::doUncompress(const IOBuf* data,
+                                               uint64_t uncompressedLength) {
+  lzma_ret rc;
+  lzma_stream stream = LZMA_STREAM_INIT;
+
+  rc = lzma_auto_decoder(&stream, std::numeric_limits<uint64_t>::max(), 0);
+  if (rc != LZMA_OK) {
+    throw std::runtime_error(folly::to<std::string>(
+      "LZMA2Codec: lzma_auto_decoder error: ", rc));
+  }
+
+  SCOPE_EXIT { lzma_end(&stream); };
+
+  // Max 64MiB in one go
+  constexpr uint32_t maxSingleStepLength = uint32_t(64) << 20;    // 64MiB
+  constexpr uint32_t defaultBufferLength = uint32_t(4) << 20;     // 4MiB
+
+  folly::io::Cursor cursor(data);
+  uint64_t actualUncompressedLength;
+  if (encodeSize()) {
+    actualUncompressedLength = decodeVarintFromCursor(cursor);
+    if (uncompressedLength != UNKNOWN_UNCOMPRESSED_LENGTH &&
+        uncompressedLength != actualUncompressedLength) {
+      throw std::runtime_error("LZMA2Codec: invalid uncompressed length");
+    }
+  } else {
+    actualUncompressedLength = uncompressedLength;
+    DCHECK_NE(actualUncompressedLength, UNKNOWN_UNCOMPRESSED_LENGTH);
+  }
+
+  auto out = addOutputBuffer(
+      &stream,
+      (actualUncompressedLength <= maxSingleStepLength ?
+       actualUncompressedLength :
+       defaultBufferLength));
+
+  bool streamEnd = false;
+  auto buf = cursor.peek();
+  while (buf.second != 0) {
+    stream.next_in = const_cast<uint8_t*>(buf.first);
+    stream.avail_in = buf.second;
+
+    while (stream.avail_in != 0) {
+      if (streamEnd) {
+        throw std::runtime_error(to<std::string>(
+            "LZMA2Codec: junk after end of data"));
+      }
+
+      streamEnd = doInflate(&stream, out.get(), defaultBufferLength);
+    }
+
+    cursor.skip(buf.second);
+    buf = cursor.peek();
+  }
+
+  while (!streamEnd) {
+    streamEnd = doInflate(&stream, out.get(), defaultBufferLength);
+  }
+
+  out->prev()->trimEnd(stream.avail_out);
+
+  if (actualUncompressedLength != stream.total_out) {
+    throw std::runtime_error(to<std::string>(
+        "LZMA2Codec: invalid uncompressed length"));
+  }
+
+  return out;
+}
+
+
 typedef std::unique_ptr<Codec> (*CodecFactory)(int, CodecType);
 
 CodecFactory gCodecFactories[
@@ -633,7 +871,9 @@ CodecFactory gCodecFactories[
   LZ4Codec::create,
   SnappyCodec::create,
   ZlibCodec::create,
-  LZ4Codec::create
+  LZ4Codec::create,
+  LZMA2Codec::create,
+  LZMA2Codec::create,
 };
 
 }  // namespace
index 1edba5e6f3e07435378476935cb9b72c2c222e3c..8a13b76f1e6687435ec3976307d1adf8b34459fb 100644 (file)
@@ -66,7 +66,14 @@ enum class CodecType {
    */
   LZ4_VARINT_SIZE = 5,
 
-  NUM_CODEC_TYPES = 6,
+  /**
+   * Use LZMA2 compression.
+   * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
+   */
+  LZMA2 = 6,
+  LZMA2_VARINT_SIZE = 7,
+
+  NUM_CODEC_TYPES = 8,
 };
 
 class Codec {
index 27a6b9fdccde103bdff6a12a4310d109b221e3e5..86461bced713bb4d438e6b6a0c17e9bf7958c168 100644 (file)
@@ -126,6 +126,9 @@ TEST(CompressionTestNeedsUncompressedLength, Simple) {
   EXPECT_FALSE(getCodec(CodecType::SNAPPY)->needsUncompressedLength());
   EXPECT_FALSE(getCodec(CodecType::ZLIB)->needsUncompressedLength());
   EXPECT_FALSE(getCodec(CodecType::LZ4_VARINT_SIZE)->needsUncompressedLength());
+  EXPECT_TRUE(getCodec(CodecType::LZMA2)->needsUncompressedLength());
+  EXPECT_FALSE(getCodec(CodecType::LZMA2_VARINT_SIZE)
+    ->needsUncompressedLength());
 }
 
 class CompressionTest : public testing::TestWithParam<
@@ -176,7 +179,9 @@ INSTANTIATE_TEST_CASE_P(
                         CodecType::LZ4,
                         CodecType::SNAPPY,
                         CodecType::ZLIB,
-                        CodecType::LZ4_VARINT_SIZE)));
+                        CodecType::LZ4_VARINT_SIZE,
+                        CodecType::LZMA2,
+                        CodecType::LZMA2_VARINT_SIZE)));
 
 class CompressionCorruptionTest : public testing::TestWithParam<CodecType> {
  protected: