2 * Copyright 2013-present Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
25 #include <folly/Optional.h>
26 #include <folly/Range.h>
27 #include <folly/compression/Counters.h>
28 #include <folly/io/IOBuf.h>
31 * Compression / decompression over IOBufs
37 enum class CodecType {
39 * This codec type is not defined; getCodec() will throw an exception
40 * if used. Useful if deriving your own classes from Codec without
41 * going through the getCodec() interface.
52 * Use LZ4 compression.
53 * Levels supported: 1 = fast, 2 = best; default = 1
58 * Use Snappy compression.
64 * Use zlib compression.
65 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
70 * Use LZ4 compression, prefixed with size (as Varint).
75 * Use LZMA2 compression.
76 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
79 LZMA2_VARINT_SIZE = 7,
82 * Use ZSTD compression.
87 * Use gzip compression. This is the same compression algorithm as ZLIB but
88 * gzip-compressed files tend to be easier to work with from the command line.
89 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
94 * Use LZ4 frame compression.
95 * Levels supported: 0 = fast, 16 = best; default = 0
100 * Use bzip2 compression.
101 * Levels supported: 1 = fast, 9 = best; default = 9
105 NUM_CODEC_TYPES = 12,
// Sentinel meaning "no limit on the uncompressed length". uint64_t(-1) wraps
// to the largest value representable in a uint64_t.
112 static constexpr uint64_t UNLIMITED_UNCOMPRESSED_LENGTH = uint64_t(-1);
114 * Return the maximum length of data that may be compressed with this codec.
115 * NO_COMPRESSION and ZLIB support arbitrary lengths;
116 * LZ4 supports up to 1.9GiB; SNAPPY supports up to 4GiB.
117 * May return UNLIMITED_UNCOMPRESSED_LENGTH if unlimited.
119 uint64_t maxUncompressedLength() const;
122 * Return the codec's type.
124 CodecType type() const { return type_; }
127 * Does this codec need the exact uncompressed length on decompression?
129 bool needsUncompressedLength() const;
132 * Compress data, returning an IOBuf (which may share storage with data).
133 * Throws std::invalid_argument if data is larger than
134 * maxUncompressedLength().
136 std::unique_ptr<IOBuf> compress(const folly::IOBuf* data);
139 * Compresses data. May involve additional copies compared to the overload
140 * that takes and returns IOBufs. Has the same error semantics as the IOBuf
143 std::string compress(StringPiece data);
146 * Uncompress data. Throws std::runtime_error on decompression error.
148 * Some codecs (LZ4) require the exact uncompressed length; this is indicated
149 * by needsUncompressedLength().
151 * For other codecs (zlib), knowing the exact uncompressed length ahead of
152 * time might be faster.
154 * Regardless of the behavior of the underlying compressor, uncompressing
155 * an empty IOBuf chain will return an empty IOBuf chain.
157 std::unique_ptr<IOBuf> uncompress(
159 folly::Optional<uint64_t> uncompressedLength = folly::none);
162 * Uncompresses data. May involve additional copies compared to the overload
163 * that takes and returns IOBufs. Has the same error semantics as the IOBuf
166 std::string uncompress(
168 folly::Optional<uint64_t> uncompressedLength = folly::none);
171 * Returns a bound on the maximum compressed length when compressing data with
172 * the given uncompressed length.
174 uint64_t maxCompressedLength(uint64_t uncompressedLength) const;
177 * Extracts the uncompressed length from the compressed data if possible.
178 * If the codec doesn't store the uncompressed length, or the data is
179 * corrupted, it returns the given uncompressedLength.
180 * If the uncompressed length is stored in the compressed data and
181 * uncompressedLength is not none and they do not match a std::runtime_error
184 folly::Optional<uint64_t> getUncompressedLength(
185 const folly::IOBuf* data,
186 folly::Optional<uint64_t> uncompressedLength = folly::none) const;
191 folly::Optional<int> level = folly::none,
192 folly::StringPiece name = {},
193 bool counters = true);
197 * Returns a superset of the set of prefixes for which canUncompress() will
198 * return true. A superset is allowed for optimizations in canUncompress()
199 * based on other knowledge such as length. None of the prefixes may be empty.
200 * default: No prefixes.
202 virtual std::vector<std::string> validPrefixes() const;
205 * Returns true if the codec thinks it can uncompress the data.
206 * If a codec doesn't have magic bytes at the beginning, like LZ4 and Snappy,
207 * it can always return false.
208 * default: Returns false.
210 virtual bool canUncompress(
211 const folly::IOBuf* data,
212 folly::Optional<uint64_t> uncompressedLength = folly::none) const;
215 // default: no limits (save for special value UNLIMITED_UNCOMPRESSED_LENGTH)
216 virtual uint64_t doMaxUncompressedLength() const;
217 // default: doesn't need uncompressed length
218 virtual bool doNeedsUncompressedLength() const;
// Codec-specific compression and decompression hooks. Both are pure virtual,
// so there is no default and every concrete codec must implement them.
// NOTE(review): presumably invoked by the public compress() / uncompress()
// wrappers -- confirm against the implementation.
219 virtual std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) = 0;
220 virtual std::unique_ptr<IOBuf> doUncompress(
221 const folly::IOBuf* data,
222 folly::Optional<uint64_t> uncompressedLength) = 0;
223 // default: an implementation is provided by default to wrap the strings into
224 // IOBufs and delegate to the IOBuf methods. This incurs a copy of the output
225 // from IOBuf to string. Implementers, at their discretion, can override
226 // these methods to avoid the copy.
227 virtual std::string doCompressString(StringPiece data);
228 virtual std::string doUncompressString(
230 folly::Optional<uint64_t> uncompressedLength);
// Pure virtual: no default implementation is provided.
// NOTE(review): presumably the codec-specific hook behind
// maxCompressedLength(), i.e. a bound on the compressed size for the given
// uncompressed length -- confirm against the implementation.
232 virtual uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const = 0;
233 // default: returns the passed uncompressedLength.
234 virtual folly::Optional<uint64_t> doGetUncompressedLength(
235 const folly::IOBuf* data,
236 folly::Optional<uint64_t> uncompressedLength) const;
// Statistics counters held by the codec.
// NOTE(review): judging by the names alone, these track byte totals before
// and after each operation, the number of operations, and elapsed time in
// milliseconds, for compression and decompression respectively. Whether they
// are gated by the constructor's `counters` flag is not visible here --
// confirm against the implementation.
239 folly::detail::CompressionCounter bytesBeforeCompression_;
240 folly::detail::CompressionCounter bytesAfterCompression_;
241 folly::detail::CompressionCounter bytesBeforeDecompression_;
242 folly::detail::CompressionCounter bytesAfterDecompression_;
243 folly::detail::CompressionCounter compressions_;
244 folly::detail::CompressionCounter decompressions_;
245 folly::detail::CompressionCounter compressionMilliseconds_;
246 folly::detail::CompressionCounter decompressionMilliseconds_;
249 class StreamCodec : public Codec {
// Empty destructor. The `override` specifier requires the base Codec
// destructor to be virtual.
251 ~StreamCodec() override {}
254 * Does the codec need the data length before compression streaming?
256 bool needsDataLength() const;
258 /*****************************************************************************
260 *****************************************************************************
261 * A low-level stateful streaming API.
262 * Streaming operations can be started in two ways:
263 * 1. From a clean Codec on which no non-const methods have been called.
264 * 2. A call to resetStream(), which will reset any codec to a clean state.
265 * After a streaming operation has begun, either compressStream() or
266 * uncompressStream() must be called until the streaming operation ends.
267 * compressStream() ends when it returns true with flushOp END.
268 * uncompressStream() ends when it returns true. At this point the codec
269 * may be reused by calling resetStream().
271 * compress() and uncompress() can be called at any time, but they interrupt
272 * any ongoing streaming operations (state is lost and resetStream() must be
273 * called before another streaming operation).
277 * Reset the state of the codec, and set the uncompressed length for the next
278 * streaming operation. If uncompressedLength is not none it must be exactly
279 * the uncompressed length. compressStream() must be passed exactly
280 * uncompressedLength input bytes before the stream is ended.
281 * uncompressStream() must be passed a compressed frame that uncompresses to
282 * uncompressedLength.
284 void resetStream(folly::Optional<uint64_t> uncompressedLength = folly::none);
// Flush mode passed to compressStream() and uncompressStream():
//   NONE  - the default; the codec has discretion over how much data to
//           gather (compression) or flush (uncompression).
//   FLUSH - compression: flush all pending and input data without ending the
//           frame; uncompression: make the maximum possible forward progress.
//   END   - compression: flush everything and end the frame; uncompression:
//           the caller guarantees that no more input will be presented.
286 enum class FlushOp { NONE, FLUSH, END };
289 * Compresses some data from the input buffer and writes the compressed data
290 * into the output buffer. It may read input without producing any output,
291 * except when forced to flush.
293 * The input buffer is advanced to point to the range of data that hasn't yet
294 * been read. Compression will resume at this point for the next call to
295 * compressStream(). The output buffer is advanced one byte past the last byte
298 * The default flushOp is NONE, which allows compressStream() complete
299 * discretion in how much data to gather before writing any output.
301 * If flushOp is END, all pending and input data is flushed to the output
302 * buffer, and the frame is ended. compressStream() must be called with the
303 * same input and flushOp END until it returns true. At this point the caller
304 * must call resetStream() to use the codec again.
306 * If flushOp is FLUSH, all pending and input data is flushed to the output
307 * buffer, but the frame is not ended. compressStream() must be called with
308 * the same input and flushOp FLUSH until it returns true. At this point the
309 * caller can continue to compressStream() with any input data and flushOp.
310 * The uncompressor, if passed all the produced output data, will be able to
311 * uncompress all the input data passed to compressStream() so far. Excessive
312 * use of flushOp FLUSH will deteriorate compression ratio. This is useful for
313 * stateful streaming across a network. Most users don't need to use this
316 * A std::logic_error is thrown on incorrect usage of the API.
317 * A std::runtime_error is thrown upon error conditions or if no forward
318 * progress could be made twice in a row.
321 folly::ByteRange& input,
322 folly::MutableByteRange& output,
323 FlushOp flushOp = StreamCodec::FlushOp::NONE);
326 * Uncompresses some data from the input buffer and writes the uncompressed
327 * data into the output buffer. It may read input without producing any
330 * The input buffer is advanced to point to the range of data that hasn't yet
331 * been read. Uncompression will resume at this point for the next call to
332 * uncompressStream(). The output buffer is advanced one byte past the last
335 * The default flushOp is NONE, which allows uncompressStream() complete
336 * discretion in how much output data to flush. The uncompressor may not make
337 * maximum forward progress, but will make some forward progress when
340 * If flushOp is END, the caller guarantees that no more input will be
341 * presented to uncompressStream(). uncompressStream() must be called with the
342 * same input and flushOp END until it returns true. Using END is optional,
343 * but if the input is all available in one buffer, and there is enough output
344 * space to write the entire frame, codecs can uncompress faster.
346 * If flushOp is FLUSH, uncompressStream() is guaranteed to make the maximum
347 * amount of forward progress possible. When using this flushOp and
348 * uncompressStream() returns with `!output.empty()` the caller knows that all
349 * pending output has been flushed. This is useful for stateful streaming
350 * across a network, and it should be used in conjunction with
351 * compressStream() with flushOp FLUSH. Most users don't need to use this
354 * A std::runtime_error is thrown upon error conditions or if no forward
355 * progress could be made upon two consecutive calls to the function (only the
356 * second call will throw an exception).
358 * Returns true at the end of a frame. At this point resetStream() must be
359 * called to reuse the codec.
361 bool uncompressStream(
362 folly::ByteRange& input,
363 folly::MutableByteRange& output,
364 FlushOp flushOp = StreamCodec::FlushOp::NONE);
369 folly::Optional<int> level = folly::none,
370 folly::StringPiece name = {},
371 bool counters = true)
372 : Codec(type, std::move(level), name, counters) {}
374 // Returns the uncompressed length last passed to resetStream() or none if it
375 // hasn't been called yet.
376 folly::Optional<uint64_t> uncompressedLength() const {
377 return uncompressedLength_;
381 // default: Implemented using the streaming API.
382 std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) override;
383 std::unique_ptr<IOBuf> doUncompress(
384 const folly::IOBuf* data,
385 folly::Optional<uint64_t> uncompressedLength) override;
387 // default: Returns false
388 virtual bool doNeedsDataLength() const;
// Codec-specific streaming hooks. All three are pure virtual, so a concrete
// StreamCodec must implement each of them.
// NOTE(review): presumably invoked by resetStream(), compressStream() and
// uncompressStream() respectively, with the same input/output/flushOp
// contract as those public methods -- confirm against the implementation.
389 virtual void doResetStream() = 0;
390 virtual bool doCompressStream(
391 folly::ByteRange& input,
392 folly::MutableByteRange& output,
393 FlushOp flushOp) = 0;
394 virtual bool doUncompressStream(
395 folly::ByteRange& input,
396 folly::MutableByteRange& output,
397 FlushOp flushOp) = 0;
// Checks the current streaming state against `expected`.
// NOTE(review): the failure behaviour is defined in the implementation --
// presumably the std::logic_error documented for incorrect API usage; confirm.
407 void assertStateIs(State expected) const;
// Current streaming state; a newly constructed codec starts in State::RESET.
409 State state_{State::RESET};
// NOTE(review): presumably the input range seen by the previous streaming
// call, used to enforce the "same input" rule while flushing -- confirm.
410 ByteRange previousInput_{};
// Value returned by uncompressedLength(); empty until a length is supplied.
411 folly::Optional<uint64_t> uncompressedLength_{};
// Starts out true. NOTE(review): presumably records whether the last
// streaming call made forward progress, backing the "no forward progress
// twice in a row" std::runtime_error -- confirm.
412 bool progressMade_{true};
// Symbolic compression levels that getCodec() and getStreamCodec() accept in
// place of a codec-specific level. All three are negative, whereas
// codec-specific levels are documented as non-negative, so the two ranges
// never overlap.
415 constexpr int COMPRESSION_LEVEL_FASTEST = -1;
416 constexpr int COMPRESSION_LEVEL_DEFAULT = -2;
417 constexpr int COMPRESSION_LEVEL_BEST = -3;
420 * Return a codec for the given type. Throws on error. The level
421 * is a non-negative codec-dependent integer indicating the level of
422 * compression desired, or one of the following constants:
424 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
426 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
428 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
431 * When decompressing, the compression level is ignored. All codecs will
432 * decompress all data compressed with a codec of the same type, regardless
433 * of compression level.
435 std::unique_ptr<Codec> getCodec(
437 int level = COMPRESSION_LEVEL_DEFAULT);
440 * Return a codec for the given type. Throws on error. The level
441 * is a non-negative codec-dependent integer indicating the level of
442 * compression desired, or one of the following constants:
444 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
446 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
448 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
451 * When decompressing, the compression level is ignored. All codecs will
452 * decompress all data compressed with a codec of the same type, regardless
453 * of compression level.
455 std::unique_ptr<StreamCodec> getStreamCodec(
457 int level = COMPRESSION_LEVEL_DEFAULT);
460 * Returns a codec that can uncompress any of the given codec types as well as
461 * {LZ4_FRAME, ZSTD, ZLIB, GZIP, LZMA2, BZIP2}. Appends each default codec to
462 * customCodecs in order, so long as a codec with the same type() isn't already
463 * present in customCodecs or as the terminalCodec. When uncompress() is called,
464 * each codec's canUncompress() is called in the order that they are given.
465 * Appended default codecs are checked last. uncompress() is called on the
466 * first codec whose canUncompress() returns true.
468 * In addition, an optional `terminalCodec` can be provided. This codec's
469 * uncompress() will be called either when no other codec canUncompress() the
470 * data or the chosen codec throws an exception on the data. The terminalCodec
471 * is intended for ambiguous headers, when canUncompress() is false for some
472 * data it can actually uncompress. The terminalCodec does not need to override
473 * validPrefixes() or canUncompress() and overriding these functions will have
474 * no effect on the returned codec's validPrefixes() or canUncompress()
475 * functions. The terminalCodec's needsUncompressedLength() and
476 * maxUncompressedLength() will affect the returned codec's respective
477 * functions. The terminalCodec must not be duplicated in customCodecs.
479 * An exception is thrown if no codec canUncompress() the data and either no
480 * terminal codec was provided or a terminal codec was provided and it throws on
482 * An exception is thrown if the chosen codec's uncompress() throws on the data
483 * and either no terminal codec was provided or a terminal codec was provided
484 * and it also throws on the data.
485 * An exception is thrown if compress() is called on the returned codec.
487 * Requirements are checked in debug mode and are as follows:
488 * Let headers be the concatenation of every codec's validPrefixes().
489 * 1. Each codec must override validPrefixes() and canUncompress().
490 * 2. No codec's validPrefixes() may be empty.
491 * 3. No header in headers may be empty.
492 * 4. headers must not contain any duplicate elements.
493 * 5. No strict non-empty prefix of any header in headers may be in headers.
494 * 6. The terminalCodec's type must not be the same as any other codec's type
495 * (with USER_DEFINED being the exception).
497 std::unique_ptr<Codec> getAutoUncompressionCodec(
498 std::vector<std::unique_ptr<Codec>> customCodecs = {},
499 std::unique_ptr<Codec> terminalCodec = {});
502 * Check if a specified codec is supported.
504 bool hasCodec(CodecType type);
507 * Check if a specified codec is supported and supports streaming.
509 bool hasStreamCodec(CodecType type);