2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
25 #include <folly/Optional.h>
26 #include <folly/Range.h>
27 #include <folly/io/IOBuf.h>
30 * Compression / decompression over IOBufs
36 enum class CodecType {
38 * This codec type is not defined; getCodec() will throw an exception
39 * if used. Useful if deriving your own classes from Codec without
40 * going through the getCodec() interface.
51 * Use LZ4 compression.
52 * Levels supported: 1 = fast, 2 = best; default = 1
57 * Use Snappy compression.
63 * Use zlib compression.
64 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
69 * Use LZ4 compression, prefixed with size (as Varint).
74 * Use LZMA2 compression.
75 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
78 LZMA2_VARINT_SIZE = 7,
81 * Use ZSTD compression.
86 * Use gzip compression. This is the same compression algorithm as ZLIB but
87 * gzip-compressed files tend to be easier to work with from the command line.
88 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
93 * Use LZ4 frame compression.
94 * Levels supported: 0 = fast, 16 = best; default = 0
99 * Use bzip2 compression.
100 * Levels supported: 1 = fast, 9 = best; default = 9
104 NUM_CODEC_TYPES = 12,
// Sentinel for "no limit": uint64_t(-1) wraps around to the largest uint64_t.
// maxUncompressedLength() may return it when a codec accepts any input size.
111 static constexpr uint64_t UNLIMITED_UNCOMPRESSED_LENGTH = uint64_t(-1);
113 * Return the maximum length of data that may be compressed with this codec.
114 * NO_COMPRESSION and ZLIB support arbitrary lengths;
115 * LZ4 supports up to 1.9GiB; SNAPPY supports up to 4GiB.
116 * May return UNLIMITED_UNCOMPRESSED_LENGTH if unlimited.
118 uint64_t maxUncompressedLength() const;
121 * Return the codec's type.
// Defined inline; hands back the type_ member by value.
123 CodecType type() const { return type_; }
126 * Does this codec need the exact uncompressed length on decompression?
128 bool needsUncompressedLength() const;
131 * Compress data, returning an IOBuf (which may share storage with data).
132 * Throws std::invalid_argument if data is larger than
133 * maxUncompressedLength().
135 * Regardless of the behavior of the underlying compressor, compressing
136 * an empty IOBuf chain will return an empty IOBuf chain.
138 std::unique_ptr<IOBuf> compress(const folly::IOBuf* data);
141 * Compresses data. May involve additional copies compared to the overload
142 * that takes and returns IOBufs. Has the same error semantics as the IOBuf
145 std::string compress(StringPiece data);
148 * Uncompress data. Throws std::runtime_error on decompression error.
150 * Some codecs (LZ4) require the exact uncompressed length; this is indicated
151 * by needsUncompressedLength().
153 * For other codecs (zlib), knowing the exact uncompressed length ahead of
154 * time might be faster.
156 * Regardless of the behavior of the underlying compressor, uncompressing
157 * an empty IOBuf chain will return an empty IOBuf chain.
159 std::unique_ptr<IOBuf> uncompress(
161 folly::Optional<uint64_t> uncompressedLength = folly::none);
164 * Uncompresses data. May involve additional copies compared to the overload
165 * that takes and returns IOBufs. Has the same error semantics as the IOBuf
168 std::string uncompress(
170 folly::Optional<uint64_t> uncompressedLength = folly::none);
173 * Returns a bound on the maximum compressed length when compressing data with
174 * the given uncompressed length.
176 uint64_t maxCompressedLength(uint64_t uncompressedLength) const;
179 * Extracts the uncompressed length from the compressed data if possible.
180 * If the codec doesn't store the uncompressed length, or the data is
181 * corrupted it returns the given uncompressedLength.
182 * If the uncompressed length is stored in the compressed data and
183 * uncompressedLength is not none and they do not match a std::runtime_error
186 folly::Optional<uint64_t> getUncompressedLength(
187 const folly::IOBuf* data,
188 folly::Optional<uint64_t> uncompressedLength = folly::none) const;
191 explicit Codec(CodecType type);
195 * Returns a superset of the set of prefixes for which canUncompress() will
196 * return true. A superset is allowed for optimizations in canUncompress()
197 * based on other knowledge such as length. None of the prefixes may be empty.
198 * default: No prefixes.
200 virtual std::vector<std::string> validPrefixes() const;
203 * Returns true if the codec thinks it can uncompress the data.
204 * If a codec doesn't have magic bytes at the beginning, like LZ4 and Snappy,
205 * it can always return false.
206 * default: Returns false.
208 virtual bool canUncompress(
209 const folly::IOBuf* data,
210 folly::Optional<uint64_t> uncompressedLength = folly::none) const;
213 // default: no limits (save for special value UNLIMITED_UNCOMPRESSED_LENGTH)
214 virtual uint64_t doMaxUncompressedLength() const;
215 // default: doesn't need uncompressed length
216 virtual bool doNeedsUncompressedLength() const;
// Codec-specific compression and decompression hooks. Both are pure virtual,
// so a concrete codec has to implement them.
// NOTE(review): presumably reached through the public compress()/uncompress()
// entry points -- confirm against the implementation file.
217 virtual std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) = 0;
// Unlike the public uncompress(), uncompressedLength has no default value here.
218 virtual std::unique_ptr<IOBuf> doUncompress(
219 const folly::IOBuf* data,
220 folly::Optional<uint64_t> uncompressedLength) = 0;
221 // default: an implementation is provided by default to wrap the strings into
222 // IOBufs and delegate to the IOBuf methods. This incurs a copy of the output
223 // from IOBuf to string. Implementers, at their discretion, can override
224 // these methods to avoid the copy.
225 virtual std::string doCompressString(StringPiece data);
226 virtual std::string doUncompressString(
228 folly::Optional<uint64_t> uncompressedLength);
// Pure virtual: every codec supplies its own bound for the given length.
// NOTE(review): presumably the hook behind maxCompressedLength() -- confirm.
230 virtual uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const = 0;
231 // default: returns the passed uncompressedLength.
232 virtual folly::Optional<uint64_t> doGetUncompressedLength(
233 const folly::IOBuf* data,
234 folly::Optional<uint64_t> uncompressedLength) const;
239 class StreamCodec : public Codec {
// Empty body: nothing is released here. `override` makes the compiler verify
// that the Codec base class declares a virtual destructor.
241 ~StreamCodec() override {}
244 * Does the codec need the data length before compression streaming?
246 bool needsDataLength() const;
248 /*****************************************************************************
250 *****************************************************************************
251 * A low-level stateful streaming API.
252 * Streaming operations can be started in two ways:
253 * 1. From a clean Codec on which no non-const methods have been called.
254 * 2. A call to resetStream(), which will reset any codec to a clean state.
255 * After a streaming operation has begun, either compressStream() or
256 * uncompressStream() must be called until the streaming operation ends.
257 * compressStream() ends when it returns true with flushOp END.
258 * uncompressStream() ends when it returns true. At this point the codec
259 * may be reused by calling resetStream().
261 * compress() and uncompress() can be called at any time, but they interrupt
262 * any ongoing streaming operations (state is lost and resetStream() must be
263 * called before another streaming operation).
267 * Reset the state of the codec, and set the uncompressed length for the next
268 * streaming operation. If uncompressedLength is not none it must be exactly
269 * the uncompressed length. compressStream() must be passed exactly
270 * uncompressedLength input bytes before the stream is ended.
271 * uncompressStream() must be passed a compressed frame that uncompresses to
272 * uncompressedLength.
274 void resetStream(folly::Optional<uint64_t> uncompressedLength = folly::none);
// Flush directive for compressStream()/uncompressStream():
//   NONE  - the codec decides how much to buffer before producing output.
//   FLUSH - pending data is pushed to the output; the frame stays open.
//   END   - no further input follows; when compressing, the frame is ended.
276 enum class FlushOp { NONE, FLUSH, END };
279 * Compresses some data from the input buffer and writes the compressed data
280 * into the output buffer. It may read input without producing any output,
281 * except when forced to flush.
283 * The input buffer is advanced to point to the range of data that hasn't yet
284 * been read. Compression will resume at this point for the next call to
285 * compressStream(). The output buffer is advanced one byte past the last byte
288 * The default flushOp is NONE, which allows compressStream() complete
289 * discretion in how much data to gather before writing any output.
291 * If flushOp is END, all pending and input data is flushed to the output
292 * buffer, and the frame is ended. compressStream() must be called with the
293 * same input and flushOp END until it returns true. At this point the caller
294 * must call resetStream() to use the codec again.
296 * If flushOp is FLUSH, all pending and input data is flushed to the output
297 * buffer, but the frame is not ended. compressStream() must be called with
298 * the same input and flushOp FLUSH until it returns true. At this point the
299 * caller can continue to compressStream() with any input data and flushOp.
300 * The uncompressor, if passed all the produced output data, will be able to
301 * uncompress all the input data passed to compressStream() so far. Excessive
302 * use of flushOp FLUSH will deteriorate compression ratio. This is useful for
303 * stateful streaming across a network. Most users don't need to use this
306 * A std::logic_error is thrown on incorrect usage of the API.
307 * A std::runtime_error is thrown upon error conditions.
310 folly::ByteRange& input,
311 folly::MutableByteRange& output,
312 FlushOp flushOp = StreamCodec::FlushOp::NONE);
315 * Uncompresses some data from the input buffer and writes the uncompressed
316 * data into the output buffer. It may read input without producing any
319 * The input buffer is advanced to point to the range of data that hasn't yet
320 * been read. Uncompression will resume at this point for the next call to
321 * uncompressStream(). The output buffer is advanced one byte past the last
324 * The default flushOp is NONE, which allows uncompressStream() complete
325 * discretion in how much output data to flush. The uncompressor may not make
326 * maximum forward progress, but will make some forward progress when
329 * If flushOp is END, the caller guarantees that no more input will be
330 * presented to uncompressStream(). uncompressStream() must be called with the
331 * same input and flushOp END until it returns true. This is not mandatory,
332 * but if the input is all available in one buffer, and there is enough output
333 * space to write the entire frame, codecs can uncompress faster.
335 * If flushOp is FLUSH, uncompressStream() is guaranteed to make the maximum
336 * amount of forward progress possible. When using this flushOp and
337 * uncompressStream() returns with `!output.empty()` the caller knows that all
338 * pending output has been flushed. This is useful for stateful streaming
339 * across a network, and it should be used in conjunction with
340 * compressStream() with flushOp FLUSH. Most users don't need to use this
343 * Returns true at the end of a frame. At this point resetStream() must be
344 * called to reuse the codec.
346 bool uncompressStream(
347 folly::ByteRange& input,
348 folly::MutableByteRange& output,
349 FlushOp flushOp = StreamCodec::FlushOp::NONE);
// Forwards `type` to the Codec base constructor and does nothing else.
// `explicit` rules out implicit conversion from a bare CodecType.
352 explicit StreamCodec(CodecType type) : Codec(type) {}
354 // Returns the uncompressed length last passed to resetStream() or none if it
355 // hasn't been called yet.
356 folly::Optional<uint64_t> uncompressedLength() const {
357 return uncompressedLength_;
361 // default: Implemented using the streaming API.
362 std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) override;
363 std::unique_ptr<IOBuf> doUncompress(
364 const folly::IOBuf* data,
365 folly::Optional<uint64_t> uncompressedLength) override;
367 // default: Returns false
368 virtual bool doNeedsDataLength() const;
// Streaming hooks. All three are pure virtual, so each concrete stream codec
// must implement them. The two data hooks take the same arguments as the
// public uncompressStream() (input/output ranges by reference plus a FlushOp),
// except that flushOp carries no default value.
// NOTE(review): presumably driven by resetStream(), compressStream() and
// uncompressStream() -- confirm against the implementation file.
369 virtual void doResetStream() = 0;
370 virtual bool doCompressStream(
371 folly::ByteRange& input,
372 folly::MutableByteRange& output,
373 FlushOp flushOp) = 0;
374 virtual bool doUncompressStream(
375 folly::ByteRange& input,
376 folly::MutableByteRange& output,
377 FlushOp flushOp) = 0;
// NOTE(review): by its name this checks that state_ equals `expected`; what
// happens on a mismatch is not visible in this header -- confirm in the .cpp.
387 void assertStateIs(State expected) const;
// Current streaming state; a freshly constructed codec starts in State::RESET.
390 State state_{State::RESET};
// Starts out as an empty range.
// NOTE(review): presumably remembers the input of the previous streaming call
// so the "same input" requirement of the streaming API can be checked --
// confirm against the implementation file.
391 ByteRange previousInput_{};
// Value handed back by uncompressedLength(); empty (none) until it is set.
392 folly::Optional<uint64_t> uncompressedLength_{};
// Symbolic compression levels. All are negative, so they cannot collide with
// the non-negative codec-specific levels that getCodec()/getStreamCodec()
// also accept.
395 constexpr int COMPRESSION_LEVEL_FASTEST = -1;
396 constexpr int COMPRESSION_LEVEL_DEFAULT = -2;
397 constexpr int COMPRESSION_LEVEL_BEST = -3;
400 * Return a codec for the given type. Throws on error. The level
401 * is a non-negative codec-dependent integer indicating the level of
402 * compression desired, or one of the following constants:
404 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
406 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
408 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
411 * When decompressing, the compression level is ignored. All codecs will
412 * decompress all data compressed with a codec of the same type, regardless
413 * of compression level.
415 std::unique_ptr<Codec> getCodec(
417 int level = COMPRESSION_LEVEL_DEFAULT);
420 * Return a codec for the given type. Throws on error. The level
421 * is a non-negative codec-dependent integer indicating the level of
422 * compression desired, or one of the following constants:
424 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
426 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
428 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
431 * When decompressing, the compression level is ignored. All codecs will
432 * decompress all data compressed with a codec of the same type, regardless
433 * of compression level.
435 std::unique_ptr<StreamCodec> getStreamCodec(
437 int level = COMPRESSION_LEVEL_DEFAULT);
440 * Returns a codec that can uncompress any of the given codec types as well as
441 * {LZ4_FRAME, ZSTD, ZLIB, GZIP, LZMA2, BZIP2}. Appends each default codec to
442 * customCodecs in order, so long as a codec with the same type() isn't already
443 * present. When uncompress() is called, each codec's canUncompress() is called
444 * in the order that they are given. Appended default codecs are checked last.
445 * uncompress() is called on the first codec whose canUncompress() returns true.
446 * An exception is thrown if no codec canUncompress() the data.
447 * An exception is thrown if the chosen codec's uncompress() throws on the data.
448 * An exception is thrown if compress() is called on the returned codec.
450 * Requirements are checked in debug mode and are as follows:
451 * Let headers be the concatenation of every codec's validPrefixes().
452 * 1. Each codec must override validPrefixes() and canUncompress().
453 * 2. No codec's validPrefixes() may be empty.
454 * 3. No header in headers may be empty.
455 * 4. headers must not contain any duplicate elements.
456 * 5. No strict non-empty prefix of any header in headers may be in headers.
458 std::unique_ptr<Codec> getAutoUncompressionCodec(
459 std::vector<std::unique_ptr<Codec>> customCodecs = {});
462 * Check if a specified codec is supported.
464 bool hasCodec(CodecType type);
467 * Check if a specified codec is supported and supports streaming.
469 bool hasStreamCodec(CodecType type);