2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
22 #include <glog/logging.h>
24 #if !defined(__GNUC__) && !defined(_MSC_VER)
25 #error GroupVarint.h requires GCC or MSVC
28 #include <folly/Portability.h>
30 #if FOLLY_X64 || defined(__i386__) || FOLLY_PPC64 || FOLLY_AARCH64
31 #define HAVE_GROUP_VARINT 1
33 #include <folly/Range.h>
34 #include <folly/detail/GroupVarintDetail.h>
35 #include <folly/lang/Bits.h>
36 #include <folly/portability/Builtins.h>
39 #include <nmmintrin.h>
42 alignas(16) extern const uint64_t groupVarintSSEMasks[];
49 extern const uint8_t groupVarintLengths[];
59 * GroupVarint encoding for 32-bit values.
61 * Encodes 4 32-bit integers at once, each using 1-4 bytes depending on size.
62 * There is one byte of overhead. (The first byte contains the lengths of
63 * the four integers encoded as two bits each; 00=1 byte .. 11=4 bytes)
65 * This implementation assumes little-endian and does unaligned 32-bit
66 * accesses, so it's basically not portable outside of the x86[_64] world.
69 class GroupVarint<uint32_t> : public detail::GroupVarintBase<uint32_t> {
73 * Return the number of bytes used to encode these four values.
75 static size_t size(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
76 return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d);
80 * Return the number of bytes used to encode four uint32_t values stored
81 * at consecutive positions in an array.
83 static size_t size(const uint32_t* p) {
84 return size(p[0], p[1], p[2], p[3]);
88 * Return the number of bytes used to encode count (<= 4) values.
89 * If you clip a buffer after these many bytes, you can still decode
90 * the first "count" values correctly (if the remaining size() -
91 * partialSize() bytes are filled with garbage).
93 static size_t partialSize(const type* p, size_t count) {
94 DCHECK_LE(count, kGroupSize);
95 size_t s = kHeaderSize + count;
96 for (; count; --count, ++p) {
103 * Return the number of values from *p that are valid from an encoded
104 * buffer of size bytes.
106 static size_t partialCount(const char* p, size_t size) {
107 uint8_t v = uint8_t(*p);
108 size_t s = kHeaderSize;
129 * Given a pointer to the beginning of an GroupVarint32-encoded block,
130 * return the number of bytes used by the encoding.
132 static size_t encodedSize(const char* p) {
133 return kHeaderSize + kGroupSize +
134 b0key(uint8_t(*p)) + b1key(uint8_t(*p)) +
135 b2key(uint8_t(*p)) + b3key(uint8_t(*p));
139 * Encode four uint32_t values into the buffer pointed-to by p, and return
140 * the next position in the buffer (that is, one character past the last
141 * encoded byte). p needs to have at least size()+4 bytes available.
143 static char* encode(char* p, uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
144 uint8_t b0key = key(a);
145 uint8_t b1key = key(b);
146 uint8_t b2key = key(c);
147 uint8_t b3key = key(d);
148 *p++ = (b3key << 6) | (b2key << 4) | (b1key << 2) | b0key;
149 storeUnaligned(p, a);
151 storeUnaligned(p, b);
153 storeUnaligned(p, c);
155 storeUnaligned(p, d);
161 * Encode four uint32_t values from the array pointed-to by src into the
162 * buffer pointed-to by p, similar to encode(p,a,b,c,d) above.
164 static char* encode(char* p, const uint32_t* src) {
165 return encode(p, src[0], src[1], src[2], src[3]);
169 * Decode four uint32_t values from a buffer, and return the next position
170 * in the buffer (that is, one character past the last encoded byte).
171 * The buffer needs to have at least 3 extra bytes available (they
172 * may be read but ignored).
174 static const char* decode_simple(const char* p, uint32_t* a, uint32_t* b,
175 uint32_t* c, uint32_t* d) {
176 size_t k = loadUnaligned<uint8_t>(p);
177 const char* end = p + detail::groupVarintLengths[k];
179 size_t k0 = b0key(k);
180 *a = loadUnaligned<uint32_t>(p) & kMask[k0];
182 size_t k1 = b1key(k);
183 *b = loadUnaligned<uint32_t>(p) & kMask[k1];
185 size_t k2 = b2key(k);
186 *c = loadUnaligned<uint32_t>(p) & kMask[k2];
188 size_t k3 = b3key(k);
189 *d = loadUnaligned<uint32_t>(p) & kMask[k3];
195 * Decode four uint32_t values from a buffer and store them in the array
196 * pointed-to by dest, similar to decode(p,a,b,c,d) above.
198 static const char* decode_simple(const char* p, uint32_t* dest) {
199 return decode_simple(p, dest, dest+1, dest+2, dest+3);
204 * Just like the non-SSSE3 decode below, but with the additional constraint
205 * that we must be able to read at least 17 bytes from the input pointer, p.
207 static const char* decode(const char* p, uint32_t* dest) {
208 uint8_t key = uint8_t(p[0]);
209 __m128i val = _mm_loadu_si128((const __m128i*)(p+1));
211 _mm_load_si128((const __m128i*)&detail::groupVarintSSEMasks[key * 2]);
212 __m128i r = _mm_shuffle_epi8(val, mask);
213 _mm_storeu_si128((__m128i*)dest, r);
214 return p + detail::groupVarintLengths[key];
218 * Just like decode_simple, but with the additional constraint that
219 * we must be able to read at least 17 bytes from the input pointer, p.
221 static const char* decode(const char* p, uint32_t* a, uint32_t* b,
222 uint32_t* c, uint32_t* d) {
223 uint8_t key = uint8_t(p[0]);
224 __m128i val = _mm_loadu_si128((const __m128i*)(p+1));
226 _mm_load_si128((const __m128i*)&detail::groupVarintSSEMasks[key * 2]);
227 __m128i r = _mm_shuffle_epi8(val, mask);
229 // Extracting 32 bits at a time out of an XMM register is a SSE4 feature
231 *a = uint32_t(_mm_extract_epi32(r, 0));
232 *b = uint32_t(_mm_extract_epi32(r, 1));
233 *c = uint32_t(_mm_extract_epi32(r, 2));
234 *d = uint32_t(_mm_extract_epi32(r, 3));
235 #else /* !__SSE4__ */
236 *a = _mm_extract_epi16(r, 0) + (_mm_extract_epi16(r, 1) << 16);
237 *b = _mm_extract_epi16(r, 2) + (_mm_extract_epi16(r, 3) << 16);
238 *c = _mm_extract_epi16(r, 4) + (_mm_extract_epi16(r, 5) << 16);
239 *d = _mm_extract_epi16(r, 6) + (_mm_extract_epi16(r, 7) << 16);
240 #endif /* __SSE4__ */
242 return p + detail::groupVarintLengths[key];
245 #else /* !__SSSE3__ */
246 static const char* decode(const char* p, uint32_t* a, uint32_t* b,
247 uint32_t* c, uint32_t* d) {
248 return decode_simple(p, a, b, c, d);
251 static const char* decode(const char* p, uint32_t* dest) {
252 return decode_simple(p, dest);
254 #endif /* __SSSE3__ */
257 static uint8_t key(uint32_t x) {
258 // __builtin_clz is undefined for the x==0 case
259 return uint8_t(3 - (__builtin_clz(x | 1) / 8));
261 static size_t b0key(size_t x) { return x & 3; }
262 static size_t b1key(size_t x) { return (x >> 2) & 3; }
263 static size_t b2key(size_t x) { return (x >> 4) & 3; }
264 static size_t b3key(size_t x) { return (x >> 6) & 3; }
266 static const uint32_t kMask[];
271 * GroupVarint encoding for 64-bit values.
273 * Encodes 5 64-bit integers at once, each using 1-8 bytes depending on size.
274 * There are two bytes of overhead. (The first two bytes contain the lengths
275 * of the five integers encoded as three bits each; 000=1 byte .. 111 = 8 bytes)
277 * This implementation assumes little-endian and does unaligned 64-bit
278 * accesses, so it's basically not portable outside of the x86[_64] world.
281 class GroupVarint<uint64_t> : public detail::GroupVarintBase<uint64_t> {
284 * Return the number of bytes used to encode these five values.
286 static size_t size(uint64_t a, uint64_t b, uint64_t c, uint64_t d,
288 return kHeaderSize + kGroupSize +
289 key(a) + key(b) + key(c) + key(d) + key(e);
293 * Return the number of bytes used to encode five uint64_t values stored
294 * at consecutive positions in an array.
296 static size_t size(const uint64_t* p) {
297 return size(p[0], p[1], p[2], p[3], p[4]);
301 * Return the number of bytes used to encode count (<= 4) values.
302 * If you clip a buffer after these many bytes, you can still decode
303 * the first "count" values correctly (if the remaining size() -
304 * partialSize() bytes are filled with garbage).
306 static size_t partialSize(const type* p, size_t count) {
307 DCHECK_LE(count, kGroupSize);
308 size_t s = kHeaderSize + count;
309 for (; count; --count, ++p) {
316 * Return the number of values from *p that are valid from an encoded
317 * buffer of size bytes.
319 static size_t partialCount(const char* p, size_t size) {
320 uint16_t v = loadUnaligned<uint16_t>(p);
321 size_t s = kHeaderSize;
346 * Given a pointer to the beginning of an GroupVarint64-encoded block,
347 * return the number of bytes used by the encoding.
349 static size_t encodedSize(const char* p) {
350 uint16_t n = loadUnaligned<uint16_t>(p);
351 return kHeaderSize + kGroupSize +
352 b0key(n) + b1key(n) + b2key(n) + b3key(n) + b4key(n);
356 * Encode five uint64_t values into the buffer pointed-to by p, and return
357 * the next position in the buffer (that is, one character past the last
358 * encoded byte). p needs to have at least size()+8 bytes available.
360 static char* encode(char* p, uint64_t a, uint64_t b, uint64_t c,
361 uint64_t d, uint64_t e) {
362 uint16_t b0key = key(a);
363 uint16_t b1key = key(b);
364 uint16_t b2key = key(c);
365 uint16_t b3key = key(d);
366 uint16_t b4key = key(e);
367 storeUnaligned<uint16_t>(
376 storeUnaligned(p, a);
378 storeUnaligned(p, b);
380 storeUnaligned(p, c);
382 storeUnaligned(p, d);
384 storeUnaligned(p, e);
390 * Encode five uint64_t values from the array pointed-to by src into the
391 * buffer pointed-to by p, similar to encode(p,a,b,c,d,e) above.
393 static char* encode(char* p, const uint64_t* src) {
394 return encode(p, src[0], src[1], src[2], src[3], src[4]);
398 * Decode five uint64_t values from a buffer, and return the next position
399 * in the buffer (that is, one character past the last encoded byte).
400 * The buffer needs to have at least 7 bytes available (they may be read
403 static const char* decode(const char* p, uint64_t* a, uint64_t* b,
404 uint64_t* c, uint64_t* d, uint64_t* e) {
405 uint16_t k = loadUnaligned<uint16_t>(p);
407 uint8_t k0 = b0key(k);
408 *a = loadUnaligned<uint64_t>(p) & kMask[k0];
410 uint8_t k1 = b1key(k);
411 *b = loadUnaligned<uint64_t>(p) & kMask[k1];
413 uint8_t k2 = b2key(k);
414 *c = loadUnaligned<uint64_t>(p) & kMask[k2];
416 uint8_t k3 = b3key(k);
417 *d = loadUnaligned<uint64_t>(p) & kMask[k3];
419 uint8_t k4 = b4key(k);
420 *e = loadUnaligned<uint64_t>(p) & kMask[k4];
426 * Decode five uint64_t values from a buffer and store them in the array
427 * pointed-to by dest, similar to decode(p,a,b,c,d,e) above.
429 static const char* decode(const char* p, uint64_t* dest) {
430 return decode(p, dest, dest+1, dest+2, dest+3, dest+4);
434 enum { kHeaderBytes = 2 };
436 static uint8_t key(uint64_t x) {
437 // __builtin_clzll is undefined for the x==0 case
438 return uint8_t(7 - (__builtin_clzll(x | 1) / 8));
441 static uint8_t b0key(uint16_t x) { return x & 7u; }
442 static uint8_t b1key(uint16_t x) { return (x >> 3) & 7u; }
443 static uint8_t b2key(uint16_t x) { return (x >> 6) & 7u; }
444 static uint8_t b3key(uint16_t x) { return (x >> 9) & 7u; }
445 static uint8_t b4key(uint16_t x) { return (x >> 12) & 7u; }
447 static const uint64_t kMask[];
450 typedef GroupVarint<uint32_t> GroupVarint32;
451 typedef GroupVarint<uint64_t> GroupVarint64;
454 * Simplify use of GroupVarint* for the case where data is available one
455 * entry at a time (instead of one group at a time). Handles buffering
456 * and an incomplete last chunk.
458 * Output is a function object that accepts character ranges:
459 * out(StringPiece) appends the given character range to the output.
461 template <class T, class Output>
462 class GroupVarintEncoder {
464 typedef GroupVarint<T> Base;
467 explicit GroupVarintEncoder(Output out)
472 ~GroupVarintEncoder() {
477 * Add a value to the encoder.
480 buf_[count_++] = val;
481 if (count_ == Base::kGroupSize) {
482 char* p = Base::encode(tmp_, buf_);
483 out_(StringPiece(tmp_, p));
489 * Finish encoding, flushing any buffered values if necessary.
490 * After finish(), the encoder is immediately ready to encode more data
491 * to the same output.
495 // This is not strictly necessary, but it makes testing easy;
496 // uninitialized bytes are guaranteed to be recorded as taking one byte
498 for (size_t i = count_; i < Base::kGroupSize; i++) {
501 Base::encode(tmp_, buf_);
502 out_(StringPiece(tmp_, Base::partialSize(buf_, count_)));
508 * Return the appender that was used.
513 const Output& output() const {
518 * Reset the encoder, disregarding any state (except what was already
519 * flushed to the output, of course).
527 char tmp_[Base::kMaxSize];
528 type buf_[Base::kGroupSize];
533 * Simplify use of GroupVarint* for the case where the last group in the
534 * input may be incomplete (but the exact size of the input is known).
535 * Allows for extracting values one at a time.
537 template <typename T>
538 class GroupVarintDecoder {
540 typedef GroupVarint<T> Base;
543 GroupVarintDecoder() = default;
545 explicit GroupVarintDecoder(StringPiece data,
546 size_t maxCount = (size_t)-1)
547 : rrest_(data.end()),
553 remaining_(maxCount) {
556 void reset(StringPiece data, size_t maxCount = (size_t)-1) {
563 remaining_ = maxCount;
567 * Read and return the next value.
569 bool next(type* val) {
570 if (pos_ == count_) {
572 size_t rem = size_t(end_ - p_);
573 if (rem == 0 || remaining_ == 0) {
576 // next() attempts to read one full group at a time, and so we must have
577 // at least enough bytes readable after its end to handle the case if the
578 // last group is full.
580 // The best way to ensure this is to ensure that data has at least
581 // Base::kMaxSize - 1 bytes readable *after* the end, otherwise we'll copy
582 // into a temporary buffer.
583 if (limit_ - p_ < Base::kMaxSize) {
584 memcpy(tmp_, p_, rem);
587 limit_ = tmp_ + sizeof(tmp_);
590 const char* n = Base::decode(p_, buf_);
592 // Full group could be decoded
593 if (remaining_ >= Base::kGroupSize) {
594 remaining_ -= Base::kGroupSize;
595 count_ = Base::kGroupSize;
600 p_ += Base::partialSize(buf_, count_);
603 // Can't decode a full group
604 count_ = Base::partialCount(p_, size_t(end_ - p_));
605 if (remaining_ >= count_) {
606 remaining_ -= count_;
611 p_ += Base::partialSize(buf_, count_);
622 StringPiece rest() const {
623 // This is only valid after next() returned false
624 CHECK(pos_ == count_ && (p_ == end_ || remaining_ == 0));
625 // p_ may point to the internal buffer (tmp_), but we want
626 // to return subpiece of the original data
627 size_t size = size_t(end_ - p_);
628 return StringPiece(rrest_ - size, rrest_);
636 char tmp_[2 * Base::kMaxSize];
637 type buf_[Base::kGroupSize];
643 typedef GroupVarintDecoder<uint32_t> GroupVarint32Decoder;
644 typedef GroupVarintDecoder<uint64_t> GroupVarint64Decoder;
648 #endif /* FOLLY_X64 || defined(__i386__) || FOLLY_PPC64 */