2 * Copyright 2014 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #ifndef FOLLY_GROUPVARINT_H_
18 #define FOLLY_GROUPVARINT_H_
21 #error GroupVarint.h requires GCC
24 #include <folly/Portability.h>
26 #if FOLLY_X64 || defined(__i386__)
27 #define HAVE_GROUP_VARINT 1
31 #include <folly/detail/GroupVarintDetail.h>
32 #include <folly/Bits.h>
33 #include <folly/Range.h>
34 #include <glog/logging.h>
37 #include <x86intrin.h>
40 extern const __m128i groupVarintSSEMasks[];
47 extern const uint8_t groupVarintLengths[];
57 * GroupVarint encoding for 32-bit values.
59 * Encodes 4 32-bit integers at once, each using 1-4 bytes depending on size.
60 * There is one byte of overhead. (The first byte contains the lengths of
61 * the four integers encoded as two bits each; 00=1 byte .. 11=4 bytes)
63 * This implementation assumes little-endian and does unaligned 32-bit
64 * accesses, so it's basically not portable outside of the x86[_64] world.
67 class GroupVarint<uint32_t> : public detail::GroupVarintBase<uint32_t> {
71 * Return the number of bytes used to encode these four values.
73 static size_t size(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
74 return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d);
78 * Return the number of bytes used to encode four uint32_t values stored
79 * at consecutive positions in an array.
81 static size_t size(const uint32_t* p) {
82 return size(p[0], p[1], p[2], p[3]);
86 * Return the number of bytes used to encode count (<= 4) values.
87 * If you clip a buffer after these many bytes, you can still decode
88 * the first "count" values correctly (if the remaining size() -
89 * partialSize() bytes are filled with garbage).
91 static size_t partialSize(const type* p, size_t count) {
92 DCHECK_LE(count, kGroupSize);
93 size_t s = kHeaderSize + count;
94 for (; count; --count, ++p) {
101 * Return the number of values from *p that are valid from an encoded
102 * buffer of size bytes.
104 static size_t partialCount(const char* p, size_t size) {
106 size_t s = kHeaderSize;
108 if (s > size) return 0;
110 if (s > size) return 1;
112 if (s > size) return 2;
114 if (s > size) return 3;
119 * Given a pointer to the beginning of an GroupVarint32-encoded block,
120 * return the number of bytes used by the encoding.
122 static size_t encodedSize(const char* p) {
123 return (kHeaderSize + kGroupSize +
124 b0key(*p) + b1key(*p) + b2key(*p) + b3key(*p));
128 * Encode four uint32_t values into the buffer pointed-to by p, and return
129 * the next position in the buffer (that is, one character past the last
130 * encoded byte). p needs to have at least size()+4 bytes available.
132 static char* encode(char* p, uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
133 uint8_t b0key = key(a);
134 uint8_t b1key = key(b);
135 uint8_t b2key = key(c);
136 uint8_t b3key = key(d);
137 *p++ = (b3key << 6) | (b2key << 4) | (b1key << 2) | b0key;
138 storeUnaligned(p, a);
140 storeUnaligned(p, b);
142 storeUnaligned(p, c);
144 storeUnaligned(p, d);
150 * Encode four uint32_t values from the array pointed-to by src into the
151 * buffer pointed-to by p, similar to encode(p,a,b,c,d) above.
153 static char* encode(char* p, const uint32_t* src) {
154 return encode(p, src[0], src[1], src[2], src[3]);
158 * Decode four uint32_t values from a buffer, and return the next position
159 * in the buffer (that is, one character past the last encoded byte).
160 * The buffer needs to have at least 3 extra bytes available (they
161 * may be read but ignored).
163 static const char* decode_simple(const char* p, uint32_t* a, uint32_t* b,
164 uint32_t* c, uint32_t* d) {
165 size_t k = loadUnaligned<uint8_t>(p);
166 const char* end = p + detail::groupVarintLengths[k];
168 size_t k0 = b0key(k);
169 *a = loadUnaligned<uint32_t>(p) & kMask[k0];
171 size_t k1 = b1key(k);
172 *b = loadUnaligned<uint32_t>(p) & kMask[k1];
174 size_t k2 = b2key(k);
175 *c = loadUnaligned<uint32_t>(p) & kMask[k2];
177 size_t k3 = b3key(k);
178 *d = loadUnaligned<uint32_t>(p) & kMask[k3];
184 * Decode four uint32_t values from a buffer and store them in the array
185 * pointed-to by dest, similar to decode(p,a,b,c,d) above.
187 static const char* decode_simple(const char* p, uint32_t* dest) {
188 return decode_simple(p, dest, dest+1, dest+2, dest+3);
193 * Just like the non-SSSE3 decode below, but with the additional constraint
194 * that we must be able to read at least 17 bytes from the input pointer, p.
196 static const char* decode(const char* p, uint32_t* dest) {
198 __m128i val = _mm_loadu_si128((const __m128i*)(p+1));
199 __m128i mask = detail::groupVarintSSEMasks[key];
200 __m128i r = _mm_shuffle_epi8(val, mask);
201 _mm_storeu_si128((__m128i*)dest, r);
202 return p + detail::groupVarintLengths[key];
206 * Just like decode_simple, but with the additional constraint that
207 * we must be able to read at least 17 bytes from the input pointer, p.
209 static const char* decode(const char* p, uint32_t* a, uint32_t* b,
210 uint32_t* c, uint32_t* d) {
212 __m128i val = _mm_loadu_si128((const __m128i*)(p+1));
213 __m128i mask = detail::groupVarintSSEMasks[key];
214 __m128i r = _mm_shuffle_epi8(val, mask);
216 // Extracting 32 bits at a time out of an XMM register is a SSE4 feature
218 *a = _mm_extract_epi32(r, 0);
219 *b = _mm_extract_epi32(r, 1);
220 *c = _mm_extract_epi32(r, 2);
221 *d = _mm_extract_epi32(r, 3);
222 #else /* !__SSE4__ */
223 *a = _mm_extract_epi16(r, 0) + (_mm_extract_epi16(r, 1) << 16);
224 *b = _mm_extract_epi16(r, 2) + (_mm_extract_epi16(r, 3) << 16);
225 *c = _mm_extract_epi16(r, 4) + (_mm_extract_epi16(r, 5) << 16);
226 *d = _mm_extract_epi16(r, 6) + (_mm_extract_epi16(r, 7) << 16);
227 #endif /* __SSE4__ */
229 return p + detail::groupVarintLengths[key];
232 #else /* !__SSSE3__ */
233 static const char* decode(const char* p, uint32_t* a, uint32_t* b,
234 uint32_t* c, uint32_t* d) {
235 return decode_simple(p, a, b, c, d);
238 static const char* decode(const char* p, uint32_t* dest) {
239 return decode_simple(p, dest);
241 #endif /* __SSSE3__ */
244 static uint8_t key(uint32_t x) {
245 // __builtin_clz is undefined for the x==0 case
246 return 3 - (__builtin_clz(x|1) / 8);
248 static size_t b0key(size_t x) { return x & 3; }
249 static size_t b1key(size_t x) { return (x >> 2) & 3; }
250 static size_t b2key(size_t x) { return (x >> 4) & 3; }
251 static size_t b3key(size_t x) { return (x >> 6) & 3; }
253 static const uint32_t kMask[];
258 * GroupVarint encoding for 64-bit values.
260 * Encodes 5 64-bit integers at once, each using 1-8 bytes depending on size.
261 * There are two bytes of overhead. (The first two bytes contain the lengths
262 * of the five integers encoded as three bits each; 000=1 byte .. 111 = 8 bytes)
264 * This implementation assumes little-endian and does unaligned 64-bit
265 * accesses, so it's basically not portable outside of the x86[_64] world.
268 class GroupVarint<uint64_t> : public detail::GroupVarintBase<uint64_t> {
271 * Return the number of bytes used to encode these five values.
273 static size_t size(uint64_t a, uint64_t b, uint64_t c, uint64_t d,
275 return (kHeaderSize + kGroupSize +
276 key(a) + key(b) + key(c) + key(d) + key(e));
280 * Return the number of bytes used to encode five uint64_t values stored
281 * at consecutive positions in an array.
283 static size_t size(const uint64_t* p) {
284 return size(p[0], p[1], p[2], p[3], p[4]);
288 * Return the number of bytes used to encode count (<= 4) values.
289 * If you clip a buffer after these many bytes, you can still decode
290 * the first "count" values correctly (if the remaining size() -
291 * partialSize() bytes are filled with garbage).
293 static size_t partialSize(const type* p, size_t count) {
294 DCHECK_LE(count, kGroupSize);
295 size_t s = kHeaderSize + count;
296 for (; count; --count, ++p) {
303 * Return the number of values from *p that are valid from an encoded
304 * buffer of size bytes.
306 static size_t partialCount(const char* p, size_t size) {
307 uint16_t v = loadUnaligned<uint16_t>(p);
308 size_t s = kHeaderSize;
310 if (s > size) return 0;
312 if (s > size) return 1;
314 if (s > size) return 2;
316 if (s > size) return 3;
318 if (s > size) return 4;
323 * Given a pointer to the beginning of an GroupVarint64-encoded block,
324 * return the number of bytes used by the encoding.
326 static size_t encodedSize(const char* p) {
327 uint16_t n = loadUnaligned<uint16_t>(p);
328 return (kHeaderSize + kGroupSize +
329 b0key(n) + b1key(n) + b2key(n) + b3key(n) + b4key(n));
333 * Encode five uint64_t values into the buffer pointed-to by p, and return
334 * the next position in the buffer (that is, one character past the last
335 * encoded byte). p needs to have at least size()+8 bytes available.
337 static char* encode(char* p, uint64_t a, uint64_t b, uint64_t c,
338 uint64_t d, uint64_t e) {
339 uint8_t b0key = key(a);
340 uint8_t b1key = key(b);
341 uint8_t b2key = key(c);
342 uint8_t b3key = key(d);
343 uint8_t b4key = key(e);
344 storeUnaligned<uint16_t>(
346 (b4key << 12) | (b3key << 9) | (b2key << 6) | (b1key << 3) | b0key);
348 storeUnaligned(p, a);
350 storeUnaligned(p, b);
352 storeUnaligned(p, c);
354 storeUnaligned(p, d);
356 storeUnaligned(p, e);
362 * Encode five uint64_t values from the array pointed-to by src into the
363 * buffer pointed-to by p, similar to encode(p,a,b,c,d,e) above.
365 static char* encode(char* p, const uint64_t* src) {
366 return encode(p, src[0], src[1], src[2], src[3], src[4]);
370 * Decode five uint64_t values from a buffer, and return the next position
371 * in the buffer (that is, one character past the last encoded byte).
372 * The buffer needs to have at least 7 bytes available (they may be read
375 static const char* decode(const char* p, uint64_t* a, uint64_t* b,
376 uint64_t* c, uint64_t* d, uint64_t* e) {
377 uint16_t k = loadUnaligned<uint16_t>(p);
379 uint8_t k0 = b0key(k);
380 *a = loadUnaligned<uint64_t>(p) & kMask[k0];
382 uint8_t k1 = b1key(k);
383 *b = loadUnaligned<uint64_t>(p) & kMask[k1];
385 uint8_t k2 = b2key(k);
386 *c = loadUnaligned<uint64_t>(p) & kMask[k2];
388 uint8_t k3 = b3key(k);
389 *d = loadUnaligned<uint64_t>(p) & kMask[k3];
391 uint8_t k4 = b4key(k);
392 *e = loadUnaligned<uint64_t>(p) & kMask[k4];
398 * Decode five uint64_t values from a buffer and store them in the array
399 * pointed-to by dest, similar to decode(p,a,b,c,d,e) above.
401 static const char* decode(const char* p, uint64_t* dest) {
402 return decode(p, dest, dest+1, dest+2, dest+3, dest+4);
406 enum { kHeaderBytes = 2 };
408 static uint8_t key(uint64_t x) {
409 // __builtin_clzll is undefined for the x==0 case
410 return 7 - (__builtin_clzll(x|1) / 8);
413 static uint8_t b0key(uint16_t x) { return x & 7; }
414 static uint8_t b1key(uint16_t x) { return (x >> 3) & 7; }
415 static uint8_t b2key(uint16_t x) { return (x >> 6) & 7; }
416 static uint8_t b3key(uint16_t x) { return (x >> 9) & 7; }
417 static uint8_t b4key(uint16_t x) { return (x >> 12) & 7; }
419 static const uint64_t kMask[];
422 typedef GroupVarint<uint32_t> GroupVarint32;
423 typedef GroupVarint<uint64_t> GroupVarint64;
426 * Simplify use of GroupVarint* for the case where data is available one
427 * entry at a time (instead of one group at a time). Handles buffering
428 * and an incomplete last chunk.
430 * Output is a function object that accepts character ranges:
431 * out(StringPiece) appends the given character range to the output.
433 template <class T, class Output>
434 class GroupVarintEncoder {
436 typedef GroupVarint<T> Base;
439 explicit GroupVarintEncoder(Output out)
444 ~GroupVarintEncoder() {
449 * Add a value to the encoder.
452 buf_[count_++] = val;
453 if (count_ == Base::kGroupSize) {
454 char* p = Base::encode(tmp_, buf_);
455 out_(StringPiece(tmp_, p));
461 * Finish encoding, flushing any buffered values if necessary.
462 * After finish(), the encoder is immediately ready to encode more data
463 * to the same output.
467 // This is not strictly necessary, but it makes testing easy;
468 // uninitialized bytes are guaranteed to be recorded as taking one byte
470 for (size_t i = count_; i < Base::kGroupSize; i++) {
473 Base::encode(tmp_, buf_);
474 out_(StringPiece(tmp_, Base::partialSize(buf_, count_)));
480 * Return the appender that was used.
485 const Output& output() const {
490 * Reset the encoder, disregarding any state (except what was already
491 * flushed to the output, of course).
499 char tmp_[Base::kMaxSize];
500 type buf_[Base::kGroupSize];
505 * Simplify use of GroupVarint* for the case where the last group in the
506 * input may be incomplete (but the exact size of the input is known).
507 * Allows for extracting values one at a time.
509 template <typename T>
510 class GroupVarintDecoder {
512 typedef GroupVarint<T> Base;
515 GroupVarintDecoder() { }
517 explicit GroupVarintDecoder(StringPiece data,
518 size_t maxCount = (size_t)-1)
519 : rrest_(data.end()),
525 remaining_(maxCount) {
528 void reset(StringPiece data, size_t maxCount = (size_t)-1) {
535 remaining_ = maxCount;
539 * Read and return the next value.
541 bool next(type* val) {
542 if (pos_ == count_) {
544 size_t rem = end_ - p_;
545 if (rem == 0 || remaining_ == 0) {
548 // next() attempts to read one full group at a time, and so we must have
549 // at least enough bytes readable after its end to handle the case if the
550 // last group is full.
552 // The best way to ensure this is to ensure that data has at least
553 // Base::kMaxSize - 1 bytes readable *after* the end, otherwise we'll copy
554 // into a temporary buffer.
555 if (limit_ - p_ < Base::kMaxSize) {
556 memcpy(tmp_, p_, rem);
559 limit_ = tmp_ + sizeof(tmp_);
562 const char* n = Base::decode(p_, buf_);
564 // Full group could be decoded
565 if (remaining_ >= Base::kGroupSize) {
566 remaining_ -= Base::kGroupSize;
567 count_ = Base::kGroupSize;
572 p_ += Base::partialSize(buf_, count_);
575 // Can't decode a full group
576 count_ = Base::partialCount(p_, end_ - p_);
577 if (remaining_ >= count_) {
578 remaining_ -= count_;
583 p_ += Base::partialSize(buf_, count_);
594 StringPiece rest() const {
595 // This is only valid after next() returned false
596 CHECK(pos_ == count_ && (p_ == end_ || remaining_ == 0));
597 // p_ may point to the internal buffer (tmp_), but we want
598 // to return subpiece of the original data
599 size_t size = end_ - p_;
600 return StringPiece(rrest_ - size, rrest_);
608 char tmp_[2 * Base::kMaxSize];
609 type buf_[Base::kGroupSize];
615 typedef GroupVarintDecoder<uint32_t> GroupVarint32Decoder;
616 typedef GroupVarintDecoder<uint64_t> GroupVarint64Decoder;
620 #endif /* FOLLY_X64 || defined(__i386__) */
621 #endif /* FOLLY_GROUPVARINT_H_ */