From 0558d398803721bc1ab0a6418abc32230cbcdb03 Mon Sep 17 00:00:00 2001 From: Tudor Bosman Date: Wed, 14 Aug 2013 19:17:09 -0700 Subject: [PATCH] Varint in folly Test Plan: test added Reviewed By: alandau@fb.com FB internal diff: D928835 --- folly/Varint.h | 129 ++++++++++++++++++++++++++ folly/test/VarintTest.cpp | 184 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 313 insertions(+) create mode 100644 folly/Varint.h create mode 100644 folly/test/VarintTest.cpp diff --git a/folly/Varint.h b/folly/Varint.h new file mode 100644 index 00000000..3349cb15 --- /dev/null +++ b/folly/Varint.h @@ -0,0 +1,129 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FOLLY_VARINT_H_ +#define FOLLY_VARINT_H_ + +#include "folly/Range.h" + +namespace folly { + +/** + * Variable-length integer encoding, using a little-endian, base-128 + * representation. + * + * The MSb is set on all bytes except the last. + * + * Details: + * https://developers.google.com/protocol-buffers/docs/encoding#varints + * + * If you want to encode multiple values, GroupVarint (in GroupVarint.h) + * is faster and likely smaller. + */ + +/** + * Maximum length (in bytes) of the varint encoding of a 32-bit value. + */ +constexpr size_t kMaxVarintLength32 = 5; + +/** + * Maximum length (in bytes) of the varint encoding of a 64-bit value. + */ +constexpr size_t kMaxVarintLength64 = 10; + +/** + * Encode a value in the given buffer, returning the number of bytes used + * for encoding. + * buf must have enough space to represent the value (at least + * kMaxVarintLength64 bytes to encode arbitrary 64-bit values) + */ +size_t encodeVarint(uint64_t val, uint8_t* buf); + +/** + * Decode a value from a given buffer, advances data past the returned value. + */ +uint64_t decodeVarint(ByteRange& data); + +/** + * ZigZag encoding that maps signed integers with a small absolute value + * to unsigned integers with a small (positive) values. Without this, + * encoding negative values using Varint would use up 9 or 10 bytes. + * + * if x >= 0, encodeZigZag(x) == 2*x + * if x < 0, encodeZigZag(x) == -2*x + 1 + */ + +inline uint64_t encodeZigZag(int64_t val) { + // Bit-twiddling magic stolen from the Google protocol buffer document; + // val >> 63 is an arithmetic shift because val is signed + return static_cast((val << 1) ^ (val >> 63)); +} + +inline int64_t decodeZigZag(uint64_t val) { + return static_cast((val >> 1) ^ -(val & 1)); +} + +// Implementation below + +inline size_t encodeVarint(uint64_t val, uint8_t* buf) { + uint8_t* p = buf; + while (val >= 128) { + *p++ = 0x80 | (val & 0x7f); + val >>= 7; + } + *p++ = val; + return p - buf; +} + +inline uint64_t decodeVarint(ByteRange& data) { + const int8_t* begin = reinterpret_cast(data.begin()); + const int8_t* end = reinterpret_cast(data.end()); + const int8_t* p = begin; + uint64_t val = 0; + + if (LIKELY(end - begin >= kMaxVarintLength64)) { // fast path + int64_t b; + do { + b = *p++; val = (b & 0x7f) ; if (b >= 0) break; + b = *p++; val |= (b & 0x7f) << 7; if (b >= 0) break; + b = *p++; val |= (b & 0x7f) << 14; if (b >= 0) break; + b = *p++; val |= (b & 0x7f) << 21; if (b >= 0) break; + b = *p++; val |= (b & 0x7f) << 28; if (b >= 0) break; + b = *p++; val |= (b & 0x7f) << 35; if (b >= 0) break; + b = *p++; val |= (b & 0x7f) << 42; if (b >= 0) break; + b = *p++; val |= (b & 0x7f) << 49; if (b >= 0) break; + b = *p++; val |= (b & 0x7f) << 56; if (b >= 0) break; + b = *p++; val |= (b & 0x7f) << 63; if (b >= 0) break; + throw std::invalid_argument("Invalid varint value"); // too big + } while (false); + } else { + int shift = 0; + while (p != end && *p < 0) { + val |= static_cast(*p++ & 0x7f) << shift; + shift += 7; + } + if (p == end) throw std::invalid_argument("Invalid varint value"); + val |= static_cast(*p++) << shift; + } + + data.advance(p - begin); + return val; +} + +} // namespaces + +#endif /* FOLLY_VARINT_H_ */ + diff --git a/folly/test/VarintTest.cpp b/folly/test/VarintTest.cpp new file mode 100644 index 00000000..39146848 --- /dev/null +++ b/folly/test/VarintTest.cpp @@ -0,0 +1,184 @@ +/* + * Copyright 2013 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "folly/Varint.h" + +#include +#include +#include +#include + +#include +#include + +#include "folly/Benchmark.h" +#include "folly/Random.h" + +DEFINE_int32(random_seed, folly::randomNumberSeed(), "random seed"); + +namespace folly { namespace test { + +void testVarint(uint64_t val, std::initializer_list bytes) { + size_t n = bytes.size(); + ByteRange expected(&*bytes.begin(), n); + + { + uint8_t buf[kMaxVarintLength64]; + EXPECT_EQ(expected.size(), encodeVarint(val, buf)); + EXPECT_TRUE(ByteRange(buf, expected.size()) == expected); + } + + { + ByteRange r = expected; + uint64_t decoded = decodeVarint(r); + EXPECT_TRUE(r.empty()); + EXPECT_EQ(val, decoded); + } + + if (n < kMaxVarintLength64) { + // Try from a full buffer too, different code path + uint8_t buf[kMaxVarintLength64]; + memcpy(buf, &*bytes.begin(), n); + + uint8_t fills[] = {0, 0x7f, 0x80, 0xff}; + + for (uint8_t fill : fills) { + memset(buf + n, fill, kMaxVarintLength64 - n); + ByteRange r(buf, kMaxVarintLength64); + uint64_t decoded = decodeVarint(r); + EXPECT_EQ(val, decoded); + EXPECT_EQ(kMaxVarintLength64 - n, r.size()); + } + } +} + +TEST(Varint, Simple) { + testVarint(0, {0}); + testVarint(1, {1}); + testVarint(127, {127}); + testVarint(128, {0x80, 0x01}); + testVarint(300, {0xac, 0x02}); + testVarint(16383, {0xff, 0x7f}); + testVarint(16384, {0x80, 0x80, 0x01}); + + testVarint(static_cast(-1), + {0xff, 0xff, 0xff, 0xff, 0x0f}); + testVarint(static_cast(-1), + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01}); +} + +TEST(ZigZag, Simple) { + EXPECT_EQ(0, encodeZigZag(0)); + EXPECT_EQ(1, encodeZigZag(-1)); + EXPECT_EQ(2, encodeZigZag(1)); + EXPECT_EQ(3, encodeZigZag(-2)); + EXPECT_EQ(4, encodeZigZag(2)); + + EXPECT_EQ(0, decodeZigZag(0)); + EXPECT_EQ(-1, decodeZigZag(1)); + EXPECT_EQ(1, decodeZigZag(2)); + EXPECT_EQ(-2, decodeZigZag(3)); + EXPECT_EQ(2, decodeZigZag(4)); +} + +namespace { + +constexpr size_t kNumValues = 1000; +std::vector gValues; +std::vector gDecodedValues; +std::vector gEncoded; + +void generateRandomValues() { + LOG(INFO) << "Random seed is " << FLAGS_random_seed; + std::mt19937 rng(FLAGS_random_seed); + + // Approximation of power law + std::uniform_int_distribution numBytes(1, 8); + std::uniform_int_distribution byte(0, 255); + + gValues.resize(kNumValues); + gDecodedValues.resize(kNumValues); + gEncoded.resize(kNumValues * kMaxVarintLength64); + for (size_t i = 0; i < kNumValues; ++i) { + int n = numBytes(rng); + uint64_t val = 0; + for (size_t j = 0; j < n; ++j) { + val = (val << 8) + byte(rng); + } + gValues[i] = val; + } +} + +// Benchmark results (Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz, Linux x86_64) +// +// I0814 19:13:14.466256 7504 VarintTest.cpp:146] Random seed is -1216518886 +// ============================================================================ +// folly/test/VarintTest.cpp relative time/iter iters/s +// ============================================================================ +// VarintEncoding 6.69us 149.37K +// VarintDecoding 6.85us 145.90K +// ============================================================================ +// +// Disabling the "fast path" code in decodeVarint hurts performance: +// +// I0814 19:15:13.871467 9550 VarintTest.cpp:156] Random seed is -1216518886 +// ============================================================================ +// folly/test/VarintTest.cpp relative time/iter iters/s +// ============================================================================ +// VarintEncoding 6.75us 148.26K +// VarintDecoding 12.60us 79.37K +// ============================================================================ + +BENCHMARK(VarintEncoding, iters) { + uint8_t* start = &(*gEncoded.begin()); + uint8_t* p = start; + bool empty = (iters == 0); + while (iters--) { + p = start; + for (auto& v : gValues) { + p += encodeVarint(v, p); + } + } + + gEncoded.erase(gEncoded.begin() + (p - start), gEncoded.end()); +} + +BENCHMARK(VarintDecoding, iters) { + while (iters--) { + size_t i = 0; + ByteRange range(&(*gEncoded.begin()), &(*gEncoded.end())); + while (!range.empty()) { + gDecodedValues[i++] = decodeVarint(range); + } + } +} + +} // namespace + +}} // namespaces + +int main(int argc, char *argv[]) { + testing::InitGoogleTest(&argc, argv); + google::ParseCommandLineFlags(&argc, &argv, true); + google::InitGoogleLogging(argv[0]); + int ret = RUN_ALL_TESTS(); + if (ret == 0) { + folly::test::generateRandomValues(); + folly::runBenchmarksOnFlag(); + } + return ret; +} + -- 2.34.1