From 173356a35a2894817b7b45134eb062211b54dfc7 Mon Sep 17 00:00:00 2001 From: Philip Pronin Date: Sat, 6 Dec 2014 16:49:11 -0800 Subject: [PATCH] improve io::Cursor read() performance for small sizeof(T) Summary: I just found that gcc (4.8.2) failed to unroll the loop in `pullAtMost()`, so it didn't replace `memcpy` with a simple load for small `len`. Test Plan: fbconfig -r folly/io/test thrift/lib/cpp2/test && fbmake runtests_opt -j32 Ran unicorn-specific thrift deserialization benchmark from D1724070, verified 50% improvement in `SearchRequest` deserialization performance. `thrift/lib/cpp2/test/ProtocolBench` results: ``` |---- before -----| |---- after -----| ================================================================================================ thrift/lib/cpp2/test/ProtocolBench.cpp relative time/iter iters/s time/iter iters/s ================================================================================================ BinaryProtocol_read_Empty 21.72ns 46.04M 17.58ns 56.89M BinaryProtocol_read_SmallInt 43.03ns 23.24M 23.64ns 42.30M BinaryProtocol_read_BigInt 43.72ns 22.87M 22.03ns 45.38M BinaryProtocol_read_SmallString 88.57ns 11.29M 47.01ns 21.27M BinaryProtocol_read_BigString 365.76ns 2.73M 323.58ns 3.09M BinaryProtocol_read_BigBinary 207.78ns 4.81M 169.09ns 5.91M BinaryProtocol_read_LargeBinary 187.81ns 5.32M 172.09ns 5.81M BinaryProtocol_read_Mixed 161.18ns 6.20M 68.41ns 14.62M BinaryProtocol_read_SmallListInt 177.32ns 5.64M 96.91ns 10.32M BinaryProtocol_read_BigListInt 77.03us 12.98K 15.88us 62.97K BinaryProtocol_read_BigListMixed 1.79ms 557.79 923.99us 1.08K BinaryProtocol_read_LargeListMixed 195.01ms 5.13 103.78ms 9.64 ================================================================================================ ``` Reviewed By: soren@fb.com Subscribers: alandau, bmatheny, mshneer, trunkagent, njormrod, folly-diffs@ FB internal diff: D1724111 Tasks: 5770136 Signature: t1:1724111:1417977810:b7d643d0c819a0bbac77fa0048206153929e50a8 --- folly/io/Compression.cpp | 2 +- folly/io/Cursor.h | 257 ++++++++++++++++++++------------------- 2 files changed, 130 insertions(+), 129 deletions(-) diff --git a/folly/io/Compression.cpp b/folly/io/Compression.cpp index 1a820d42..ef8ed9ef 100644 --- a/folly/io/Compression.cpp +++ b/folly/io/Compression.cpp @@ -159,7 +159,7 @@ inline uint64_t decodeVarintFromCursor(folly::io::Cursor& cursor) { uint64_t val = 0; int8_t b = 0; for (int shift = 0; shift <= 63; shift += 7) { - b = cursor.pullByte(); + b = cursor.read(); val |= static_cast(b & 0x7f) << shift; if (b >= 0) { break; diff --git a/folly/io/Cursor.h b/folly/io/Cursor.h index 803efc66..064aa34d 100644 --- a/folly/io/Cursor.h +++ b/folly/io/Cursor.h @@ -49,16 +49,42 @@ * access to the buffer (you need to call unshare() yourself if necessary). **/ namespace folly { namespace io { + namespace detail { -template +template class CursorBase { + // Make all the templated classes friends for copy constructor. + template friend class CursorBase; public: + explicit CursorBase(BufType* buf) : crtBuf_(buf), buffer_(buf) { } + + /** + * Copy constructor. + * + * This also allows constructing a CursorBase from other derived types. + * For instance, this allows constructing a Cursor from an RWPrivateCursor. + */ + template + explicit CursorBase(const CursorBase& cursor) + : crtBuf_(cursor.crtBuf_), + offset_(cursor.offset_), + buffer_(cursor.buffer_) { } + + /** + * Reset cursor to point to a new buffer. + */ + void reset(BufType* buf) { + crtBuf_ = buf; + buffer_ = buf; + offset_ = 0; + } + const uint8_t* data() const { return crtBuf_->data() + offset_; } - /* + /** * Return the remaining space available in the current IOBuf. * * May return 0 if the cursor is at the end of an IOBuf. Use peek() instead @@ -70,7 +96,7 @@ class CursorBase { return crtBuf_->length() - offset_; } - /* + /** * Return the space available until the end of the entire IOBuf chain. */ size_t totalLength() const { @@ -107,10 +133,14 @@ class CursorBase { } template - typename std::enable_if::value, T>::type - read() { + typename std::enable_if::value, T>::type read() { T val; - pull(&val, sizeof(T)); + if (LIKELY(length() >= sizeof(T))) { + val = loadUnaligned(data()); + offset_ += sizeof(T); + } else { + pullSlow(&val, sizeof(T)); + } return val; } @@ -133,23 +163,14 @@ class CursorBase { */ std::string readFixedString(size_t len) { std::string str; - str.reserve(len); - for (;;) { - // Fast path: it all fits in one buffer. - size_t available = length(); - if (LIKELY(available >= len)) { - str.append(reinterpret_cast(data()), len); - offset_ += len; - return str; - } - - str.append(reinterpret_cast(data()), available); - if (UNLIKELY(!tryAdvanceBuffer())) { - throw std::out_of_range("string underflow"); - } - len -= available; + if (LIKELY(length() >= len)) { + str.append(reinterpret_cast(data()), len); + offset_ += len; + } else { + readFixedStringSlow(&str, len); } + return str; } /** @@ -161,8 +182,8 @@ class CursorBase { * vs. using pull(). */ std::string readTerminatedString( - char termChar = '\0', - size_t maxLength = std::numeric_limits::max()) { + char termChar = '\0', + size_t maxLength = std::numeric_limits::max()) { std::string str; for (;;) { @@ -194,31 +215,39 @@ class CursorBase { } } - explicit CursorBase(BufType* buf) - : crtBuf_(buf) - , offset_(0) - , buffer_(buf) {} + size_t skipAtMost(size_t len) { + if (LIKELY(length() >= len)) { + offset_ += len; + return len; + } + return skipAtMostSlow(len); + } - // Make all the templated classes friends for copy constructor. - template friend class CursorBase; + void skip(size_t len) { + if (LIKELY(length() >= len)) { + offset_ += len; + } else { + skipSlow(len); + } + } - /* - * Copy constructor. - * - * This also allows constructing a CursorBase from other derived types. - * For instance, this allows constructing a Cursor from an RWPrivateCursor. - */ - template - explicit CursorBase(const CursorBase& cursor) - : crtBuf_(cursor.crtBuf_), - offset_(cursor.offset_), - buffer_(cursor.buffer_) {} + size_t pullAtMost(void* buf, size_t len) { + // Fast path: it all fits in one buffer. + if (LIKELY(length() >= len)) { + memcpy(buf, data(), len); + offset_ += len; + return len; + } + return pullAtMostSlow(buf, len); + } - // reset cursor to point to a new buffer. - void reset(BufType* buf) { - crtBuf_ = buf; - buffer_ = buf; - offset_ = 0; + void pull(void* buf, size_t len) { + if (LIKELY(length() >= len)) { + memcpy(buf, data(), len); + offset_ += len; + } else { + pullSlow(buf, len); + } } /** @@ -232,35 +261,9 @@ class CursorBase { while (UNLIKELY(available == 0 && tryAdvanceBuffer())) { available = length(); } - return std::make_pair(data(), available); } - /** - * Read one byte. Equivalent to pull(&byte, 1) but faster. - */ - uint8_t pullByte() { - // fast path - if (LIKELY(length() >= 1)) { - uint8_t byte = *data(); - offset_++; - return byte; - } - - // slow path - uint8_t byte; - if (UNLIKELY(pullAtMost(&byte, 1) != 1)) { - throw std::out_of_range("underflow"); - } - return byte; - } - - void pull(void* buf, size_t len) { - if (UNLIKELY(pullAtMost(buf, len) != len)) { - throw std::out_of_range("underflow"); - } - } - void clone(std::unique_ptr& buf, size_t len) { if (UNLIKELY(cloneAtMost(buf, len) != len)) { throw std::out_of_range("underflow"); @@ -273,36 +276,6 @@ class CursorBase { } } - void skip(size_t len) { - if (LIKELY(length() >= len)) { - offset_ += len; - return; - } - skipSlow(len); - } - - size_t pullAtMost(void* buf, size_t len) { - uint8_t* p = reinterpret_cast(buf); - size_t copied = 0; - for (;;) { - // Fast path: it all fits in one buffer. - size_t available = length(); - if (LIKELY(available >= len)) { - memcpy(p, data(), len); - offset_ += len; - return copied + len; - } - - memcpy(p, data(), available); - copied += available; - if (UNLIKELY(!tryAdvanceBuffer())) { - return copied; - } - p += available; - len -= available; - } - } - size_t cloneAtMost(folly::IOBuf& buf, size_t len) { buf = folly::IOBuf(); @@ -327,7 +300,6 @@ class CursorBase { return copied + len; } - if (loopCount == 0) { crtBuf_->cloneOneInto(buf); buf.trimStart(offset_); @@ -349,28 +321,9 @@ class CursorBase { if (!buf) { buf = make_unique(); } - return cloneAtMost(*buf, len); } - size_t skipAtMost(size_t len) { - size_t skipped = 0; - for (;;) { - // Fast path: it all fits in one buffer. - size_t available = length(); - if (LIKELY(available >= len)) { - offset_ += len; - return skipped + len; - } - - skipped += available; - if (UNLIKELY(!tryAdvanceBuffer())) { - return skipped; - } - len -= available; - } - } - /** * Return the distance between two cursors. */ @@ -423,10 +376,7 @@ class CursorBase { } protected: - BufType* crtBuf_; - size_t offset_; - - ~CursorBase(){} + ~CursorBase() { } BufType* head() { return buffer_; @@ -445,20 +395,71 @@ class CursorBase { return true; } + BufType* crtBuf_; + size_t offset_ = 0; + private: - void advanceDone() { + void readFixedStringSlow(std::string* str, size_t len) { + for (size_t available; (available = length()) < len; ) { + str->append(reinterpret_cast(data()), available); + if (UNLIKELY(!tryAdvanceBuffer())) { + throw std::out_of_range("string underflow"); + } + len -= available; + } + str->append(reinterpret_cast(data()), len); + offset_ += len; + } + + size_t pullAtMostSlow(void* buf, size_t len) { + uint8_t* p = reinterpret_cast(buf); + size_t copied = 0; + for (size_t available; (available = length()) < len; ) { + memcpy(p, data(), available); + copied += available; + if (UNLIKELY(!tryAdvanceBuffer())) { + return copied; + } + p += available; + len -= available; + } + memcpy(p, data(), len); + offset_ += len; + return copied + len; + } + + void pullSlow(void* buf, size_t len) { + if (UNLIKELY(pullAtMostSlow(buf, len) != len)) { + throw std::out_of_range("underflow"); + } + } + + size_t skipAtMostSlow(size_t len) { + size_t skipped = 0; + for (size_t available; (available = length()) < len; ) { + skipped += available; + if (UNLIKELY(!tryAdvanceBuffer())) { + return skipped; + } + len -= skipped; + } + offset_ += len; + return skipped + len; } void skipSlow(size_t len) { - if (UNLIKELY(skipAtMost(len) != len)) { + if (UNLIKELY(skipAtMostSlow(len) != len)) { throw std::out_of_range("underflow"); } } + void advanceDone() { + } + BufType* buffer_; }; -} //namespace detail +} // namespace detail class Cursor : public detail::CursorBase { public: -- 2.34.1