From: Stepan Palamarchuk Date: Tue, 16 Jan 2018 00:36:41 +0000 (-0800) Subject: Improve fast path of Cursor X-Git-Tag: v2018.01.22.00~33 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=2c730d6fe79b5642133c55545c4ed7570ce2abb9;p=folly.git Improve fast path of Cursor Summary: This change simplifies the fastpath by reducing it to bare minimum (i.e. check length, load data) and removes indirection to IOBuf. Additionally it adds `skipNoAdvance` method to have 1-instruction skip. Disassembly of `read` is over 35 instructions (just hot path). With this change it's doesn to 8. Disassembly after: Dump of assembler code for function folly::io::detail::CursorBase::read(): 0x000000000041f0f0 <+0>: mov 0x18(%rdi),%rax 0x000000000041f0f4 <+4>: lea 0x1(%rax),%rcx 0x000000000041f0f8 <+8>: cmp 0x10(%rdi),%rcx 0x000000000041f0fc <+12>: ja 0x41f105 ::read()+21> 0x000000000041f0fe <+14>: mov (%rax),%al 0x000000000041f100 <+16>: mov %rcx,0x18(%rdi) 0x000000000041f104 <+20>: retq 0x000000000041f105 <+21>: jmpq 0x41f110 ::readSlow()> With this diff Thrift deserialization becomes ~20% faster (with prod workloads). Thrift benchmark: Before: ============================================================================ thrift/lib/cpp2/test/ProtocolBench.cpp relative time/iter iters/s ============================================================================ BinaryProtocol_read_Empty 12.98ns 77.03M BinaryProtocol_read_SmallInt 20.94ns 47.76M BinaryProtocol_read_BigInt 20.86ns 47.93M BinaryProtocol_read_SmallString 34.64ns 28.86M BinaryProtocol_read_BigString 185.53ns 5.39M BinaryProtocol_read_BigBinary 67.34ns 14.85M BinaryProtocol_read_LargeBinary 62.23ns 16.07M BinaryProtocol_read_Mixed 58.74ns 17.03M BinaryProtocol_read_SmallListInt 89.99ns 11.11M BinaryProtocol_read_BigListInt 39.92us 25.05K BinaryProtocol_read_BigListMixed 616.20us 1.62K BinaryProtocol_read_LargeListMixed 83.49ms 11.98 CompactProtocol_read_Empty 11.28ns 88.67M CompactProtocol_read_SmallInt 19.15ns 52.22M CompactProtocol_read_BigInt 26.14ns 38.25M CompactProtocol_read_SmallString 31.04ns 32.22M CompactProtocol_read_BigString 184.55ns 5.42M CompactProtocol_read_BigBinary 69.73ns 14.34M CompactProtocol_read_LargeBinary 64.39ns 15.53M CompactProtocol_read_Mixed 58.73ns 17.03M CompactProtocol_read_SmallListInt 76.50ns 13.07M CompactProtocol_read_BigListInt 25.93us 38.56K CompactProtocol_read_BigListMixed 623.15us 1.60K CompactProtocol_read_LargeListMixed 80.57ms 12.41 ============================================================================ After: ============================================================================ thrift/lib/cpp2/test/ProtocolBench.cpp relative time/iter iters/s ============================================================================ BinaryProtocol_read_Empty 10.40ns 96.17M BinaryProtocol_read_SmallInt 15.14ns 66.03M BinaryProtocol_read_BigInt 15.19ns 65.84M BinaryProtocol_read_SmallString 25.19ns 39.70M BinaryProtocol_read_BigString 172.85ns 5.79M BinaryProtocol_read_BigBinary 56.88ns 17.58M BinaryProtocol_read_LargeBinary 56.77ns 17.61M BinaryProtocol_read_Mixed 43.98ns 22.74M BinaryProtocol_read_SmallListInt 58.19ns 17.19M BinaryProtocol_read_BigListInt 19.75us 50.63K BinaryProtocol_read_BigListMixed 440.20us 2.27K BinaryProtocol_read_LargeListMixed 56.94ms 17.56 CompactProtocol_read_Empty 9.35ns 106.93M CompactProtocol_read_SmallInt 13.07ns 76.49M CompactProtocol_read_BigInt 18.23ns 54.87M CompactProtocol_read_SmallString 25.61ns 39.05M CompactProtocol_read_BigString 174.46ns 5.73M CompactProtocol_read_BigBinary 59.77ns 16.73M CompactProtocol_read_LargeBinary 60.81ns 16.44M CompactProtocol_read_Mixed 42.70ns 23.42M CompactProtocol_read_SmallListInt 66.89ns 14.95M CompactProtocol_read_BigListInt 25.08us 39.87K CompactProtocol_read_BigListMixed 427.93us 2.34K CompactProtocol_read_LargeListMixed 56.11ms 17.82 ============================================================================ Reviewed By: yfeldblum Differential Revision: D6635325 fbshipit-source-id: 393fc1005689042977c03f37f5a898ebe7814d44 --- diff --git a/folly/io/Cursor.h b/folly/io/Cursor.h index fcb97c92..bdaeabe3 100644 --- a/folly/io/Cursor.h +++ b/folly/io/Cursor.h @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include @@ -58,7 +57,12 @@ class CursorBase { // Make all the templated classes friends for copy constructor. template friend class CursorBase; public: - explicit CursorBase(BufType* buf) : crtBuf_(buf), buffer_(buf) { } + explicit CursorBase(BufType* buf) : crtBuf_(buf), buffer_(buf) { + if (crtBuf_) { + crtPos_ = crtBegin_ = crtBuf_->data(); + crtEnd_ = crtBuf_->tail(); + } + } /** * Copy constructor. @@ -68,9 +72,11 @@ class CursorBase { */ template explicit CursorBase(const CursorBase& cursor) - : crtBuf_(cursor.crtBuf_), - offset_(cursor.offset_), - buffer_(cursor.buffer_) { } + : crtBuf_(cursor.crtBuf_), + crtBegin_(cursor.crtBegin_), + crtEnd_(cursor.crtEnd_), + crtPos_(cursor.crtPos_), + buffer_(cursor.buffer_) {} /** * Reset cursor to point to a new buffer. @@ -78,11 +84,14 @@ class CursorBase { void reset(BufType* buf) { crtBuf_ = buf; buffer_ = buf; - offset_ = 0; + if (crtBuf_) { + crtPos_ = crtBegin_ = crtBuf_->data(); + crtEnd_ = crtBuf_->tail(); + } } const uint8_t* data() const { - return crtBuf_->data() + offset_; + return crtPos_; } /** @@ -94,7 +103,7 @@ class CursorBase { * pointing at the end of a buffer. */ size_t length() const { - return crtBuf_->length() - offset_; + return crtEnd_ - crtPos_; } /** @@ -102,10 +111,10 @@ class CursorBase { */ size_t totalLength() const { if (crtBuf_ == buffer_) { - return crtBuf_->computeChainDataLength() - offset_; + return crtBuf_->computeChainDataLength() - (crtPos_ - crtBegin_); } CursorBase end(buffer_->prev()); - end.offset_ = end.buffer_->length(); + end.crtPos_ = end.crtEnd_; return end - *this; } @@ -135,7 +144,7 @@ class CursorBase { */ bool isAtEnd() const { // Check for the simple cases first. - if (offset_ != crtBuf_->length()) { + if (crtPos_ != crtEnd_) { return false; } if (crtBuf_ == buffer_->prev()) { @@ -158,7 +167,8 @@ class CursorBase { * Advances the cursor to the end of the entire IOBuf chain. */ void advanceToEnd() { - offset_ = buffer_->prev()->length(); + crtBegin_ = buffer_->prev()->data(); + crtPos_ = crtEnd_ = buffer_->prev()->tail(); if (crtBuf_ != buffer_->prev()) { crtBuf_ = buffer_->prev(); static_cast(this)->advanceDone(); @@ -194,7 +204,23 @@ class CursorBase { * same IOBuf chain. */ bool operator==(const Derived& other) const { - return (offset_ == other.offset_) && (crtBuf_ == other.crtBuf_); + const IOBuf* crtBuf = crtBuf_; + auto crtPos = crtPos_; + // We can be pointing to the end of a buffer chunk, find first non-empty. + while (crtPos == crtBuf->tail() && crtBuf != buffer_->prev()) { + crtBuf = crtBuf->next(); + crtPos = crtBuf->data(); + } + + const IOBuf* crtBufOther = other.crtBuf_; + auto crtPosOther = other.crtPos_; + // We can be pointing to the end of a buffer chunk, find first non-empty. + while (crtPosOther == crtBufOther->tail() && + crtBufOther != other.buffer_->prev()) { + crtBufOther = crtBufOther->next(); + crtPosOther = crtBufOther->data(); + } + return (crtPos == crtPosOther) && (crtBuf == crtBufOther); } bool operator!=(const Derived& other) const { return !operator==(other); @@ -203,10 +229,9 @@ class CursorBase { template typename std::enable_if::value, bool>::type tryRead( T& val) { - if (LIKELY(length() >= sizeof(T))) { + if (LIKELY(crtPos_ + sizeof(T) <= crtEnd_)) { val = loadUnaligned(data()); - offset_ += sizeof(T); - advanceBufferIfEmpty(); + crtPos_ += sizeof(T); return true; } return pullAtMostSlow(&val, sizeof(T)) == sizeof(T); @@ -228,11 +253,13 @@ class CursorBase { template T read() { - T val{}; - if (!tryRead(val)) { - std::__throw_out_of_range("underflow"); + if (LIKELY(crtPos_ + sizeof(T) <= crtEnd_)) { + T val = loadUnaligned(data()); + crtPos_ += sizeof(T); + return val; + } else { + return readSlow(); } - return val; } template @@ -257,8 +284,7 @@ class CursorBase { str.reserve(len); if (LIKELY(length() >= len)) { str.append(reinterpret_cast(data()), len); - offset_ += len; - advanceBufferIfEmpty(); + crtPos_ += len; } else { readFixedStringSlow(&str, len); } @@ -307,34 +333,41 @@ class CursorBase { void skipWhile(const Predicate& predicate); size_t skipAtMost(size_t len) { - if (LIKELY(length() >= len)) { - offset_ += len; - advanceBufferIfEmpty(); + if (LIKELY(crtPos_ + len < crtEnd_)) { + crtPos_ += len; return len; } return skipAtMostSlow(len); } void skip(size_t len) { - if (LIKELY(length() >= len)) { - offset_ += len; - advanceBufferIfEmpty(); + if (LIKELY(crtPos_ + len < crtEnd_)) { + crtPos_ += len; } else { skipSlow(len); } } + /** + * Skip bytes in the current IOBuf without advancing to the next one. + * Precondition: length() >= len + */ + void skipNoAdvance(size_t len) { + DCHECK_LE(len, length()); + crtPos_ += len; + } + size_t retreatAtMost(size_t len) { - if (len <= offset_) { - offset_ -= len; + if (len <= static_cast(crtPos_ - crtBegin_)) { + crtPos_ -= len; return len; } return retreatAtMostSlow(len); } void retreat(size_t len) { - if (len <= offset_) { - offset_ -= len; + if (len <= static_cast(crtPos_ - crtBegin_)) { + crtPos_ -= len; } else { retreatSlow(len); } @@ -342,20 +375,18 @@ class CursorBase { size_t pullAtMost(void* buf, size_t len) { // Fast path: it all fits in one buffer. - if (LIKELY(length() >= len)) { + if (LIKELY(crtPos_ + len <= crtEnd_)) { memcpy(buf, data(), len); - offset_ += len; - advanceBufferIfEmpty(); + crtPos_ += len; return len; } return pullAtMostSlow(buf, len); } void pull(void* buf, size_t len) { - if (LIKELY(length() >= len)) { + if (LIKELY(crtPos_ + len <= crtEnd_)) { memcpy(buf, data(), len); - offset_ += len; - advanceBufferIfEmpty(); + crtPos_ += len; } else { pullSlow(buf, len); } @@ -399,6 +430,9 @@ class CursorBase { } size_t cloneAtMost(folly::IOBuf& buf, size_t len) { + // We might be at the end of buffer. + advanceBufferIfEmpty(); + std::unique_ptr tmp; size_t copied = 0; for (int loopCount = 0; true; ++loopCount) { @@ -407,26 +441,26 @@ class CursorBase { if (LIKELY(available >= len)) { if (loopCount == 0) { crtBuf_->cloneOneInto(buf); - buf.trimStart(offset_); + buf.trimStart(crtPos_ - crtBegin_); buf.trimEnd(buf.length() - len); } else { tmp = crtBuf_->cloneOne(); - tmp->trimStart(offset_); + tmp->trimStart(crtPos_ - crtBegin_); tmp->trimEnd(tmp->length() - len); buf.prependChain(std::move(tmp)); } - offset_ += len; + crtPos_ += len; advanceBufferIfEmpty(); return copied + len; } if (loopCount == 0) { crtBuf_->cloneOneInto(buf); - buf.trimStart(offset_); + buf.trimStart(crtPos_ - crtBegin_); } else { tmp = crtBuf_->cloneOne(); - tmp->trimStart(offset_); + tmp->trimStart(crtPos_ - crtBegin_); buf.prependChain(std::move(tmp)); } @@ -453,7 +487,7 @@ class CursorBase { size_t len = 0; if (otherBuf != crtBuf_) { - len += otherBuf->length() - other.offset_; + len += other.crtEnd_ - other.crtPos_; for (otherBuf = otherBuf->next(); otherBuf != crtBuf_ && otherBuf != other.buffer_; @@ -465,13 +499,13 @@ class CursorBase { std::__throw_out_of_range("wrap-around"); } - len += offset_; + len += crtPos_ - crtBegin_; } else { - if (offset_ < other.offset_) { + if (crtPos_ < other.crtPos_) { std::__throw_out_of_range("underflow"); } - len += offset_ - other.offset_; + len += crtPos_ - other.crtPos_; } return len; @@ -492,7 +526,7 @@ class CursorBase { } } - len += offset_; + len += crtPos_ - crtBegin_; return len; } @@ -506,37 +540,48 @@ class CursorBase { bool tryAdvanceBuffer() { BufType* nextBuf = crtBuf_->next(); if (UNLIKELY(nextBuf == buffer_)) { - offset_ = crtBuf_->length(); + crtPos_ = crtEnd_; return false; } - offset_ = 0; crtBuf_ = nextBuf; + crtPos_ = crtBegin_ = crtBuf_->data(); + crtEnd_ = crtBuf_->tail(); static_cast(this)->advanceDone(); return true; } bool tryRetreatBuffer() { if (UNLIKELY(crtBuf_ == buffer_)) { - offset_ = 0; + crtPos_ = crtBegin_; return false; } crtBuf_ = crtBuf_->prev(); - offset_ = crtBuf_->length(); + crtBegin_ = crtBuf_->data(); + crtPos_ = crtEnd_ = crtBuf_->tail(); static_cast(this)->advanceDone(); return true; } void advanceBufferIfEmpty() { - if (length() == 0) { + if (crtPos_ == crtEnd_) { tryAdvanceBuffer(); } } BufType* crtBuf_; - size_t offset_ = 0; + const uint8_t* crtBegin_{nullptr}; + const uint8_t* crtEnd_{nullptr}; + const uint8_t* crtPos_{nullptr}; private: + template + FOLLY_NOINLINE T readSlow() { + T val; + pullSlow(&val, sizeof(T)); + return val; + } + void readFixedStringSlow(std::string* str, size_t len) { for (size_t available; (available = length()) < len; ) { str->append(reinterpret_cast(data()), available); @@ -546,7 +591,7 @@ class CursorBase { len -= available; } str->append(reinterpret_cast(data()), len); - offset_ += len; + crtPos_ += len; advanceBufferIfEmpty(); } @@ -563,7 +608,7 @@ class CursorBase { len -= available; } memcpy(p, data(), len); - offset_ += len; + crtPos_ += len; advanceBufferIfEmpty(); return copied + len; } @@ -583,7 +628,7 @@ class CursorBase { } len -= available; } - offset_ += len; + crtPos_ += len; advanceBufferIfEmpty(); return skipped + len; } @@ -596,14 +641,14 @@ class CursorBase { size_t retreatAtMostSlow(size_t len) { size_t retreated = 0; - for (size_t available; (available = offset_) < len;) { + for (size_t available; (available = crtPos_ - crtBegin_) < len;) { retreated += available; if (UNLIKELY(!tryRetreatBuffer())) { return retreated; } len -= available; } - offset_ -= len; + crtPos_ -= len; return retreated + len; } @@ -748,11 +793,19 @@ class RWCursor if (this->crtBuf_ != this->head() && this->totalLength() < n) { throw std::overflow_error("cannot gather() past the end of the chain"); } - this->crtBuf_->gather(this->offset_ + n); + size_t offset = this->crtPos_ - this->crtBegin_; + this->crtBuf_->gather(offset + n); + this->crtBegin_ = this->crtBuf_->data(); + this->crtEnd_ = this->crtBuf_->tail(); + this->crtPos_ = this->crtBegin_ + offset; } void gatherAtMost(size_t n) { size_t size = std::min(n, this->totalLength()); - return this->crtBuf_->gather(this->offset_ + size); + size_t offset = this->crtPos_ - this->crtBegin_; + this->crtBuf_->gather(offset + size); + this->crtBegin_ = this->crtBuf_->data(); + this->crtEnd_ = this->crtBuf_->tail(); + this->crtPos_ = this->crtBegin_ + offset; } using detail::Writable>::pushAtMost; @@ -774,7 +827,7 @@ class RWCursor maybeUnshare(); } memcpy(writableData(), buf, len); - this->offset_ += len; + this->crtPos_ += len; return copied + len; } @@ -793,16 +846,16 @@ class RWCursor void insert(std::unique_ptr buf) { folly::IOBuf* nextBuf; - if (this->offset_ == 0) { + if (this->crtPos_ == this->crtBegin_) { // Can just prepend nextBuf = this->crtBuf_; this->crtBuf_->prependChain(std::move(buf)); } else { std::unique_ptr remaining; - if (this->crtBuf_->length() - this->offset_ > 0) { + if (this->crtPos_ != this->crtEnd_) { // Need to split current IOBuf in two. remaining = this->crtBuf_->cloneOne(); - remaining->trimStart(this->offset_); + remaining->trimStart(this->crtPos_ - this->crtBegin_); nextBuf = remaining.get(); buf->prependChain(std::move(remaining)); } else { @@ -811,20 +864,26 @@ class RWCursor } this->crtBuf_->trimEnd(this->length()); this->crtBuf_->appendChain(std::move(buf)); + + // Jump past the new links + this->crtBuf_ = nextBuf; + this->crtPos_ = this->crtBegin_ = this->crtBuf_->data(); + this->crtEnd_ = this->crtBuf_->tail(); } - // Jump past the new links - this->offset_ = 0; - this->crtBuf_ = nextBuf; } uint8_t* writableData() { - return this->crtBuf_->writableData() + this->offset_; + return this->crtBuf_->writableData() + (this->crtPos_ - this->crtBegin_); } private: void maybeUnshare() { if (UNLIKELY(maybeShared_)) { + size_t offset = this->crtPos_ - this->crtBegin_; this->crtBuf_->unshareOne(); + this->crtBegin_ = this->crtBuf_->data(); + this->crtEnd_ = this->crtBuf_->tail(); + this->crtPos_ = this->crtBegin_ + offset; maybeShared_ = false; } }