Improve fast path of Cursor
authorStepan Palamarchuk <stepan@fb.com>
Tue, 16 Jan 2018 00:36:41 +0000 (16:36 -0800)
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>
Tue, 16 Jan 2018 00:54:23 +0000 (16:54 -0800)
Summary:
This change simplifies the fastpath by reducing it to bare minimum (i.e. check length, load data) and removes indirection to IOBuf.
Additionally it adds `skipNoAdvance` method to have 1-instruction skip.

Disassembly of `read<signed char>` is over 35 instructions (just hot path). With this change it's doesn to 8.
Disassembly after:
  Dump of assembler code for function folly::io::detail::CursorBase<folly::io::Cursor, folly::IOBuf const>::read<unsigned char>():
     0x000000000041f0f0 <+0>:     mov    0x18(%rdi),%rax
     0x000000000041f0f4 <+4>:     lea    0x1(%rax),%rcx
     0x000000000041f0f8 <+8>:     cmp    0x10(%rdi),%rcx
     0x000000000041f0fc <+12>:    ja     0x41f105 <folly::io::detail::CursorBase<folly::io::Cursor, folly::IOBuf const>::read<unsigned char>()+21>
     0x000000000041f0fe <+14>:    mov    (%rax),%al
     0x000000000041f100 <+16>:    mov    %rcx,0x18(%rdi)
     0x000000000041f104 <+20>:    retq
     0x000000000041f105 <+21>:    jmpq   0x41f110 <folly::io::detail::CursorBase<folly::io::Cursor, folly::IOBuf const>::readSlow<unsigned char>()>

With this diff Thrift deserialization becomes ~20% faster (with prod workloads).

Thrift benchmark:
Before:
  ============================================================================
  thrift/lib/cpp2/test/ProtocolBench.cpp          relative  time/iter  iters/s
  ============================================================================
  BinaryProtocol_read_Empty                                   12.98ns   77.03M
  BinaryProtocol_read_SmallInt                                20.94ns   47.76M
  BinaryProtocol_read_BigInt                                  20.86ns   47.93M
  BinaryProtocol_read_SmallString                             34.64ns   28.86M
  BinaryProtocol_read_BigString                              185.53ns    5.39M
  BinaryProtocol_read_BigBinary                               67.34ns   14.85M
  BinaryProtocol_read_LargeBinary                             62.23ns   16.07M
  BinaryProtocol_read_Mixed                                   58.74ns   17.03M
  BinaryProtocol_read_SmallListInt                            89.99ns   11.11M
  BinaryProtocol_read_BigListInt                              39.92us   25.05K
  BinaryProtocol_read_BigListMixed                           616.20us    1.62K
  BinaryProtocol_read_LargeListMixed                          83.49ms    11.98
  CompactProtocol_read_Empty                                  11.28ns   88.67M
  CompactProtocol_read_SmallInt                               19.15ns   52.22M
  CompactProtocol_read_BigInt                                 26.14ns   38.25M
  CompactProtocol_read_SmallString                            31.04ns   32.22M
  CompactProtocol_read_BigString                             184.55ns    5.42M
  CompactProtocol_read_BigBinary                              69.73ns   14.34M
  CompactProtocol_read_LargeBinary                            64.39ns   15.53M
  CompactProtocol_read_Mixed                                  58.73ns   17.03M
  CompactProtocol_read_SmallListInt                           76.50ns   13.07M
  CompactProtocol_read_BigListInt                             25.93us   38.56K
  CompactProtocol_read_BigListMixed                          623.15us    1.60K
  CompactProtocol_read_LargeListMixed                         80.57ms    12.41
  ============================================================================

After:
  ============================================================================
  thrift/lib/cpp2/test/ProtocolBench.cpp          relative  time/iter  iters/s
  ============================================================================
  BinaryProtocol_read_Empty                                   10.40ns   96.17M
  BinaryProtocol_read_SmallInt                                15.14ns   66.03M
  BinaryProtocol_read_BigInt                                  15.19ns   65.84M
  BinaryProtocol_read_SmallString                             25.19ns   39.70M
  BinaryProtocol_read_BigString                              172.85ns    5.79M
  BinaryProtocol_read_BigBinary                               56.88ns   17.58M
  BinaryProtocol_read_LargeBinary                             56.77ns   17.61M
  BinaryProtocol_read_Mixed                                   43.98ns   22.74M
  BinaryProtocol_read_SmallListInt                            58.19ns   17.19M
  BinaryProtocol_read_BigListInt                              19.75us   50.63K
  BinaryProtocol_read_BigListMixed                           440.20us    2.27K
  BinaryProtocol_read_LargeListMixed                          56.94ms    17.56
  CompactProtocol_read_Empty                                   9.35ns  106.93M
  CompactProtocol_read_SmallInt                               13.07ns   76.49M
  CompactProtocol_read_BigInt                                 18.23ns   54.87M
  CompactProtocol_read_SmallString                            25.61ns   39.05M
  CompactProtocol_read_BigString                             174.46ns    5.73M
  CompactProtocol_read_BigBinary                              59.77ns   16.73M
  CompactProtocol_read_LargeBinary                            60.81ns   16.44M
  CompactProtocol_read_Mixed                                  42.70ns   23.42M
  CompactProtocol_read_SmallListInt                           66.89ns   14.95M
  CompactProtocol_read_BigListInt                             25.08us   39.87K
  CompactProtocol_read_BigListMixed                          427.93us    2.34K
  CompactProtocol_read_LargeListMixed                         56.11ms    17.82
  ============================================================================

Reviewed By: yfeldblum

Differential Revision: D6635325

fbshipit-source-id: 393fc1005689042977c03f37f5a898ebe7814d44

folly/io/Cursor.h

index fcb97c928f3d178df5e7dc0ff5a743f7978e1777..bdaeabe3d292bf0a9d67f2b3e9be1a9fd34b3783 100644 (file)
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #pragma once
 
 #include <cassert>
@@ -58,7 +57,12 @@ class CursorBase {
   // Make all the templated classes friends for copy constructor.
   template <class D, typename B> friend class CursorBase;
  public:
-  explicit CursorBase(BufType* buf) : crtBuf_(buf), buffer_(buf) { }
+  explicit CursorBase(BufType* buf) : crtBuf_(buf), buffer_(buf) {
+    if (crtBuf_) {
+      crtPos_ = crtBegin_ = crtBuf_->data();
+      crtEnd_ = crtBuf_->tail();
+    }
+  }
 
   /**
    * Copy constructor.
@@ -68,9 +72,11 @@ class CursorBase {
    */
   template <class OtherDerived, class OtherBuf>
   explicit CursorBase(const CursorBase<OtherDerived, OtherBuf>& cursor)
-    : crtBuf_(cursor.crtBuf_),
-      offset_(cursor.offset_),
-      buffer_(cursor.buffer_) { }
+      : crtBuf_(cursor.crtBuf_),
+        crtBegin_(cursor.crtBegin_),
+        crtEnd_(cursor.crtEnd_),
+        crtPos_(cursor.crtPos_),
+        buffer_(cursor.buffer_) {}
 
   /**
    * Reset cursor to point to a new buffer.
@@ -78,11 +84,14 @@ class CursorBase {
   void reset(BufType* buf) {
     crtBuf_ = buf;
     buffer_ = buf;
-    offset_ = 0;
+    if (crtBuf_) {
+      crtPos_ = crtBegin_ = crtBuf_->data();
+      crtEnd_ = crtBuf_->tail();
+    }
   }
 
   const uint8_t* data() const {
-    return crtBuf_->data() + offset_;
+    return crtPos_;
   }
 
   /**
@@ -94,7 +103,7 @@ class CursorBase {
    * pointing at the end of a buffer.
    */
   size_t length() const {
-    return crtBuf_->length() - offset_;
+    return crtEnd_ - crtPos_;
   }
 
   /**
@@ -102,10 +111,10 @@ class CursorBase {
    */
   size_t totalLength() const {
     if (crtBuf_ == buffer_) {
-      return crtBuf_->computeChainDataLength() - offset_;
+      return crtBuf_->computeChainDataLength() - (crtPos_ - crtBegin_);
     }
     CursorBase end(buffer_->prev());
-    end.offset_ = end.buffer_->length();
+    end.crtPos_ = end.crtEnd_;
     return end - *this;
   }
 
@@ -135,7 +144,7 @@ class CursorBase {
    */
   bool isAtEnd() const {
     // Check for the simple cases first.
-    if (offset_ != crtBuf_->length()) {
+    if (crtPos_ != crtEnd_) {
       return false;
     }
     if (crtBuf_ == buffer_->prev()) {
@@ -158,7 +167,8 @@ class CursorBase {
    * Advances the cursor to the end of the entire IOBuf chain.
    */
   void advanceToEnd() {
-    offset_ = buffer_->prev()->length();
+    crtBegin_ = buffer_->prev()->data();
+    crtPos_ = crtEnd_ = buffer_->prev()->tail();
     if (crtBuf_ != buffer_->prev()) {
       crtBuf_ = buffer_->prev();
       static_cast<Derived*>(this)->advanceDone();
@@ -194,7 +204,23 @@ class CursorBase {
    * same IOBuf chain.
    */
   bool operator==(const Derived& other) const {
-    return (offset_ == other.offset_) && (crtBuf_ == other.crtBuf_);
+    const IOBuf* crtBuf = crtBuf_;
+    auto crtPos = crtPos_;
+    // We can be pointing to the end of a buffer chunk, find first non-empty.
+    while (crtPos == crtBuf->tail() && crtBuf != buffer_->prev()) {
+      crtBuf = crtBuf->next();
+      crtPos = crtBuf->data();
+    }
+
+    const IOBuf* crtBufOther = other.crtBuf_;
+    auto crtPosOther = other.crtPos_;
+    // We can be pointing to the end of a buffer chunk, find first non-empty.
+    while (crtPosOther == crtBufOther->tail() &&
+           crtBufOther != other.buffer_->prev()) {
+      crtBufOther = crtBufOther->next();
+      crtPosOther = crtBufOther->data();
+    }
+    return (crtPos == crtPosOther) && (crtBuf == crtBufOther);
   }
   bool operator!=(const Derived& other) const {
     return !operator==(other);
@@ -203,10 +229,9 @@ class CursorBase {
   template <class T>
   typename std::enable_if<std::is_arithmetic<T>::value, bool>::type tryRead(
       T& val) {
-    if (LIKELY(length() >= sizeof(T))) {
+    if (LIKELY(crtPos_ + sizeof(T) <= crtEnd_)) {
       val = loadUnaligned<T>(data());
-      offset_ += sizeof(T);
-      advanceBufferIfEmpty();
+      crtPos_ += sizeof(T);
       return true;
     }
     return pullAtMostSlow(&val, sizeof(T)) == sizeof(T);
@@ -228,11 +253,13 @@ class CursorBase {
 
   template <class T>
   T read() {
-    T val{};
-    if (!tryRead(val)) {
-      std::__throw_out_of_range("underflow");
+    if (LIKELY(crtPos_ + sizeof(T) <= crtEnd_)) {
+      T val = loadUnaligned<T>(data());
+      crtPos_ += sizeof(T);
+      return val;
+    } else {
+      return readSlow<T>();
     }
-    return val;
   }
 
   template <class T>
@@ -257,8 +284,7 @@ class CursorBase {
     str.reserve(len);
     if (LIKELY(length() >= len)) {
       str.append(reinterpret_cast<const char*>(data()), len);
-      offset_ += len;
-      advanceBufferIfEmpty();
+      crtPos_ += len;
     } else {
       readFixedStringSlow(&str, len);
     }
@@ -307,34 +333,41 @@ class CursorBase {
   void skipWhile(const Predicate& predicate);
 
   size_t skipAtMost(size_t len) {
-    if (LIKELY(length() >= len)) {
-      offset_ += len;
-      advanceBufferIfEmpty();
+    if (LIKELY(crtPos_ + len < crtEnd_)) {
+      crtPos_ += len;
       return len;
     }
     return skipAtMostSlow(len);
   }
 
   void skip(size_t len) {
-    if (LIKELY(length() >= len)) {
-      offset_ += len;
-      advanceBufferIfEmpty();
+    if (LIKELY(crtPos_ + len < crtEnd_)) {
+      crtPos_ += len;
     } else {
       skipSlow(len);
     }
   }
 
+  /**
+   * Skip bytes in the current IOBuf without advancing to the next one.
+   * Precondition: length() >= len
+   */
+  void skipNoAdvance(size_t len) {
+    DCHECK_LE(len, length());
+    crtPos_ += len;
+  }
+
   size_t retreatAtMost(size_t len) {
-    if (len <= offset_) {
-      offset_ -= len;
+    if (len <= static_cast<size_t>(crtPos_ - crtBegin_)) {
+      crtPos_ -= len;
       return len;
     }
     return retreatAtMostSlow(len);
   }
 
   void retreat(size_t len) {
-    if (len <= offset_) {
-      offset_ -= len;
+    if (len <= static_cast<size_t>(crtPos_ - crtBegin_)) {
+      crtPos_ -= len;
     } else {
       retreatSlow(len);
     }
@@ -342,20 +375,18 @@ class CursorBase {
 
   size_t pullAtMost(void* buf, size_t len) {
     // Fast path: it all fits in one buffer.
-    if (LIKELY(length() >= len)) {
+    if (LIKELY(crtPos_ + len <= crtEnd_)) {
       memcpy(buf, data(), len);
-      offset_ += len;
-      advanceBufferIfEmpty();
+      crtPos_ += len;
       return len;
     }
     return pullAtMostSlow(buf, len);
   }
 
   void pull(void* buf, size_t len) {
-    if (LIKELY(length() >= len)) {
+    if (LIKELY(crtPos_ + len <= crtEnd_)) {
       memcpy(buf, data(), len);
-      offset_ += len;
-      advanceBufferIfEmpty();
+      crtPos_ += len;
     } else {
       pullSlow(buf, len);
     }
@@ -399,6 +430,9 @@ class CursorBase {
   }
 
   size_t cloneAtMost(folly::IOBuf& buf, size_t len) {
+    // We might be at the end of buffer.
+    advanceBufferIfEmpty();
+
     std::unique_ptr<folly::IOBuf> tmp;
     size_t copied = 0;
     for (int loopCount = 0; true; ++loopCount) {
@@ -407,26 +441,26 @@ class CursorBase {
       if (LIKELY(available >= len)) {
         if (loopCount == 0) {
           crtBuf_->cloneOneInto(buf);
-          buf.trimStart(offset_);
+          buf.trimStart(crtPos_ - crtBegin_);
           buf.trimEnd(buf.length() - len);
         } else {
           tmp = crtBuf_->cloneOne();
-          tmp->trimStart(offset_);
+          tmp->trimStart(crtPos_ - crtBegin_);
           tmp->trimEnd(tmp->length() - len);
           buf.prependChain(std::move(tmp));
         }
 
-        offset_ += len;
+        crtPos_ += len;
         advanceBufferIfEmpty();
         return copied + len;
       }
 
       if (loopCount == 0) {
         crtBuf_->cloneOneInto(buf);
-        buf.trimStart(offset_);
+        buf.trimStart(crtPos_ - crtBegin_);
       } else {
         tmp = crtBuf_->cloneOne();
-        tmp->trimStart(offset_);
+        tmp->trimStart(crtPos_ - crtBegin_);
         buf.prependChain(std::move(tmp));
       }
 
@@ -453,7 +487,7 @@ class CursorBase {
     size_t len = 0;
 
     if (otherBuf != crtBuf_) {
-      len += otherBuf->length() - other.offset_;
+      len += other.crtEnd_ - other.crtPos_;
 
       for (otherBuf = otherBuf->next();
            otherBuf != crtBuf_ && otherBuf != other.buffer_;
@@ -465,13 +499,13 @@ class CursorBase {
         std::__throw_out_of_range("wrap-around");
       }
 
-      len += offset_;
+      len += crtPos_ - crtBegin_;
     } else {
-      if (offset_ < other.offset_) {
+      if (crtPos_ < other.crtPos_) {
         std::__throw_out_of_range("underflow");
       }
 
-      len += offset_ - other.offset_;
+      len += crtPos_ - other.crtPos_;
     }
 
     return len;
@@ -492,7 +526,7 @@ class CursorBase {
       }
     }
 
-    len += offset_;
+    len += crtPos_ - crtBegin_;
     return len;
   }
 
@@ -506,37 +540,48 @@ class CursorBase {
   bool tryAdvanceBuffer() {
     BufType* nextBuf = crtBuf_->next();
     if (UNLIKELY(nextBuf == buffer_)) {
-      offset_ = crtBuf_->length();
+      crtPos_ = crtEnd_;
       return false;
     }
 
-    offset_ = 0;
     crtBuf_ = nextBuf;
+    crtPos_ = crtBegin_ = crtBuf_->data();
+    crtEnd_ = crtBuf_->tail();
     static_cast<Derived*>(this)->advanceDone();
     return true;
   }
 
   bool tryRetreatBuffer() {
     if (UNLIKELY(crtBuf_ == buffer_)) {
-      offset_ = 0;
+      crtPos_ = crtBegin_;
       return false;
     }
     crtBuf_ = crtBuf_->prev();
-    offset_ = crtBuf_->length();
+    crtBegin_ = crtBuf_->data();
+    crtPos_ = crtEnd_ = crtBuf_->tail();
     static_cast<Derived*>(this)->advanceDone();
     return true;
   }
 
   void advanceBufferIfEmpty() {
-    if (length() == 0) {
+    if (crtPos_ == crtEnd_) {
       tryAdvanceBuffer();
     }
   }
 
   BufType* crtBuf_;
-  size_t offset_ = 0;
+  const uint8_t* crtBegin_{nullptr};
+  const uint8_t* crtEnd_{nullptr};
+  const uint8_t* crtPos_{nullptr};
 
  private:
+  template <class T>
+  FOLLY_NOINLINE T readSlow() {
+    T val;
+    pullSlow(&val, sizeof(T));
+    return val;
+  }
+
   void readFixedStringSlow(std::string* str, size_t len) {
     for (size_t available; (available = length()) < len; ) {
       str->append(reinterpret_cast<const char*>(data()), available);
@@ -546,7 +591,7 @@ class CursorBase {
       len -= available;
     }
     str->append(reinterpret_cast<const char*>(data()), len);
-    offset_ += len;
+    crtPos_ += len;
     advanceBufferIfEmpty();
   }
 
@@ -563,7 +608,7 @@ class CursorBase {
       len -= available;
     }
     memcpy(p, data(), len);
-    offset_ += len;
+    crtPos_ += len;
     advanceBufferIfEmpty();
     return copied + len;
   }
@@ -583,7 +628,7 @@ class CursorBase {
       }
       len -= available;
     }
-    offset_ += len;
+    crtPos_ += len;
     advanceBufferIfEmpty();
     return skipped + len;
   }
@@ -596,14 +641,14 @@ class CursorBase {
 
   size_t retreatAtMostSlow(size_t len) {
     size_t retreated = 0;
-    for (size_t available; (available = offset_) < len;) {
+    for (size_t available; (available = crtPos_ - crtBegin_) < len;) {
       retreated += available;
       if (UNLIKELY(!tryRetreatBuffer())) {
         return retreated;
       }
       len -= available;
     }
-    offset_ -= len;
+    crtPos_ -= len;
     return retreated + len;
   }
 
@@ -748,11 +793,19 @@ class RWCursor
     if (this->crtBuf_ != this->head() && this->totalLength() < n) {
       throw std::overflow_error("cannot gather() past the end of the chain");
     }
-    this->crtBuf_->gather(this->offset_ + n);
+    size_t offset = this->crtPos_ - this->crtBegin_;
+    this->crtBuf_->gather(offset + n);
+    this->crtBegin_ = this->crtBuf_->data();
+    this->crtEnd_ = this->crtBuf_->tail();
+    this->crtPos_ = this->crtBegin_ + offset;
   }
   void gatherAtMost(size_t n) {
     size_t size = std::min(n, this->totalLength());
-    return this->crtBuf_->gather(this->offset_ + size);
+    size_t offset = this->crtPos_ - this->crtBegin_;
+    this->crtBuf_->gather(offset + size);
+    this->crtBegin_ = this->crtBuf_->data();
+    this->crtEnd_ = this->crtBuf_->tail();
+    this->crtPos_ = this->crtBegin_ + offset;
   }
 
   using detail::Writable<RWCursor<access>>::pushAtMost;
@@ -774,7 +827,7 @@ class RWCursor
           maybeUnshare();
         }
         memcpy(writableData(), buf, len);
-        this->offset_ += len;
+        this->crtPos_ += len;
         return copied + len;
       }
 
@@ -793,16 +846,16 @@ class RWCursor
 
   void insert(std::unique_ptr<folly::IOBuf> buf) {
     folly::IOBuf* nextBuf;
-    if (this->offset_ == 0) {
+    if (this->crtPos_ == this->crtBegin_) {
       // Can just prepend
       nextBuf = this->crtBuf_;
       this->crtBuf_->prependChain(std::move(buf));
     } else {
       std::unique_ptr<folly::IOBuf> remaining;
-      if (this->crtBuf_->length() - this->offset_ > 0) {
+      if (this->crtPos_ != this->crtEnd_) {
         // Need to split current IOBuf in two.
         remaining = this->crtBuf_->cloneOne();
-        remaining->trimStart(this->offset_);
+        remaining->trimStart(this->crtPos_ - this->crtBegin_);
         nextBuf = remaining.get();
         buf->prependChain(std::move(remaining));
       } else {
@@ -811,20 +864,26 @@ class RWCursor
       }
       this->crtBuf_->trimEnd(this->length());
       this->crtBuf_->appendChain(std::move(buf));
+
+      // Jump past the new links
+      this->crtBuf_ = nextBuf;
+      this->crtPos_ = this->crtBegin_ = this->crtBuf_->data();
+      this->crtEnd_ = this->crtBuf_->tail();
     }
-    // Jump past the new links
-    this->offset_ = 0;
-    this->crtBuf_ = nextBuf;
   }
 
   uint8_t* writableData() {
-    return this->crtBuf_->writableData() + this->offset_;
+    return this->crtBuf_->writableData() + (this->crtPos_ - this->crtBegin_);
   }
 
  private:
   void maybeUnshare() {
     if (UNLIKELY(maybeShared_)) {
+      size_t offset = this->crtPos_ - this->crtBegin_;
       this->crtBuf_->unshareOne();
+      this->crtBegin_ = this->crtBuf_->data();
+      this->crtEnd_ = this->crtBuf_->tail();
+      this->crtPos_ = this->crtBegin_ + offset;
       maybeShared_ = false;
     }
   }