Improve QueueAppender/IOBufQueue performance
authorStepan Palamarchuk <stepan@fb.com>
Mon, 4 Dec 2017 23:08:24 +0000 (15:08 -0800)
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>
Mon, 4 Dec 2017 23:21:07 +0000 (15:21 -0800)
commit64072ab5f788cd1081bdd98ba6dbaabbcd1fafbd
tree4d0619f6d95a489004f2e095d1e8de04260e5733
parent7acf192db1befc5a54c8a77e2450176fe587fa1f
Improve QueueAppender/IOBufQueue performance

Summary:
Currently QueueAppender needs to follow a chain of 4 indirections (QueueAppender->IOBufQueue->IOBuf(head)->IOBuf(tail)->data).
This diff adds a cache of writable tail range in IOBufQueue and allows it to be placed externally.

Before this diff on hot path QueueAppender::write<signed char> was ~167 bytes of code (with majority being actually executed), after this diff it's down to ~30 bytes:
  0x0000000000419d10 <+0>:     mov    (%rdi),%rax
  0x0000000000419d13 <+3>:     cmp    %rax,0x8(%rdi)
  0x0000000000419d17 <+7>:     je     0x419d28 <folly::io::QueueAppender::write<signed char>(signed char)+24>
  0x0000000000419d19 <+9>:     mov    %sil,(%rax)
  0x0000000000419d1c <+12>:    addq   $0x1,(%rdi)
  0x0000000000419d20 <+16>:    retq
  0x0000000000419d21 <+17>:    nopl   0x0(%rax)
  0x0000000000419d28 <+24>:    movsbl %sil,%esi
  0x0000000000419d2c <+28>:    jmpq   0x419ca0 <folly::io::QueueAppender::writeSlow<signed char>(signed char)>

With this diff, Thrift serialization performance is improved up to 2x with production workloads (2x for compact, 3x for binary).

Thrift benchmark output:
Before:
  ============================================================================
  thrift/lib/cpp2/test/ProtocolBench.cpp          relative  time/iter  iters/s
  ============================================================================
  BinaryProtocol_write_Empty                                  58.05ns   17.23M
  BinaryProtocol_write_SmallInt                               75.17ns   13.30M
  BinaryProtocol_write_BigInt                                 74.60ns   13.41M
  BinaryProtocol_write_SmallString                            85.12ns   11.75M
  BinaryProtocol_write_BigString                             802.96ns    1.25M
  BinaryProtocol_write_BigBinary                             174.69ns    5.72M
  BinaryProtocol_write_LargeBinary                           171.81ns    5.82M
  BinaryProtocol_write_Mixed                                 130.97ns    7.64M
  BinaryProtocol_write_SmallListInt                          123.99ns    8.06M
  BinaryProtocol_write_BigListInt                             40.72us   24.56K
  BinaryProtocol_write_BigListMixed                          784.78us    1.27K
  BinaryProtocol_write_LargeListMixed                         98.84ms    10.12
  CompactProtocol_write_Empty                                 64.38ns   15.53M
  CompactProtocol_write_SmallInt                              76.74ns   13.03M
  CompactProtocol_write_BigInt                                83.62ns   11.96M
  CompactProtocol_write_SmallString                           86.05ns   11.62M
  CompactProtocol_write_BigString                            786.18ns    1.27M
  CompactProtocol_write_BigBinary                            184.91ns    5.41M
  CompactProtocol_write_LargeBinary                          182.12ns    5.49M
  CompactProtocol_write_Mixed                                120.89ns    8.27M
  CompactProtocol_write_SmallListInt                         119.74ns    8.35M
  CompactProtocol_write_BigListInt                            43.76us   22.85K
  CompactProtocol_write_BigListMixed                         595.90us    1.68K
  CompactProtocol_write_LargeListMixed                        72.80ms    13.74
  ============================================================================
After:
  ============================================================================
  thrift/lib/cpp2/test/ProtocolBench.cpp          relative  time/iter  iters/s
  ============================================================================
  BinaryProtocol_write_Empty                                  65.97ns   15.16M
  BinaryProtocol_write_SmallInt                               72.31ns   13.83M
  BinaryProtocol_write_BigInt                                 72.67ns   13.76M
  BinaryProtocol_write_SmallString                            77.56ns   12.89M
  BinaryProtocol_write_BigString                             782.07ns    1.28M
  BinaryProtocol_write_BigBinary                             179.69ns    5.57M
  BinaryProtocol_write_LargeBinary                           182.62ns    5.48M
  BinaryProtocol_write_Mixed                                  91.62ns   10.92M
  BinaryProtocol_write_SmallListInt                           96.22ns   10.39M
  BinaryProtocol_write_BigListInt                             19.65us   50.90K
  BinaryProtocol_write_BigListMixed                          245.69us    4.07K
  BinaryProtocol_write_LargeListMixed                         46.56ms    21.48
  CompactProtocol_write_Empty                                 74.44ns   13.43M
  CompactProtocol_write_SmallInt                              80.35ns   12.45M
  CompactProtocol_write_BigInt                                85.30ns   11.72M
  CompactProtocol_write_SmallString                           82.61ns   12.10M
  CompactProtocol_write_BigString                            784.77ns    1.27M
  CompactProtocol_write_BigBinary                            193.20ns    5.18M
  CompactProtocol_write_LargeBinary                          192.53ns    5.19M
  CompactProtocol_write_Mixed                                 99.78ns   10.02M
  CompactProtocol_write_SmallListInt                         104.77ns    9.54M
  CompactProtocol_write_BigListInt                            25.62us   39.03K
  CompactProtocol_write_BigListMixed                         272.42us    3.67K
  CompactProtocol_write_LargeListMixed                        38.32ms    26.09
  ============================================================================

QueueAppender Benchmark output (although not very representative due to a tight loop):
Before:
  ============================================================================
  folly/io/test/QueueAppenderBenchmark.cpp        relative  time/iter  iters/s
  ============================================================================
  write_uint8                                                 10.50us   95.20K
  write_uint16                                                 5.48us  182.49K
  write_uint32                                                 2.73us  366.22K
  push_64b                                                     9.77us  102.36K
  push_1024b                                                 112.87us    8.86K
  append                                                      64.21us   15.57K
  preallocate_postallocate_1b                                 16.34us   61.19K
  preallocate_postallocate_4b                                 15.56us   64.26K
  preallocate_postallocate_32b                                22.17us   45.11K
  preallocate_postallocate_256b                              149.55us    6.69K
  ============================================================================

After:
  ============================================================================
  folly/io/test/QueueAppenderBenchmark.cpp        relative  time/iter  iters/s
  ============================================================================
  write_uint8                                                  8.86us  112.81K
  write_uint16                                                 3.91us  255.68K
  write_uint32                                                 2.08us  481.78K
  push_64b                                                     8.24us  121.39K
  push_1024b                                                 115.50us    8.66K
  append                                                      67.52us   14.81K
  preallocate_postallocate_1b                                 13.86us   72.17K
  preallocate_postallocate_4b                                 11.67us   85.71K
  preallocate_postallocate_32b                                20.35us   49.14K
  preallocate_postallocate_256b                              148.57us    6.73K
  ============================================================================

Reviewed By: yfeldblum

Differential Revision: D6427749

fbshipit-source-id: 8495cc74b6106b15d201e37533ae4c0a1abc9d74
folly/io/Cursor.h
folly/io/IOBufQueue.cpp
folly/io/IOBufQueue.h
folly/io/test/QueueAppenderBenchmark.cpp [new file with mode: 0644]