various perf improvements
authorJames Sedgwick <jsedgwick@fb.com>
Tue, 16 Jun 2015 17:30:18 +0000 (10:30 -0700)
committerSara Golemon <sgolemon@fb.com>
Wed, 17 Jun 2015 17:26:16 +0000 (10:26 -0700)
Summary: Three strategies
1. Optimistic locking
2. Acquire-release memory ordering instead of full sequential consistency
3. Some low-hanging branch miss optimizations

Please review carefully; the dogscience is strong with this one

```
Before:

============================================================================
folly/futures/test/Benchmark.cpp                relative  time/iter  iters/s
============================================================================
constantFuture                                             127.99ns    7.81M
promiseAndFuture                                  94.89%   134.89ns    7.41M
withThen                                          28.40%   450.63ns    2.22M
----------------------------------------------------------------------------
oneThen                                                    446.68ns    2.24M
twoThens                                          58.35%   765.55ns    1.31M
fourThens                                         31.87%     1.40us  713.41K
hundredThens                                       1.61%    27.78us   35.99K
----------------------------------------------------------------------------
no_contention                                                4.63ms   216.00
contention                                        80.79%     5.73ms   174.52
----------------------------------------------------------------------------
throwAndCatch                                               10.91us   91.64K
throwAndCatchWrapped                             127.14%     8.58us  116.51K
throwWrappedAndCatch                             178.22%     6.12us  163.32K
throwWrappedAndCatchWrapped                      793.75%     1.37us  727.38K
----------------------------------------------------------------------------
throwAndCatchContended                                        1.35s  741.33m
throwAndCatchWrappedContended                    139.18%   969.23ms     1.03
throwWrappedAndCatchContended                    169.51%   795.76ms     1.26
throwWrappedAndCatchWrappedContended            17742.23%     7.60ms   131.53
----------------------------------------------------------------------------
complexUnit                                                127.50us    7.84K
complexBlob4                                     100.14%   127.32us    7.85K
complexBlob8                                     100.16%   127.30us    7.86K
complexBlob64                                     96.45%   132.19us    7.57K
complexBlob128                                    92.83%   137.35us    7.28K
complexBlob256                                    87.79%   145.23us    6.89K
complexBlob512                                    81.64%   156.18us    6.40K
complexBlob1024                                   72.54%   175.76us    5.69K
complexBlob2048                                   58.52%   217.89us    4.59K
complexBlob4096                                   32.54%   391.78us    2.55K
============================================================================

After:
============================================================================
folly/futures/test/Benchmark.cpp                relative  time/iter  iters/s
============================================================================
constantFuture                                              85.28ns   11.73M
promiseAndFuture                                  88.63%    96.22ns   10.39M
withThen                                          30.46%   279.99ns    3.57M
----------------------------------------------------------------------------
oneThen                                                    231.18ns    4.33M
twoThens                                          60.57%   381.70ns    2.62M
fourThens                                         33.52%   689.71ns    1.45M
hundredThens                                       1.49%    15.48us   64.58K
----------------------------------------------------------------------------
no_contention                                                3.84ms   260.19
contention                                        88.29%     4.35ms   229.73
----------------------------------------------------------------------------
throwAndCatch                                               10.63us   94.06K
throwAndCatchWrapped                             127.17%     8.36us  119.61K
throwWrappedAndCatch                             179.83%     5.91us  169.15K
throwWrappedAndCatchWrapped                     1014.48%     1.05us  954.19K
----------------------------------------------------------------------------
throwAndCatchContended                                        1.34s  749.03m
throwAndCatchWrappedContended                    140.66%   949.16ms     1.05
throwWrappedAndCatchContended                    164.87%   809.77ms     1.23
throwWrappedAndCatchWrappedContended            49406.39%     2.70ms   370.07
----------------------------------------------------------------------------
complexUnit                                                 86.83us   11.52K
complexBlob4                                      97.42%    89.12us   11.22K
complexBlob8                                      96.63%    89.85us   11.13K
complexBlob64                                     92.53%    93.84us   10.66K
complexBlob128                                    90.85%    95.57us   10.46K
complexBlob256                                    82.56%   105.17us    9.51K
complexBlob512                                    74.13%   117.12us    8.54K
complexBlob1024                                   63.67%   136.37us    7.33K
complexBlob2048                                   50.25%   172.79us    5.79K
complexBlob4096                                   26.63%   326.05us    3.07K
============================================================================
```

Reviewed By: @djwatson

Differential Revision: D2139822

folly/futures/Future-inl.h
folly/futures/Promise-inl.h
folly/futures/Promise.h
folly/futures/Try-inl.h
folly/futures/detail/Core.h
folly/futures/detail/FSM.h
folly/futures/test/Benchmark.cpp

index 2189585bec8ea238dbc8b37e396d45c1a410799d..0d6679d8b49a4539f24e45cd0653e4bcc83eb37e 100644 (file)
@@ -106,14 +106,12 @@ Future<T>::thenImplementation(F func, detail::argResult<isTry, F, Args...>) {
 
   // wrap these so we can move them into the lambda
   folly::MoveWrapper<Promise<B>> p;
-  p->setInterruptHandler(core_->getInterruptHandler());
+  p->core_->setInterruptHandlerNoLock(core_->getInterruptHandler());
   folly::MoveWrapper<F> funcm(std::forward<F>(func));
 
   // grab the Future now before we lose our handle on the Promise
   auto f = p->getFuture();
-  if (getExecutor()) {
-    f.setExecutor(getExecutor());
-  }
+  f.core_->setExecutorNoLock(getExecutor());
 
   /* This is a bit tricky.
 
@@ -174,13 +172,12 @@ Future<T>::thenImplementation(F func, detail::argResult<isTry, F, Args...>) {
 
   // wrap these so we can move them into the lambda
   folly::MoveWrapper<Promise<B>> p;
+  p->core_->setInterruptHandlerNoLock(core_->getInterruptHandler());
   folly::MoveWrapper<F> funcm(std::forward<F>(func));
 
   // grab the Future now before we lose our handle on the Promise
   auto f = p->getFuture();
-  if (getExecutor()) {
-    f.setExecutor(getExecutor());
-  }
+  f.core_->setExecutorNoLock(getExecutor());
 
   setCallback_(
     [p, funcm](Try<T>&& t) mutable {
index 245b0d6fee6325472c670f818b92dbbd30a5b8b2..751d9b4cb8d36f18a215ad600b331dace8aa9417 100644 (file)
@@ -44,16 +44,19 @@ Promise<T>& Promise<T>::operator=(Promise<T>&& other) noexcept {
 
 template <class T>
 void Promise<T>::throwIfFulfilled() {
-  if (!core_)
+  if (UNLIKELY(!core_)) {
     throw NoState();
-  if (core_->ready())
+  }
+  if (UNLIKELY(core_->ready())) {
     throw PromiseAlreadySatisfied();
+  }
 }
 
 template <class T>
 void Promise<T>::throwIfRetrieved() {
-  if (retrieved_)
+  if (UNLIKELY(retrieved_)) {
     throw FutureAlreadyRetrieved();
+  }
 }
 
 template <class T>
index 27b20bd830f816898a8ffface7b2805fab46d5b3..c6af9168eddcd39e91c8ecce59fb3d5568064ac6 100644 (file)
@@ -103,6 +103,7 @@ public:
 
 private:
   typedef typename Future<T>::corePtr corePtr;
+  template <class> friend class Future;
 
   // Whether the Future has been retrieved (a one-time operation).
   bool retrieved_;
index 22bc87796a2c46e6709c92773032c8e5302b412c..fac9cc56c7d1c79bd1e1c34804e01a9954e38517 100644 (file)
@@ -79,9 +79,9 @@ Try<T>& Try<T>::operator=(const Try<T>& t) {
 
 template <class T>
 Try<T>::~Try() {
-  if (contains_ == Contains::VALUE) {
+  if (LIKELY(contains_ == Contains::VALUE)) {
     value_.~T();
-  } else if (contains_ == Contains::EXCEPTION) {
+  } else if (UNLIKELY(contains_ == Contains::EXCEPTION)) {
     e_.~unique_ptr<exception_wrapper>();
   }
 }
index 27842b55d2e92062359b341d1dea5f8ff6ded81f..0265159120a5b4b30a085c24e699a4c9458ccf02 100644 (file)
@@ -81,12 +81,12 @@ class Core {
   Core() {}
 
   explicit Core(Try<T>&& t)
-    : fsm_(State::OnlyResult),
-      attached_(1),
-      result_(std::move(t)) {}
+    : result_(std::move(t)),
+      fsm_(State::OnlyResult),
+      attached_(1) {}
 
   ~Core() {
-    assert(attached_ == 0);
+    DCHECK(attached_ == 0);
   }
 
   // not copyable
@@ -212,7 +212,7 @@ class Core {
   void detachPromise() {
     // detachPromise() and setResult() should never be called in parallel
     // so we don't need to protect this.
-    if (!result_) {
+    if (UNLIKELY(!result_)) {
       setResult(Try<T>(exception_wrapper(BrokenPromise())));
     }
     detachOne();
@@ -220,63 +220,89 @@ class Core {
 
   /// May call from any thread
   void deactivate() {
-    active_ = false;
+    active_.store(false, std::memory_order_release);
   }
 
   /// May call from any thread
   void activate() {
-    active_ = true;
+    active_.store(true, std::memory_order_release);
     maybeCallback();
   }
 
   /// May call from any thread
-  bool isActive() { return active_; }
+  bool isActive() { return active_.load(std::memory_order_acquire); }
 
   /// Call only from Future thread
-  void setExecutor(Executor* x, int8_t priority) {
-    folly::MSLGuard g(executorLock_);
+  void setExecutor(Executor* x, int8_t priority = Executor::MID_PRI) {
+    if (!executorLock_.try_lock()) {
+      executorLock_.lock();
+    }
+    executor_ = x;
+    priority_ = priority;
+    executorLock_.unlock();
+  }
+
+  void setExecutorNoLock(Executor* x, int8_t priority = Executor::MID_PRI) {
     executor_ = x;
     priority_ = priority;
   }
 
   Executor* getExecutor() {
-    folly::MSLGuard g(executorLock_);
     return executor_;
   }
 
   /// Call only from Future thread
   void raise(exception_wrapper e) {
-    folly::MSLGuard guard(interruptLock_);
+    if (!interruptLock_.try_lock()) {
+      interruptLock_.lock();
+    }
     if (!interrupt_ && !hasResult()) {
       interrupt_ = folly::make_unique<exception_wrapper>(std::move(e));
       if (interruptHandler_) {
         interruptHandler_(*interrupt_);
       }
     }
+    interruptLock_.unlock();
   }
 
   std::function<void(exception_wrapper const&)> getInterruptHandler() {
-    folly::MSLGuard guard(interruptLock_);
-    return interruptHandler_;
+    if (!interruptHandlerSet_.load(std::memory_order_acquire)) {
+      return nullptr;
+    }
+    if (!interruptLock_.try_lock()) {
+      interruptLock_.lock();
+    }
+    auto handler = interruptHandler_;
+    interruptLock_.unlock();
+    return handler;
   }
 
   /// Call only from Promise thread
   void setInterruptHandler(std::function<void(exception_wrapper const&)> fn) {
-    folly::MSLGuard guard(interruptLock_);
+    if (!interruptLock_.try_lock()) {
+      interruptLock_.lock();
+    }
     if (!hasResult()) {
       if (interrupt_) {
         fn(*interrupt_);
       } else {
-        interruptHandler_ = std::move(fn);
+        setInterruptHandlerNoLock(std::move(fn));
       }
     }
+    interruptLock_.unlock();
+  }
+
+  void setInterruptHandlerNoLock(
+      std::function<void(exception_wrapper const&)> fn) {
+    interruptHandlerSet_.store(true, std::memory_order_relaxed);
+    interruptHandler_ = std::move(fn);
   }
 
  protected:
   void maybeCallback() {
     FSM_START(fsm_)
       case State::Armed:
-        if (active_) {
+        if (active_.load(std::memory_order_acquire)) {
           FSM_UPDATE2(fsm_, State::Done, []{}, [this]{ this->doCallback(); });
         }
         FSM_BREAK
@@ -289,17 +315,20 @@ class Core {
   void doCallback() {
     RequestContext::setContext(context_);
 
-    // TODO(6115514) semantic race on reading executor_ and setExecutor()
-    Executor* x;
+    Executor* x = executor_;
     int8_t priority;
-    {
-      folly::MSLGuard g(executorLock_);
+    if (x) {
+      if (!executorLock_.try_lock()) {
+        executorLock_.lock();
+      }
       x = executor_;
       priority = priority_;
+      executorLock_.unlock();
     }
 
     if (x) {
-      ++attached_; // keep Core alive until executor did its thing
+      // keep Core alive until executor did its thing
+      ++attached_;
       try {
         if (LIKELY(x->getNumPriorities() == 1)) {
           x->add([this]() mutable {
@@ -330,17 +359,21 @@ class Core {
     }
   }
 
+  // lambdaBuf occupies exactly one cache line
+  static constexpr size_t lambdaBufSize = 8 * sizeof(void*);
+  char lambdaBuf_[lambdaBufSize];
+  // place result_ next to increase the likelihood that the value will be
+  // contained entirely in one cache line
+  folly::Optional<Try<T>> result_ {};
+  std::function<void(Try<T>&&)> callback_ {nullptr};
   FSM<State> fsm_ {State::Start};
   std::atomic<unsigned char> attached_ {2};
   std::atomic<bool> active_ {true};
+  std::atomic<bool> interruptHandlerSet_ {false};
   folly::MicroSpinLock interruptLock_ {0};
   folly::MicroSpinLock executorLock_ {0};
   int8_t priority_ {-1};
   Executor* executor_ {nullptr};
-  folly::Optional<Try<T>> result_ {};
-  std::function<void(Try<T>&&)> callback_ {nullptr};
-  static constexpr size_t lambdaBufSize = 8 * sizeof(void*);
-  char lambdaBuf_[lambdaBufSize];
   std::shared_ptr<RequestContext> context_ {nullptr};
   std::unique_ptr<exception_wrapper> interrupt_ {};
   std::function<void(exception_wrapper const&)> interruptHandler_ {nullptr};
index 0d4ffbed462c9dfb2865ac31df5354a6a392b019..ce77c551173637f85efe3f8624432c577b172056 100644 (file)
@@ -44,7 +44,7 @@ public:
   explicit FSM(Enum startState) : state_(startState) {}
 
   Enum getState() const {
-    return state_.load(std::memory_order_relaxed);
+    return state_.load(std::memory_order_acquire);
   }
 
   /// Atomically do a state transition with accompanying action.
@@ -52,10 +52,16 @@ public:
   /// @returns true on success, false and action unexecuted otherwise
   template <class F>
   bool updateState(Enum A, Enum B, F const& action) {
-    std::lock_guard<Mutex> lock(mutex_);
-    if (state_ != A) return false;
+    if (!mutex_.try_lock()) {
+      mutex_.lock();
+    }
+    if (state_.load(std::memory_order_relaxed) != A) {
+      mutex_.unlock();
+      return false;
+    }
     action();
-    state_ = B;
+    state_.store(B, std::memory_order_relaxed);
+    mutex_.unlock();
     return true;
   }
 
index 386e8b9d02f7d9e51cfcf2aef3196095c1caeee2..8721622b565bdeb4fa01191d00ad8b0a5a1c1a88 100644 (file)
@@ -20,6 +20,7 @@
 #include <folly/Benchmark.h>
 #include <folly/futures/Future.h>
 #include <folly/futures/Promise.h>
+#include <folly/futures/InlineExecutor.h>
 
 #include <semaphore.h>
 #include <vector>
@@ -290,6 +291,98 @@ BENCHMARK_RELATIVE(throwWrappedAndCatchWrappedContended) {
   contend(throwWrappedAndCatchWrappedImpl);
 }
 
+InlineExecutor exe;
+
+template <class T>
+Future<T> fGen() {
+  Promise<T> p;
+  auto f = p.getFuture()
+    .then([] (T&& t) {
+      return std::move(t);
+    })
+    .then([] (T&& t) {
+      return makeFuture(std::move(t));
+    })
+    .via(&exe)
+    .then([] (T&& t) {
+      return std::move(t);
+    })
+    .then([] (T&& t) {
+      return makeFuture(std::move(t));
+    });
+  p.setValue(T());
+  return f;
+}
+
+template <class T>
+std::vector<Future<T>> fsGen() {
+  std::vector<Future<T>> fs;
+  for (auto i = 0; i < 10; i++) {
+    fs.push_back(fGen<T>());
+  }
+  return fs;
+}
+
+template <class T>
+void complexBenchmark() {
+  collect(fsGen<T>());
+  collectAll(fsGen<T>());
+  collectAny(fsGen<T>());
+  futures::map(fsGen<T>(), [] (const T& t) {
+    return t;
+  });
+  futures::map(fsGen<T>(), [] (const T& t) {
+    return makeFuture(T(t));
+  });
+}
+
+BENCHMARK_DRAW_LINE();
+
+template <size_t S>
+struct Blob {
+  char buf[S];
+};
+
+BENCHMARK(complexUnit) {
+  complexBenchmark<Unit>();
+}
+
+BENCHMARK_RELATIVE(complexBlob4) {
+  complexBenchmark<Blob<4>>();
+}
+
+BENCHMARK_RELATIVE(complexBlob8) {
+  complexBenchmark<Blob<8>>();
+}
+
+BENCHMARK_RELATIVE(complexBlob64) {
+  complexBenchmark<Blob<64>>();
+}
+
+BENCHMARK_RELATIVE(complexBlob128) {
+  complexBenchmark<Blob<128>>();
+}
+
+BENCHMARK_RELATIVE(complexBlob256) {
+  complexBenchmark<Blob<256>>();
+}
+
+BENCHMARK_RELATIVE(complexBlob512) {
+  complexBenchmark<Blob<512>>();
+}
+
+BENCHMARK_RELATIVE(complexBlob1024) {
+  complexBenchmark<Blob<1024>>();
+}
+
+BENCHMARK_RELATIVE(complexBlob2048) {
+  complexBenchmark<Blob<2048>>();
+}
+
+BENCHMARK_RELATIVE(complexBlob4096) {
+  complexBenchmark<Blob<4096>>();
+}
+
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
   folly::runBenchmarks();