From 79c25e6f4a7742342347b4dc23d4e237eac2aa37 Mon Sep 17 00:00:00 2001
From: Nathan Bronson
Date: Wed, 15 Jan 2014 21:20:03 -0800
Subject: [PATCH] Baton - flushing of thread-local memory during a long wait

Summary:
This diff causes Baton to reduce a thread's memory footprint when it
blocks for an extended period (by default 5 to 7.5 seconds).  Reductions
are achieved by flushing the thread-local jemalloc caches (if jemalloc
is in use) and by calling madvise(MADV_DONTNEED) on the portion of the
thread's stack that isn't active.  Once the thread resumes execution,
both of these resources will be reallocated.  Configuration is via a
system-wide default.

Test Plan:
1. new unit tests
2. manual execution of existing unit tests with a very low idleTimeout
3. peek and poke with gdb to observe madvise discarding the page

Reviewed By: davejwatson@fb.com

FB internal diff: D1146966
---
 folly/Baton.h                  |  21 +++-
 folly/Makefile.am              |   2 +
 folly/detail/MemoryIdler.cpp   | 178 ++++++++++++++++++++++
 folly/detail/MemoryIdler.h     | 143 +++++++++++++++++++++
 folly/test/MemoryIdlerTest.cpp | 208 +++++++++++++++++++++++++++
 5 files changed, 549 insertions(+), 3 deletions(-)
 create mode 100644 folly/detail/MemoryIdler.cpp
 create mode 100644 folly/detail/MemoryIdler.h
 create mode 100644 folly/test/MemoryIdlerTest.cpp

diff --git a/folly/Baton.h b/folly/Baton.h
index 4d987a99..c53afa35 100644
--- a/folly/Baton.h
+++ b/folly/Baton.h
@@ -24,6 +24,7 @@
 #include
 #include
+#include <folly/detail/MemoryIdler.h>
 
 namespace folly {
 
@@ -151,11 +152,25 @@ struct Baton : boost::noncopyable {
     }
 
     while (true) {
-      state_.futexWait(WAITING);
+      detail::MemoryIdler::futexWait(state_, WAITING);
 
       // state_ is the truth even if FUTEX_WAIT reported a matching
-      // FUTEX_WAKE, since we aren't using type-stable storage and
-      // we don't guarantee reuse
+      // FUTEX_WAKE, since we aren't using type-stable storage and we
+      // don't guarantee reuse.  The scenario goes like this: thread
+      // A's last touch of a Baton is a call to wake(), which stores
+      // LATE_DELIVERY and gets an unlucky context switch before delivering
+      // the corresponding futexWake.  Thread B sees LATE_DELIVERY
+      // without consuming a futex event, because it calls futexWait
+      // with an expected value of WAITING and hence doesn't go to sleep.
+      // B returns, so the Baton's memory is reused and becomes another
+      // Baton (or a reuse of this one).  B calls futexWait on the new
+      // Baton lifetime, then A wakes up and delivers a spurious futexWake
+      // to the same memory location.  B's futexWait will then report a
+      // consumed wake event even though state_ is still WAITING.
+      //
+      // It would be possible to add an extra state_ dance to communicate
+      // that the futexWake has been sent so that we can be sure to consume
+      // it before returning, but that would be a perf and complexity hit.
       uint32_t s = state_.load(std::memory_order_acquire);
       assert(s == WAITING || s == LATE_DELIVERY);
 
diff --git a/folly/Makefile.am b/folly/Makefile.am
index 4aed816a..e5fe6355 100644
--- a/folly/Makefile.am
+++ b/folly/Makefile.am
@@ -46,6 +46,7 @@ nobase_follyinclude_HEADERS = \
 	detail/Futex.h \
 	detail/GroupVarintDetail.h \
 	detail/Malloc.h \
+	detail/MemoryIdler.h \
 	detail/MPMCPipelineDetail.h \
 	detail/SlowFingerprint.h \
 	detail/Stats.h \
@@ -155,6 +156,7 @@ libfolly_la_SOURCES = \
 	io/IOBufQueue.cpp \
 	io/RecordIO.cpp \
 	json.cpp \
+	detail/MemoryIdler.cpp \
 	MemoryMapping.cpp \
 	Random.cpp \
 	Range.cpp \
diff --git a/folly/detail/MemoryIdler.cpp b/folly/detail/MemoryIdler.cpp
new file mode 100644
index 00000000..711bd90f
--- /dev/null
+++ b/folly/detail/MemoryIdler.cpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2014 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "MemoryIdler.h"
+#include <folly/Logging.h>
+#include <folly/Malloc.h>
+#include <folly/ScopeGuard.h>
+#include <folly/detail/CacheLocality.h>
+#include <algorithm>
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+
+// weak linking means the symbol will be null if not available, instead
+// of a link failure
+extern "C" int mallctl(const char *name, void *oldp, size_t *oldlenp,
+                       void *newp, size_t newlen)
+    __attribute__((weak));
+
+
+namespace folly { namespace detail {
+
+AtomicStruct<std::chrono::steady_clock::duration>
+MemoryIdler::defaultIdleTimeout(std::chrono::seconds(5));
+
+
+/// Calls mallctl, optionally reading and/or writing an unsigned value
+/// if in and/or out is non-null.  Logs on error.
+static unsigned mallctlWrapper(const char* cmd, const unsigned* in,
+                               unsigned* out) {
+  size_t outLen = sizeof(unsigned);
+  int err = mallctl(cmd,
+                    out, out ? &outLen : nullptr,
+                    const_cast<unsigned*>(in), in ? sizeof(unsigned) : 0);
+  if (err != 0) {
+    FB_LOG_EVERY_MS(WARNING, 10000)
+      << "mallctl " << cmd << ": " << strerror(err) << " (" << err << ")";
+  }
+  return err;
+}
+
+void MemoryIdler::flushLocalMallocCaches() {
+  if (usingJEMalloc()) {
+    if (!mallctl) {
+      FB_LOG_EVERY_MS(ERROR, 10000) << "mallctl weak link failed";
+      return;
+    }
+
+    // "tcache.flush" was renamed to "thread.tcache.flush" in jemalloc 3
+    (void)mallctlWrapper("thread.tcache.flush", nullptr, nullptr);
+
+    // By default jemalloc has 4 arenas per cpu, and then assigns each
+    // thread to one of those arenas.  This means that in any service
+    // that doesn't perform a lot of context switching, the chances that
+    // another thread will be using the current thread's arena (and hence
+    // doing the appropriate dirty-page purging) are low.  Some well-tuned
+    // configurations (such as that used by hhvm) use fewer arenas and
+    // then pin threads to avoid contended access.  In that case, purging
+    // the arenas is counter-productive.
+    // We use the heuristic that if narenas <= 2 * num_cpus then we
+    // shouldn't do anything here, which detects when the narenas has
+    // been reduced from the default
+    unsigned narenas;
+    unsigned arenaForCurrent;
+    if (mallctlWrapper("arenas.narenas", nullptr, &narenas) == 0 &&
+        narenas > 2 * CacheLocality::system().numCpus &&
+        mallctlWrapper("thread.arena", nullptr, &arenaForCurrent) == 0) {
+      (void)mallctlWrapper("arenas.purge", &arenaForCurrent, nullptr);
+    }
+  }
+}
+
+
+#ifdef __x86_64__
+
+static const size_t s_pageSize = sysconf(_SC_PAGESIZE);
+static __thread uintptr_t tls_stackLimit;
+static __thread size_t tls_stackSize;
+
+static void fetchStackLimits() {
+  pthread_attr_t attr;
+#if defined(_GNU_SOURCE) && defined(__linux__) // Linux+GNU extension
+  pthread_getattr_np(pthread_self(), &attr);
+#else
+  pthread_attr_init(&attr);
+#endif
+  SCOPE_EXIT { pthread_attr_destroy(&attr); };
+
+  void* addr;
+  size_t rawSize;
+  int err;
+  if ((err = pthread_attr_getstack(&attr, &addr, &rawSize))) {
+    // unexpected, but it is better to continue in prod than do nothing
+    FB_LOG_EVERY_MS(ERROR, 10000) << "pthread_attr_getstack error " << err;
+    assert(false);
+    tls_stackSize = 1;
+    return;
+  }
+  assert(addr != nullptr);
+  assert(rawSize >= PTHREAD_STACK_MIN);
+
+  // glibc subtracts guard page from stack size, even though pthread docs
+  // seem to imply the opposite
+  size_t guardSize;
+  if (pthread_attr_getguardsize(&attr, &guardSize) != 0) {
+    guardSize = 0;
+  }
+  assert(rawSize > guardSize);
+
+  // stack goes down, so guard page adds to the base addr
+  tls_stackLimit = uintptr_t(addr) + guardSize;
+  tls_stackSize = rawSize - guardSize;
+
+  assert((tls_stackLimit & (s_pageSize - 1)) == 0);
+}
+
+static __attribute__((noinline)) uintptr_t getStackPtr() {
+  char marker;
+  auto rv = uintptr_t(&marker);
+  return rv;
+}
+
+void MemoryIdler::unmapUnusedStack(size_t retain) {
+  if (tls_stackSize == 0) {
+    fetchStackLimits();
+  }
+  if (tls_stackSize <= std::max(size_t(1), retain)) {
+    // covers both missing stack info, and impossibly large retain
+    return;
+  }
+
+  auto sp = getStackPtr();
+  assert(sp >= tls_stackLimit);
+  assert(sp - tls_stackLimit < tls_stackSize);
+
+  auto end = (sp - retain) & ~(s_pageSize - 1);
+  if (end <= tls_stackLimit) {
+    // no pages are eligible for unmapping
+    return;
+  }
+
+  size_t len = end - tls_stackLimit;
+  assert((len & (s_pageSize - 1)) == 0);
+  if (madvise((void*)tls_stackLimit, len, MADV_DONTNEED) != 0) {
+    // It is likely that the stack vma hasn't been fully grown.  In this
+    // case madvise will apply dontneed to the present vmas, then return
+    // errno of ENOMEM.  We can also get an EAGAIN, theoretically.
+    // EINVAL means either an invalid alignment or length, or that some
+    // of the pages are locked or shared.  Neither should occur.
+    int e = errno;
+    assert(e == EAGAIN || e == ENOMEM);
+  }
+}
+
+#else
+
+void MemoryIdler::unmapUnusedStack(size_t retain) {
+}
+
+#endif
+
+}}
diff --git a/folly/detail/MemoryIdler.h b/folly/detail/MemoryIdler.h
new file mode 100644
index 00000000..5898b07f
--- /dev/null
+++ b/folly/detail/MemoryIdler.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2014 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_DETAIL_MEMORYIDLER_H
+#define FOLLY_DETAIL_MEMORYIDLER_H
+
+#include <atomic>
+#include <chrono>
+#include <folly/AtomicStruct.h>
+#include <folly/Hash.h>
+#include <folly/Traits.h>
+#include "Futex.h"
+
+namespace folly {
+
+// gcc 4.7 doesn't do std::is_trivial correctly, override so we can use
+// AtomicStruct<duration>
+template<>
+struct IsTriviallyCopyable<std::chrono::steady_clock::duration>
+  : std::true_type {};
+
+}
+
+namespace folly { namespace detail {
+
+/// MemoryIdler provides helper routines that allow a blocked thread to
+/// return some of its assigned memory resources back to the system.
+/// The intended use is that when a thread is waiting for a long time
+/// (perhaps it is in a LIFO thread pool and hasn't been needed for a
+/// long time) it should release its thread-local malloc caches (both
+/// jemalloc and tcmalloc use these for better performance) and unmap
+/// the stack pages that contain no useful data.
+struct MemoryIdler {
+
+  /// Returns memory from thread-local allocation pools to the global
+  /// pool, if we know how to do so for the current malloc implementation.
+  /// jemalloc is supported.
+  static void flushLocalMallocCaches();
+
+
+  enum {
+    /// This value is a tradeoff between reclaiming memory and triggering
+    /// a page fault immediately on wakeup.  Note that the actual unit
+    /// of idling for the stack is pages, so the actual stack that
+    /// will be available on wakeup without a page fault is between
+    /// kDefaultStackToRetain and kDefaultStackToRetain + PageSize - 1
+    /// bytes.
+    kDefaultStackToRetain = 1024,
+  };
+
+  /// Uses madvise to discard the portion of the thread's stack that
+  /// currently doesn't hold any data, trying to ensure that no page
+  /// faults will occur during the next retain bytes of stack allocation.
+  static void unmapUnusedStack(size_t retain = kDefaultStackToRetain);
+
+
+  /// The system-wide default for the amount of time a blocking
+  /// thread should wait before reclaiming idle memory.  Set this to
+  /// Duration::max() to never reclaim.  The default value is 5 seconds.
+  /// Endpoints using this idle timeout might randomly wait longer to
+  /// avoid synchronizing their flushes.
+  static AtomicStruct<std::chrono::steady_clock::duration> defaultIdleTimeout;
+
+
+  /// Equivalent to fut.futexWait(expected, waitMask), but calls
+  /// flushLocalMallocCaches() and unmapUnusedStack(stackToRetain)
+  /// after idleTimeout has passed (if it has passed).  Internally uses
+  /// fut.futexWait and fut.futexWaitUntil.  Like futexWait, returns
+  /// false if interrupted with a signal.  The actual timeout will be
+  /// pseudo-randomly chosen to be between idleTimeout and idleTimeout *
+  /// (1 + timeoutVariationFraction), to smooth out the behavior in a
+  /// system with bursty requests.  The default is to wait up to 50%
+  /// extra, so on average 25% extra.
+  template
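
For reference, here is a minimal usage sketch (not part of the patch) of the
MemoryIdler::futexWait entry point whose declaration begins above; it mirrors
the call shape that Baton::wait() uses in this diff.  The Futex named
workAvailable, the worker/producer split, and the 0/1 protocol are
illustrative assumptions rather than folly code.

#include <atomic>

#include <folly/detail/Futex.h>
#include <folly/detail/MemoryIdler.h>

using folly::detail::Futex;
using folly::detail::MemoryIdler;

// Illustrative protocol: 0 = no work queued, 1 = work available.
Futex<> workAvailable(0);

void workerWaitForWork() {
  while (workAvailable.load(std::memory_order_acquire) == 0) {
    // Behaves like workAvailable.futexWait(0), but once the blocked time
    // exceeds MemoryIdler::defaultIdleTimeout (plus jitter, so 5 to 7.5
    // seconds by default) the thread's malloc caches are flushed and its
    // unused stack pages are madvise(MADV_DONTNEED)'d before it keeps
    // waiting.
    MemoryIdler::futexWait(workAvailable, 0);
  }
}

void producerPublishWork() {
  workAvailable.store(1, std::memory_order_release);
  workAvailable.futexWake();  // wakes all blocked waiters by default
}

The thread pays for the reclaimed memory only with a tcache refill and a few
stack page faults on its next wakeup, which is the tradeoff the Summary above
describes.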