#include <assert.h>
#include <folly/detail/Futex.h>
+#include <folly/detail/MemoryIdler.h>
namespace folly {
}
while (true) {
- state_.futexWait(WAITING);
+ detail::MemoryIdler::futexWait(state_, WAITING);
// state_ is the truth even if FUTEX_WAIT reported a matching
- // FUTEX_WAKE, since we aren't using type-stable storage and
- // we don't guarantee reuse
+ // FUTEX_WAKE, since we aren't using type-stable storage and we
+ // don't guarantee reuse. The scenario goes like this: thread
+ // A's last touch of a Baton is a call to wake(), which stores
+ // LATE_DELIVERY and gets an unlucky context switch before delivering
+ // the corresponding futexWake. Thread B sees LATE_DELIVERY
+ // without consuming a futex event, because it calls futexWait
+ // with an expected value of WAITING and hence doesn't go to sleep.
+ // B returns, so the Baton's memory is reused and becomes another
+ // Baton (or a reuse of this one). B calls futexWait on the new
+ // Baton lifetime, then A wakes up and delivers a spurious futexWake
+ // to the same memory location. B's futexWait will then report a
+ // consumed wake event even though state_ is still WAITING.
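+ //
+ // As a timeline (A = waker, B = waiter):
+ //   A: stores LATE_DELIVERY, is descheduled before its futexWake
+ //   B: futexWait(WAITING) returns at once (value mismatch)
+ //   B: sees LATE_DELIVERY, returns; memory is reused as a new Baton
+ //   B: calls futexWait on the new Baton
+ //   A: runs again, futexWake on the old address spuriously wakes B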
+ //
+ // It would be possible to add an extra state_ dance to communicate
+ // that the futexWake has been sent so that we can be sure to consume
+ // it before returning, but that would be a perf and complexity hit.
uint32_t s = state_.load(std::memory_order_acquire);
assert(s == WAITING || s == LATE_DELIVERY);
detail/Futex.h \
detail/GroupVarintDetail.h \
detail/Malloc.h \
+ detail/MemoryIdler.h \
detail/MPMCPipelineDetail.h \
detail/SlowFingerprint.h \
detail/Stats.h \
+ detail/MemoryIdler.cpp \
io/IOBufQueue.cpp \
io/RecordIO.cpp \
json.cpp \
MemoryMapping.cpp \
Random.cpp \
Range.cpp \
--- /dev/null
+++ b/folly/detail/MemoryIdler.cpp
+/*
+ * Copyright 2014 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "MemoryIdler.h"
+#include <folly/Logging.h>
+#include <folly/Malloc.h>
+#include <folly/ScopeGuard.h>
+#include <folly/detail/CacheLocality.h>
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <algorithm>
+#include <utility>
+
+
+// With weak linkage, mallctl resolves to null when jemalloc isn't
+// linked in, instead of causing a link failure
+extern "C" int mallctl(const char *name, void *oldp, size_t *oldlenp,
+ void *newp, size_t newlen)
+ __attribute__((weak));
+
+
+namespace folly { namespace detail {
+
+AtomicStruct<std::chrono::steady_clock::duration>
+MemoryIdler::defaultIdleTimeout(std::chrono::seconds(5));
+
+
+/// Calls mallctl, optionally reading an unsigned value if out is
+/// non-null and/or writing one if in is non-null. Logs on error.
+static int mallctlWrapper(const char* cmd, const unsigned* in,
+                          unsigned* out) {
+ size_t outLen = sizeof(unsigned);
+ int err = mallctl(cmd,
+ out, out ? &outLen : nullptr,
+ const_cast<unsigned*>(in), in ? sizeof(unsigned) : 0);
+ if (err != 0) {
+ FB_LOG_EVERY_MS(WARNING, 10000)
+ << "mallctl " << cmd << ": " << strerror(err) << " (" << err << ")";
+ }
+ return err;
+}
+
+void MemoryIdler::flushLocalMallocCaches() {
+ if (usingJEMalloc()) {
+ if (!mallctl) {
+ FB_LOG_EVERY_MS(ERROR, 10000) << "mallctl weak link failed";
+ return;
+ }
+
+ // "tcache.flush" was renamed to "thread.tcache.flush" in jemalloc 3
+ (void)mallctlWrapper("thread.tcache.flush", nullptr, nullptr);
+
+ // By default jemalloc has 4 arenas per cpu, and then assigns each
+ // thread to one of those arenas. This means that in any service
+ // that doesn't perform a lot of context switching, the chances that
+ // another thread will be using the current thread's arena (and hence
+ // doing the appropriate dirty-page purging) are low. Some good
+ // tuned configurations (such as that used by hhvm) use fewer arenas
+ // and then pin threads to avoid contended access. In that case,
+ // purging the arenas is counter-productive. We use the heuristic
+ // that if narenas <= 2 * num_cpus then we shouldn't do anything here,
+    // which detects when narenas has been reduced from the default.
+ unsigned narenas;
+ unsigned arenaForCurrent;
+ if (mallctlWrapper("arenas.narenas", nullptr, &narenas) == 0 &&
+ narenas > 2 * CacheLocality::system().numCpus &&
+ mallctlWrapper("thread.arena", nullptr, &arenaForCurrent) == 0) {
+ (void)mallctlWrapper("arenas.purge", &arenaForCurrent, nullptr);
+ }
+ }
+}
+
+
+#ifdef __x86_64__
+
+static const size_t s_pageSize = sysconf(_SC_PAGESIZE);
+static __thread uintptr_t tls_stackLimit;
+static __thread size_t tls_stackSize;
+
+static void fetchStackLimits() {
+ pthread_attr_t attr;
+#if defined(_GNU_SOURCE) && defined(__linux__) // Linux+GNU extension
+ pthread_getattr_np(pthread_self(), &attr);
+#else
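+  // Fallback: a default-initialized attr does not describe this
+  // thread's actual stack, so pthread_attr_getstack below will likely
+  // return nulls and we will bail out through the error path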
+ pthread_attr_init(&attr);
+#endif
+ SCOPE_EXIT { pthread_attr_destroy(&attr); };
+
+ void* addr;
+ size_t rawSize;
+ int err;
+  if ((err = pthread_attr_getstack(&attr, &addr, &rawSize))) {
+    // unexpected; assert in debug builds, but in production it is
+    // better to continue without stack idling than to crash
+    FB_LOG_EVERY_MS(ERROR, 10000) << "pthread_attr_getstack error " << err;
+    assert(false);
+    tls_stackSize = 1; // sentinel: limits fetched, but unusable
+    return;
+ }
+ assert(addr != nullptr);
+ assert(rawSize >= PTHREAD_STACK_MIN);
+
+ // glibc subtracts guard page from stack size, even though pthread docs
+ // seem to imply the opposite
+ size_t guardSize;
+ if (pthread_attr_getguardsize(&attr, &guardSize) != 0) {
+ guardSize = 0;
+ }
+ assert(rawSize > guardSize);
+
+ // stack goes down, so guard page adds to the base addr
+ tls_stackLimit = uintptr_t(addr) + guardSize;
+ tls_stackSize = rawSize - guardSize;
+
+ assert((tls_stackLimit & (s_pageSize - 1)) == 0);
+}
+
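+// Approximates the current stack pointer by taking the address of a
+// local variable; noinline so the frame isn't folded into the caller's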
+static __attribute__((noinline)) uintptr_t getStackPtr() {
+ char marker;
+ auto rv = uintptr_t(&marker);
+ return rv;
+}
+
+void MemoryIdler::unmapUnusedStack(size_t retain) {
+ if (tls_stackSize == 0) {
+ fetchStackLimits();
+ }
+ if (tls_stackSize <= std::max(size_t(1), retain)) {
+ // covers both missing stack info, and impossibly large retain
+ return;
+ }
+
+ auto sp = getStackPtr();
+ assert(sp >= tls_stackLimit);
+ assert(sp - tls_stackLimit < tls_stackSize);
+
+ auto end = (sp - retain) & ~(s_pageSize - 1);
+ if (end <= tls_stackLimit) {
+ // no pages are eligible for unmapping
+ return;
+ }
+
+ size_t len = end - tls_stackLimit;
+ assert((len & (s_pageSize - 1)) == 0);
+ if (madvise((void*)tls_stackLimit, len, MADV_DONTNEED) != 0) {
+ // It is likely that the stack vma hasn't been fully grown. In this
+ // case madvise will apply dontneed to the present vmas, then return
+ // errno of ENOMEM. We can also get an EAGAIN, theoretically.
+ // EINVAL means either an invalid alignment or length, or that some
+ // of the pages are locked or shared. Neither should occur.
+ int e = errno;
+ assert(e == EAGAIN || e == ENOMEM);
+ }
+}
+
+#else
+
+void MemoryIdler::unmapUnusedStack(size_t /* retain */) {
+}
+
+#endif
+
+}} // namespace folly::detail
--- /dev/null
+++ b/folly/detail/MemoryIdler.h
+/*
+ * Copyright 2014 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_DETAIL_MEMORYIDLER_H
+#define FOLLY_DETAIL_MEMORYIDLER_H
+
+#include <atomic>
+#include <chrono>
+#include <limits>
+#include <pthread.h>
+#include <folly/AtomicStruct.h>
+#include <folly/Hash.h>
+#include <folly/Traits.h>
+#include "Futex.h"
+
+namespace folly {
+
+// gcc 4.7 doesn't do std::is_trivial correctly, override so we can use
+// AtomicStruct<duration>
+template<>
+struct IsTriviallyCopyable<std::chrono::steady_clock::duration>
+ : std::true_type {};
+
+}
+
+namespace folly { namespace detail {
+
+/// MemoryIdler provides helper routines that let idle threads return
+/// some assigned memory resources back to the system. The intended
+/// use is that when a thread has been waiting for a long time (perhaps
+/// it is in a LIFO thread pool and hasn't been needed recently)
+/// it should release its thread-local malloc caches (both jemalloc and
+/// tcmalloc use these for better performance) and unmap the stack pages
+/// that contain no useful data.
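+///
+/// A sketch of the intended call site (hypothetical; state_ and WAITING
+/// belong to a Baton-like waiter, not to this header):
+///
+///   folly::detail::Futex<> state_;
+///   ...
+///   // behaves like state_.futexWait(WAITING), but also flushes malloc
+///   // caches and madvises away idle stack pages once the idle
+///   // timeout expires
+///   folly::detail::MemoryIdler::futexWait(state_, WAITING);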
+struct MemoryIdler {
+
+ /// Returns memory from thread-local allocation pools to the global
+ /// pool, if we know how to for the current malloc implementation.
+ /// jemalloc is supported.
+ static void flushLocalMallocCaches();
+
+
+ enum {
+ /// This value is a tradeoff between reclaiming memory and triggering
+ /// a page fault immediately on wakeup. Note that the actual unit
+ /// of idling for the stack is pages, so the actual stack that
+ /// will be available on wakeup without a page fault is between
+ /// kDefaultStackToRetain and kDefaultStackToRetain + PageSize -
+ /// 1 bytes.
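+    /// For example, with 4 KiB pages the stack retained across a long
+    /// wait is between 1024 and 5119 bytes.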
+ kDefaultStackToRetain = 1024,
+ };
+
+ /// Uses madvise to discard the portion of the thread's stack that
+ /// currently doesn't hold any data, trying to ensure that no page
+  /// faults will occur during the next retain bytes of stack growth.
+ static void unmapUnusedStack(size_t retain = kDefaultStackToRetain);
+
+
+ /// The system-wide default for the amount of time a blocking
+ /// thread should wait before reclaiming idle memory. Set this to
+ /// Duration::max() to never wait. The default value is 5 seconds.
+ /// Endpoints using this idle timeout might randomly wait longer to
+ /// avoid synchronizing their flushes.
+ static AtomicStruct<std::chrono::steady_clock::duration> defaultIdleTimeout;
+
+
+ /// Equivalent to fut.futexWait(expected, waitMask), but calls
+ /// flushLocalMallocCaches() and unmapUnusedStack(stackToRetain)
+ /// after idleTimeout has passed (if it has passed). Internally uses
+ /// fut.futexWait and fut.futexWaitUntil. Like futexWait, returns
+ /// false if interrupted with a signal. The actual timeout will be
+  /// pseudo-randomly chosen to be between idleTimeout and idleTimeout *
+  /// (1 + timeoutVariationFrac), to smooth out the behavior in a
+  /// system with bursty requests. The default is to wait up to 50%
+  /// extra, so 25% extra on average.
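+  ///
+  /// For example, with the 5 second default idleTimeout and the default
+  /// timeoutVariationFrac of 0.5, the actual timeout is spread roughly
+  /// uniformly over [5 seconds, 7.5 seconds].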
+ template <template <typename> class Atom,
+ typename Clock = std::chrono::steady_clock>
+ static bool futexWait(
+ Futex<Atom>& fut,
+ uint32_t expected,
+ uint32_t waitMask = -1,
+ typename Clock::duration idleTimeout
+ = defaultIdleTimeout.load(std::memory_order_acquire),
+ size_t stackToRetain = kDefaultStackToRetain,
+ float timeoutVariationFrac = 0.5) {
+
+ if (idleTimeout == Clock::duration::max()) {
+ // no need to use futexWaitUntil if no timeout is possible
+ return fut.futexWait(expected, waitMask);
+ }
+
+ if (idleTimeout.count() > 0) {
+ auto begin = Clock::now();
+
+ if (timeoutVariationFrac > 0) {
+ // hash the pthread_t and the time to get the adjustment.
+ // Standard hash func isn't very good, so bit mix the result
+ auto pr = std::make_pair(pthread_self(),
+ begin.time_since_epoch().count());
+ std::hash<decltype(pr)> hash_fn;
+ uint64_t h = folly::hash::twang_mix64(hash_fn(pr));
+
+        // chrono durations can't be scaled by a float directly, so do
+        // the arithmetic on the raw tick count
+ auto extraFrac =
+ timeoutVariationFrac / std::numeric_limits<uint64_t>::max() * h;
+ uint64_t tics = idleTimeout.count() * (1 + extraFrac);
+ idleTimeout = typename Clock::duration(tics);
+ }
+
+ while (true) {
+ auto rv = fut.futexWaitUntil(expected, begin + idleTimeout, waitMask);
+ if (rv == FutexResult::TIMEDOUT) {
+ // timeout is over
+ break;
+ }
+ // finished before timeout hit, no flush
+ assert(rv == FutexResult::VALUE_CHANGED || rv == FutexResult::AWOKEN ||
+ rv == FutexResult::INTERRUPTED);
+ return rv == FutexResult::AWOKEN;
+ }
+ }
+
+ // flush, then wait with no timeout
+ flushLocalMallocCaches();
+ unmapUnusedStack(stackToRetain);
+ return fut.futexWait(expected, waitMask);
+ }
+};
+
+}} // namespace folly::detail
+
+#endif
--- /dev/null
+++ b/folly/test/MemoryIdlerTest.cpp
+/*
+ * Copyright 2014 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/detail/MemoryIdler.h>
+#include <folly/Baton.h>
+#include <memory>
+#include <thread>
+#include <assert.h>
+#include <semaphore.h>
+#include <gflags/gflags.h>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <folly/Benchmark.h>
+
+using namespace folly;
+using namespace folly::detail;
+using namespace testing;
+
+TEST(MemoryIdler, releaseStack) {
+ MemoryIdler::unmapUnusedStack();
+}
+
+TEST(MemoryIdler, releaseStackMinExtra) {
+ MemoryIdler::unmapUnusedStack(0);
+}
+
+TEST(MemoryIdler, releaseStackLargeExtra) {
+ MemoryIdler::unmapUnusedStack(30000000);
+}
+
+TEST(MemoryIdler, releaseMallocTLS) {
+ auto p = new int[4];
+ MemoryIdler::flushLocalMallocCaches();
+ delete[] p;
+ MemoryIdler::flushLocalMallocCaches();
+ p = new int[4];
+ MemoryIdler::flushLocalMallocCaches();
+ delete[] p;
+}
+
+
+/// MockAtom gives us a way to select a mocked Futex implementation
+/// inside Baton, even though the atom itself isn't exercised by the
+/// mocked futex
+template <typename T>
+struct MockAtom : public std::atomic<T> {
+ explicit MockAtom(T init = 0) : std::atomic<T>(init) {}
+};
+
+
+/// MockClock is a bit tricky because we are mocking a static function
+/// (now()), so we need to find the corresponding mock instance without
+/// extending its scope beyond that of the test. I generally avoid
+/// shared_ptr, but a weak_ptr is just the ticket here
+struct MockClock {
+ typedef std::chrono::steady_clock::duration duration;
+ typedef std::chrono::steady_clock::time_point time_point;
+
+ MOCK_METHOD0(nowImpl, time_point(void));
+
+ /// Hold on to the returned shared_ptr until the end of the test
+ static std::shared_ptr<StrictMock<MockClock>> setup() {
+ auto rv = std::make_shared<StrictMock<MockClock>>();
+ s_mockClockInstance = rv;
+ return rv;
+ }
+
+ static time_point now() {
+ return s_mockClockInstance.lock()->nowImpl();
+ }
+
+ static std::weak_ptr<StrictMock<MockClock>> s_mockClockInstance;
+};
+
+std::weak_ptr<StrictMock<MockClock>> MockClock::s_mockClockInstance;
+
+
+
+namespace folly { namespace detail {
+
+/// Futex<MockAtom> is our mocked futex implementation. Note that the
+/// method signatures differ from the real Futex because we have elided
+/// unused default params and collapsed templated methods into the
+/// used type
+template<>
+struct Futex<MockAtom> {
+ MOCK_METHOD2(futexWait, bool(uint32_t, uint32_t));
+ MOCK_METHOD3(futexWaitUntil,
+ FutexResult(uint32_t, const MockClock::time_point&, uint32_t));
+};
+
+}} // namespace folly::detail
+
+TEST(MemoryIdler, futexWaitValueChangedEarly) {
+ StrictMock<Futex<MockAtom>> fut;
+ auto clock = MockClock::setup();
+ auto begin = MockClock::time_point(std::chrono::seconds(100));
+ auto idleTimeout = MemoryIdler::defaultIdleTimeout.load();
+
+ EXPECT_CALL(*clock, nowImpl())
+ .WillOnce(Return(begin));
+ EXPECT_CALL(fut, futexWaitUntil(1, AllOf(Ge(begin + idleTimeout),
+ Lt(begin + 2 * idleTimeout)), -1))
+ .WillOnce(Return(FutexResult::VALUE_CHANGED));
+ EXPECT_FALSE((MemoryIdler::futexWait<MockAtom, MockClock>(fut, 1)));
+}
+
+TEST(MemoryIdler, futexWaitValueChangedLate) {
+ StrictMock<Futex<MockAtom>> fut;
+ auto clock = MockClock::setup();
+ auto begin = MockClock::time_point(std::chrono::seconds(100));
+ auto idleTimeout = MemoryIdler::defaultIdleTimeout.load();
+
+ EXPECT_CALL(*clock, nowImpl())
+ .WillOnce(Return(begin));
+ EXPECT_CALL(fut, futexWaitUntil(1, AllOf(Ge(begin + idleTimeout),
+ Lt(begin + 2 * idleTimeout)), -1))
+ .WillOnce(Return(FutexResult::TIMEDOUT));
+ EXPECT_CALL(fut, futexWait(1, -1))
+ .WillOnce(Return(false));
+ EXPECT_FALSE((MemoryIdler::futexWait<MockAtom, MockClock>(fut, 1)));
+}
+
+TEST(MemoryIdler, futexWaitAwokenEarly) {
+ StrictMock<Futex<MockAtom>> fut;
+ auto clock = MockClock::setup();
+ auto begin = MockClock::time_point(std::chrono::seconds(100));
+ auto idleTimeout = MemoryIdler::defaultIdleTimeout.load();
+
+ EXPECT_CALL(*clock, nowImpl())
+ .WillOnce(Return(begin));
+ EXPECT_CALL(fut, futexWaitUntil(1, Ge(begin + idleTimeout), -1))
+ .WillOnce(Return(FutexResult::AWOKEN));
+ EXPECT_TRUE((MemoryIdler::futexWait<MockAtom, MockClock>(fut, 1)));
+}
+
+TEST(MemoryIdler, futexWaitAwokenLate) {
+ StrictMock<Futex<MockAtom>> fut;
+ auto clock = MockClock::setup();
+ auto begin = MockClock::time_point(std::chrono::seconds(100));
+ auto idleTimeout = MemoryIdler::defaultIdleTimeout.load();
+
+ EXPECT_CALL(*clock, nowImpl())
+ .WillOnce(Return(begin));
+ EXPECT_CALL(fut, futexWaitUntil(1, begin + idleTimeout, -1))
+ .WillOnce(Return(FutexResult::TIMEDOUT));
+ EXPECT_CALL(fut, futexWait(1, -1))
+ .WillOnce(Return(true));
+ EXPECT_TRUE((MemoryIdler::futexWait<MockAtom, MockClock>(
+ fut, 1, -1, idleTimeout, 100, 0.0f)));
+}
+
+TEST(MemoryIdler, futexWaitImmediateFlush) {
+ StrictMock<Futex<MockAtom>> fut;
+ auto clock = MockClock::setup();
+
+ EXPECT_CALL(fut, futexWait(2, 0xff))
+ .WillOnce(Return(true));
+ EXPECT_TRUE((MemoryIdler::futexWait<MockAtom, MockClock>(
+ fut, 2, 0xff, std::chrono::seconds(0))));
+}
+
+TEST(MemoryIdler, futexWaitNeverFlush) {
+ StrictMock<Futex<MockAtom>> fut;
+ auto clock = MockClock::setup();
+
+ EXPECT_CALL(fut, futexWait(1, -1))
+ .WillOnce(Return(true));
+ EXPECT_TRUE((MemoryIdler::futexWait<MockAtom, MockClock>(
+ fut, 1, -1, MockClock::duration::max())));
+}
+
+
+BENCHMARK(releaseStack, iters) {
+ for (size_t i = 0; i < iters; ++i) {
+ MemoryIdler::unmapUnusedStack();
+ }
+}
+
+BENCHMARK(releaseMallocTLS, iters) {
+ for (size_t i = 0; i < iters; ++i) {
+ MemoryIdler::flushLocalMallocCaches();
+ }
+}
+
+int main(int argc, char** argv) {
+ testing::InitGoogleTest(&argc, argv);
+ google::ParseCommandLineFlags(&argc, &argv, true);
+
+ auto rv = RUN_ALL_TESTS();
+ if (!rv && FLAGS_benchmark) {
+ folly::runBenchmarks();
+ }
+ return rv;
+}