From 79c25e6f4a7742342347b4dc23d4e237eac2aa37 Mon Sep 17 00:00:00 2001
From: Nathan Bronson
Date: Wed, 15 Jan 2014 21:20:03 -0800
Subject: [PATCH] Baton - flushing of thread-local memory during a long wait

Summary:
This diff causes Baton to reduce a thread's memory footprint when it
blocks for an extended period (by default 5 to 7.5 seconds).  Reductions
are achieved by flushing the thread-local jemalloc caches (if jemalloc
is in use) and by calling madvise(MADV_DONTNEED) on the portion of the
thread's stack that isn't active.  Once the thread resumes execution,
both of these resources will be reallocated.  Configuration is via a
system-wide default.

Test Plan:
1. new unit tests
2. manual execution of existing unit tests with a very low idleTimeout
3. peek and poke with gdb to observe madvise discarding the page

Reviewed By: davejwatson@fb.com

FB internal diff: D1146966
---
 folly/Baton.h                  |  21 +++-
 folly/Makefile.am              |   2 +
 folly/detail/MemoryIdler.cpp   | 178 ++++++++++++++++++++++
 folly/detail/MemoryIdler.h     | 143 +++++++++++++++++++++
 folly/test/MemoryIdlerTest.cpp | 208 +++++++++++++++++++++++++++
 5 files changed, 549 insertions(+), 3 deletions(-)
 create mode 100644 folly/detail/MemoryIdler.cpp
 create mode 100644 folly/detail/MemoryIdler.h
 create mode 100644 folly/test/MemoryIdlerTest.cpp

diff --git a/folly/Baton.h b/folly/Baton.h
index 4d987a99..c53afa35 100644
--- a/folly/Baton.h
+++ b/folly/Baton.h
@@ -24,6 +24,7 @@
 #include
 #include
+#include <folly/detail/MemoryIdler.h>
 
 namespace folly {
 
@@ -151,11 +152,25 @@ struct Baton : boost::noncopyable {
     }
 
     while (true) {
-      state_.futexWait(WAITING);
+      detail::MemoryIdler::futexWait(state_, WAITING);
 
       // state_ is the truth even if FUTEX_WAIT reported a matching
-      // FUTEX_WAKE, since we aren't using type-stable storage and
-      // we don't guarantee reuse
+      // FUTEX_WAKE, since we aren't using type-stable storage and we
+      // don't guarantee reuse.  The scenario goes like this: thread
+      // A's last touch of a Baton is a call to wake(), which stores
+      // LATE_DELIVERY and gets an unlucky context switch before delivering
+      // the corresponding futexWake.  Thread B sees LATE_DELIVERY
+      // without consuming a futex event, because it calls futexWait
+      // with an expected value of WAITING and hence doesn't go to sleep.
+      // B returns, so the Baton's memory is reused and becomes another
+      // Baton (or a reuse of this one).  B calls futexWait on the new
+      // Baton lifetime, then A wakes up and delivers a spurious futexWake
+      // to the same memory location.  B's futexWait will then report a
+      // consumed wake event even though state_ is still WAITING.
+      //
+      // It would be possible to add an extra state_ dance to communicate
+      // that the futexWake has been sent so that we can be sure to consume
+      // it before returning, but that would be a perf and complexity hit.
       uint32_t s = state_.load(std::memory_order_acquire);
       assert(s == WAITING || s == LATE_DELIVERY);
 
diff --git a/folly/Makefile.am b/folly/Makefile.am
index 4aed816a..e5fe6355 100644
--- a/folly/Makefile.am
+++ b/folly/Makefile.am
@@ -46,6 +46,7 @@ nobase_follyinclude_HEADERS = \
 	detail/Futex.h \
 	detail/GroupVarintDetail.h \
 	detail/Malloc.h \
+	detail/MemoryIdler.h \
 	detail/MPMCPipelineDetail.h \
 	detail/SlowFingerprint.h \
 	detail/Stats.h \
@@ -155,6 +156,7 @@ libfolly_la_SOURCES = \
 	io/IOBufQueue.cpp \
 	io/RecordIO.cpp \
 	json.cpp \
+	detail/MemoryIdler.cpp \
 	MemoryMapping.cpp \
 	Random.cpp \
 	Range.cpp \
diff --git a/folly/detail/MemoryIdler.cpp b/folly/detail/MemoryIdler.cpp
new file mode 100644
index 00000000..711bd90f
--- /dev/null
+++ b/folly/detail/MemoryIdler.cpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2014 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "MemoryIdler.h"
+#include <folly/Logging.h>
+#include <folly/Malloc.h>
+#include <folly/ScopeGuard.h>
+#include <folly/detail/CacheLocality.h>
+#include <algorithm>
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+
+// weak linking means the symbol will be null if not available, instead
+// of a link failure
+extern "C" int mallctl(const char *name, void *oldp, size_t *oldlenp,
+                       void *newp, size_t newlen)
+    __attribute__((weak));
+
+
+namespace folly { namespace detail {
+
+AtomicStruct<std::chrono::steady_clock::duration>
+MemoryIdler::defaultIdleTimeout(std::chrono::seconds(5));
+
+
+/// Calls mallctl, optionally reading and/or writing an unsigned value
+/// if in and/or out is non-null.  Logs on error.
+static unsigned mallctlWrapper(const char* cmd, const unsigned* in,
+                               unsigned* out) {
+  size_t outLen = sizeof(unsigned);
+  int err = mallctl(cmd,
+                    out, out ? &outLen : nullptr,
+                    const_cast<unsigned*>(in), in ? sizeof(unsigned) : 0);
+  if (err != 0) {
+    FB_LOG_EVERY_MS(WARNING, 10000)
+      << "mallctl " << cmd << ": " << strerror(err) << " (" << err << ")";
+  }
+  return err;
+}
+
+void MemoryIdler::flushLocalMallocCaches() {
+  if (usingJEMalloc()) {
+    if (!mallctl) {
+      FB_LOG_EVERY_MS(ERROR, 10000) << "mallctl weak link failed";
+      return;
+    }
+
+    // "tcache.flush" was renamed to "thread.tcache.flush" in jemalloc 3
+    (void)mallctlWrapper("thread.tcache.flush", nullptr, nullptr);
+
+    // By default jemalloc has 4 arenas per cpu, and then assigns each
+    // thread to one of those arenas.  This means that in any service
+    // that doesn't perform a lot of context switching, the chances that
+    // another thread will be using the current thread's arena (and hence
+    // doing the appropriate dirty-page purging) are low.  Some well-tuned
+    // configurations (such as that used by hhvm) use fewer arenas and
+    // then pin threads to avoid contended access.  In that case, purging
+    // the arenas is counter-productive.
+    // We use the heuristic that if narenas <= 2 * num_cpus then we
+    // shouldn't do anything here, which detects when the narenas has
+    // been reduced from the default
+    unsigned narenas;
+    unsigned arenaForCurrent;
+    if (mallctlWrapper("arenas.narenas", nullptr, &narenas) == 0 &&
+        narenas > 2 * CacheLocality::system().numCpus &&
+        mallctlWrapper("thread.arena", nullptr, &arenaForCurrent) == 0) {
+      (void)mallctlWrapper("arenas.purge", &arenaForCurrent, nullptr);
+    }
+  }
+}
+
+
+#ifdef __x86_64__
+
+static const size_t s_pageSize = sysconf(_SC_PAGESIZE);
+static __thread uintptr_t tls_stackLimit;
+static __thread size_t tls_stackSize;
+
+static void fetchStackLimits() {
+  pthread_attr_t attr;
+#if defined(_GNU_SOURCE) && defined(__linux__) // Linux+GNU extension
+  pthread_getattr_np(pthread_self(), &attr);
+#else
+  pthread_attr_init(&attr);
+#endif
+  SCOPE_EXIT { pthread_attr_destroy(&attr); };
+
+  void* addr;
+  size_t rawSize;
+  int err;
+  if ((err = pthread_attr_getstack(&attr, &addr, &rawSize))) {
+    // unexpected, but it is better to continue in prod than do nothing
+    FB_LOG_EVERY_MS(ERROR, 10000) << "pthread_attr_getstack error " << err;
+    assert(false);
+    tls_stackSize = 1;
+    return;
+  }
+  assert(addr != nullptr);
+  assert(rawSize >= PTHREAD_STACK_MIN);
+
+  // glibc subtracts guard page from stack size, even though pthread docs
+  // seem to imply the opposite
+  size_t guardSize;
+  if (pthread_attr_getguardsize(&attr, &guardSize) != 0) {
+    guardSize = 0;
+  }
+  assert(rawSize > guardSize);
+
+  // stack goes down, so guard page adds to the base addr
+  tls_stackLimit = uintptr_t(addr) + guardSize;
+  tls_stackSize = rawSize - guardSize;
+
+  assert((tls_stackLimit & (s_pageSize - 1)) == 0);
+}
+
+static __attribute__((noinline)) uintptr_t getStackPtr() {
+  char marker;
+  auto rv = uintptr_t(&marker);
+  return rv;
+}
+
+void MemoryIdler::unmapUnusedStack(size_t retain) {
+  if (tls_stackSize == 0) {
+    fetchStackLimits();
+  }
+  if (tls_stackSize <= std::max(size_t(1), retain)) {
+    // covers both missing stack info, and impossibly large retain
+    return;
+  }
+
+  auto sp = getStackPtr();
+  assert(sp >= tls_stackLimit);
+  assert(sp - tls_stackLimit < tls_stackSize);
+
+  auto end = (sp - retain) & ~(s_pageSize - 1);
+  if (end <= tls_stackLimit) {
+    // no pages are eligible for unmapping
+    return;
+  }
+
+  size_t len = end - tls_stackLimit;
+  assert((len & (s_pageSize - 1)) == 0);
+  if (madvise((void*)tls_stackLimit, len, MADV_DONTNEED) != 0) {
+    // It is likely that the stack vma hasn't been fully grown.  In this
+    // case madvise will apply dontneed to the present vmas, then return
+    // errno of ENOMEM.  We can also get an EAGAIN, theoretically.
+    // EINVAL means either an invalid alignment or length, or that some
+    // of the pages are locked or shared.  Neither should occur.
+    int e = errno;
+    assert(e == EAGAIN || e == ENOMEM);
+  }
+}
+
+#else
+
+void MemoryIdler::unmapUnusedStack(size_t retain) {
+}
+
+#endif
+
+}}
diff --git a/folly/detail/MemoryIdler.h b/folly/detail/MemoryIdler.h
new file mode 100644
index 00000000..5898b07f
--- /dev/null
+++ b/folly/detail/MemoryIdler.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2014 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FOLLY_DETAIL_MEMORYIDLER_H
+#define FOLLY_DETAIL_MEMORYIDLER_H
+
+#include <atomic>
+#include <chrono>
+#include <folly/AtomicStruct.h>
+#include <folly/Hash.h>
+#include <folly/Traits.h>
+#include "Futex.h"
+
+namespace folly {
+
+// gcc 4.7 doesn't do std::is_trivial correctly, override so we can use
+// AtomicStruct<duration>
+template<>
+struct IsTriviallyCopyable<std::chrono::steady_clock::duration>
+  : std::true_type {};
+
+}
+
+namespace folly { namespace detail {
+
+/// MemoryIdler provides helper routines that allow a blocked thread to
+/// return some of its assigned memory resources back to the system.
+/// The intended use is that when a thread is waiting for a long time
+/// (perhaps it is in a LIFO thread pool and hasn't been needed for a
+/// long time) it should release its thread-local malloc caches (both
+/// jemalloc and tcmalloc use these for better performance) and unmap
+/// the stack pages that contain no useful data.
+struct MemoryIdler {
+
+  /// Returns memory from thread-local allocation pools to the global
+  /// pool, if we know how to do so for the current malloc implementation.
+  /// jemalloc is supported.
+  static void flushLocalMallocCaches();
+
+
+  enum {
+    /// This value is a tradeoff between reclaiming memory and triggering
+    /// a page fault immediately on wakeup.  Note that the actual unit
+    /// of idling for the stack is pages, so the actual stack that
+    /// will be available on wakeup without a page fault is between
+    /// kDefaultStackToRetain and kDefaultStackToRetain + PageSize - 1
+    /// bytes.
+    kDefaultStackToRetain = 1024,
+  };
+
+  /// Uses madvise to discard the portion of the thread's stack that
+  /// currently doesn't hold any data, trying to ensure that no page
+  /// faults will occur during the next retain bytes of stack allocation.
+  static void unmapUnusedStack(size_t retain = kDefaultStackToRetain);
+
+
+  /// The system-wide default for the amount of time a blocking
+  /// thread should wait before reclaiming idle memory.  Set this to
+  /// Duration::max() to never reclaim.  The default value is 5 seconds.
+  /// Endpoints using this idle timeout might randomly wait longer to
+  /// avoid synchronizing their flushes.
+  static AtomicStruct<std::chrono::steady_clock::duration> defaultIdleTimeout;
+
+
+  /// Equivalent to fut.futexWait(expected, waitMask), but calls
+  /// flushLocalMallocCaches() and unmapUnusedStack(stackToRetain)
+  /// after idleTimeout has passed (if it has passed).  Internally uses
+  /// fut.futexWait and fut.futexWaitUntil.  Like futexWait, returns
+  /// false if interrupted with a signal.  The actual timeout will be
+  /// pseudo-randomly chosen to be between idleTimeout and idleTimeout *
+  /// (1 + timeoutVariationFraction), to smooth out the behavior in a
+  /// system with bursty requests.  The default is to wait up to 50%
+  /// extra, so on average 25% extra.
+  template
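
For reference, here is a minimal usage sketch (not part of the patch) of the
MemoryIdler::futexWait entry point whose declaration begins above; it mirrors
the call shape that Baton::wait() uses in this diff.  The Futex named
workAvailable, the worker/producer split, and the 0/1 protocol are
illustrative assumptions rather than folly code.

#include <atomic>

#include <folly/detail/Futex.h>
#include <folly/detail/MemoryIdler.h>

using folly::detail::Futex;
using folly::detail::MemoryIdler;

// Illustrative protocol: 0 = no work queued, 1 = work available.
Futex<> workAvailable(0);

void workerWaitForWork() {
  while (workAvailable.load(std::memory_order_acquire) == 0) {
    // Behaves like workAvailable.futexWait(0), but once the blocked time
    // exceeds MemoryIdler::defaultIdleTimeout (plus jitter, so 5 to 7.5
    // seconds by default) the thread's malloc caches are flushed and its
    // unused stack pages are madvise(MADV_DONTNEED)'d before it keeps
    // waiting.
    MemoryIdler::futexWait(workAvailable, 0);
  }
}

void producerPublishWork() {
  workAvailable.store(1, std::memory_order_release);
  workAvailable.futexWake();  // wakes all blocked waiters by default
}

The thread pays for the reclaimed memory only with a tcache refill and a few
stack page faults on its next wakeup, which is the tradeoff the Summary above
describes.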