From 762965ce25c73623845d3075111b565421dcc608 Mon Sep 17 00:00:00 2001
From: Tudor Bosman
Date: Thu, 26 Sep 2013 17:19:24 -0700
Subject: [PATCH] Fix bad bug in folly::ThreadLocal

Summary: There were a few bugs, actually:

1. prevSize < jemallocMinInPlaceExpandable compared an element count with a
   byte size; this hid the next bug for a while (as we needed 4096
   ThreadLocalPtr objects in order to trigger it)

2. if rallocm(... ALLOCM_NO_MOVE) succeeds in expanding in place, we don't
   increment elementsCapacity_, which is bad.

Switched to allocm() so we always take advantage of all memory that was
actually allocated.

@override-unit-failures
Clearly unrelated
+Warning: This development build of composer is over 30 days old. It is
recommended to update it by running "hphp/test/slow/ext_phar/composer.php
self-update" to get the latest version.

Test Plan: test added, which failed before and doesn't any more

Reviewed By: lucian@fb.com

FB internal diff: D987009
---
 folly/ThreadCachedInt.h              |   3 +
 folly/ThreadLocal.h                  |   2 +-
 folly/detail/ThreadLocalDetail.h     | 125 +++++++++++++++++----------
 folly/test/AtomicHashArrayTest.cpp   |   2 +
 folly/test/ThreadCachedArenaTest.cpp |   1 +
 folly/test/ThreadLocalTest.cpp       |  91 +++++++++++++++++--
 6 files changed, 172 insertions(+), 52 deletions(-)

diff --git a/folly/ThreadCachedInt.h b/folly/ThreadCachedInt.h
index 69c03d1f..b9d30429 100644
--- a/folly/ThreadCachedInt.h
+++ b/folly/ThreadCachedInt.h
@@ -24,6 +24,9 @@
 #define FOLLY_THREADCACHEDINT_H
 
 #include
+
+#include
+
 #include "folly/Likely.h"
 #include "folly/ThreadLocal.h"
diff --git a/folly/ThreadLocal.h b/folly/ThreadLocal.h
index b4c031e7..e2ef58b0 100644
--- a/folly/ThreadLocal.h
+++ b/folly/ThreadLocal.h
@@ -196,7 +196,7 @@ class ThreadLocalPtr {
   friend class ThreadLocalPtr;
 
   threadlocal_detail::StaticMeta<Tag>& meta_;
-  boost::mutex* lock_;
+  std::mutex* lock_;
   int id_;
 
  public:
diff --git a/folly/detail/ThreadLocalDetail.h b/folly/detail/ThreadLocalDetail.h
index b9624389..20a74fa5 100644
--- a/folly/detail/ThreadLocalDetail.h
+++ b/folly/detail/ThreadLocalDetail.h
@@ -19,20 +19,22 @@
 #include
 #include
-#include
+
+#include
 #include
 #include
-#include
-#include
-#include
-
 #include
 
-#include "folly/Exception.h"
 #include "folly/Foreach.h"
+#include "folly/Exception.h"
 #include "folly/Malloc.h"
 
+// TODO(tudorb): Remove this declaration after Malloc.h is pushed to
+// third-party.
+extern "C" int allocm(void**, size_t*, size_t, int)
+__attribute__((weak));
+
 namespace folly {
 namespace threadlocal_detail {
@@ -154,7 +156,7 @@ struct StaticMeta {
   int nextId_;
   std::vector<int> freeIds_;
-  boost::mutex lock_;
+  std::mutex lock_;
   pthread_key_t pthreadKey_;
   ThreadEntry head_;
@@ -208,7 +210,7 @@ struct StaticMeta {
     // We wouldn't call pthread_setspecific unless we actually called get()
     DCHECK_NE(threadEntry_.elementsCapacity, 0);
     {
-      boost::lock_guard<boost::mutex> g(meta.lock_);
+      std::lock_guard<std::mutex> g(meta.lock_);
       meta.erase(&threadEntry_);
       // No need to hold the lock any longer; threadEntry_ is private to this
       // thread now that it's been removed from meta.
@@ -224,7 +226,7 @@ struct StaticMeta {
   static int create() {
     int id;
     auto & meta = instance();
-    boost::lock_guard<boost::mutex> g(meta.lock_);
+    std::lock_guard<std::mutex> g(meta.lock_);
     if (!meta.freeIds_.empty()) {
       id = meta.freeIds_.back();
       meta.freeIds_.pop_back();
@@ -240,7 +242,7 @@ struct StaticMeta {
     // Elements in other threads that use this id.
     std::vector<ElementWrapper> elements;
     {
-      boost::lock_guard<boost::mutex> g(meta.lock_);
+      std::lock_guard<std::mutex> g(meta.lock_);
       for (ThreadEntry* e = meta.head_.next; e != &meta.head_; e = e->next) {
         if (id < e->elementsCapacity && e->elements[id].ptr) {
           elements.push_back(e->elements[id]);
@@ -277,62 +279,94 @@ struct StaticMeta {
    * @id to fit in.
    */
   static void reserve(int id) {
-    size_t prevSize = threadEntry_.elementsCapacity;
-    size_t newSize = static_cast<size_t>((id + 5) * 1.7);
+    size_t prevCapacity = threadEntry_.elementsCapacity;
+    // Growth factor < 2, see folly/docs/FBVector.md; + 5 to prevent
+    // very slow start.
+    size_t newCapacity = static_cast<size_t>((id + 5) * 1.7);
+    assert(newCapacity > prevCapacity);
     auto& meta = instance();
-    ElementWrapper* ptr = nullptr;
-
-    // Rely on jemalloc to zero the memory if possible -- maybe it knows
-    // it's already zeroed and saves us some work.
-    if (!usingJEMalloc() ||
-        prevSize < jemallocMinInPlaceExpandable ||
-        (rallocm(
-            static_cast<void**>(static_cast<void*>(&threadEntry_.elements)),
-            nullptr, newSize * sizeof(ElementWrapper), 0,
-            ALLOCM_NO_MOVE | ALLOCM_ZERO) != ALLOCM_SUCCESS)) {
-      // Sigh, must realloc, but we can't call realloc here, as elements is
-      // still linked in meta, so another thread might access invalid memory
-      // after realloc succeeds. We'll copy by hand and update threadEntry_
-      // under the lock.
+    ElementWrapper* reallocated = nullptr;
+
+    // Need to grow. Note that we can't call realloc, as elements is
+    // still linked in meta, so another thread might access invalid memory
+    // after realloc succeeds. We'll copy by hand and update threadEntry_
+    // under the lock.
+    if (usingJEMalloc()) {
+      bool success = false;
+      size_t newByteSize = newCapacity * sizeof(ElementWrapper);
+      size_t realByteSize = 0;
+
+      // Try to grow in place.
       //
-      // Note that we're using calloc instead of malloc in order to zero
-      // the entire region. rallocm (ALLOCM_ZERO) will only zero newly
-      // allocated memory, so if a previous allocation allocated more than
-      // we requested, it's our responsibility to guarantee that the tail
-      // is zeroed. calloc() is simpler than malloc() followed by memset(),
-      // and potentially faster when dealing with a lot of memory, as
-      // it can get already-zeroed pages from the kernel.
-      ptr = static_cast<ElementWrapper*>(
-        calloc(newSize, sizeof(ElementWrapper))
-      );
-      if (!ptr) throw std::bad_alloc();
+      // Note that rallocm(ALLOCM_ZERO) will only zero newly allocated memory,
+      // even if a previous allocation allocated more than we requested.
+      // This is fine; we always use ALLOCM_ZERO with jemalloc and we
+      // always expand our allocation to the real size.
+      if (prevCapacity * sizeof(ElementWrapper) >=
+          jemallocMinInPlaceExpandable) {
+        success = (rallocm(reinterpret_cast<void**>(&threadEntry_.elements),
+                           &realByteSize,
+                           newByteSize,
+                           0,
+                           ALLOCM_NO_MOVE | ALLOCM_ZERO) == ALLOCM_SUCCESS);
+
+      }
+
+      // In-place growth failed.
+      if (!success) {
+        // Note that, unlike calloc, allocm(... ALLOCM_ZERO) zeros all
+        // allocated bytes (*realByteSize) and not just the requested
+        // bytes (newByteSize)
+        success = (allocm(reinterpret_cast<void**>(&reallocated),
+                          &realByteSize,
+                          newByteSize,
+                          ALLOCM_ZERO) == ALLOCM_SUCCESS);
+      }
+
+      if (success) {
+        // Expand to real size
+        assert(realByteSize / sizeof(ElementWrapper) >= newCapacity);
+        newCapacity = realByteSize / sizeof(ElementWrapper);
+      } else {
+        throw std::bad_alloc();
+      }
+    } else { // no jemalloc
+      // calloc() is simpler than malloc() followed by memset(), and
+      // potentially faster when dealing with a lot of memory, as it can get
+      // already-zeroed pages from the kernel.
+      reallocated = static_cast<ElementWrapper*>(
+          calloc(newCapacity, sizeof(ElementWrapper)));
+      if (!reallocated) {
+        throw std::bad_alloc();
+      }
     }
 
     // Success, update the entry
     {
-      boost::lock_guard<boost::mutex> g(meta.lock_);
+      std::lock_guard<std::mutex> g(meta.lock_);
 
-      if (prevSize == 0) {
+      if (prevCapacity == 0) {
         meta.push_back(&threadEntry_);
       }
 
-      if (ptr) {
+      if (reallocated) {
         /*
          * Note: we need to hold the meta lock when copying data out of
          * the old vector, because some other thread might be
          * destructing a ThreadLocal and writing to the elements vector
          * of this thread.
          */
-        memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize);
+        memcpy(reallocated, threadEntry_.elements,
+               sizeof(ElementWrapper) * prevCapacity);
         using std::swap;
-        swap(ptr, threadEntry_.elements);
-        threadEntry_.elementsCapacity = newSize;
+        swap(reallocated, threadEntry_.elements);
       }
+      threadEntry_.elementsCapacity = newCapacity;
     }
 
-    free(ptr);
+    free(reallocated);
 
-    if (prevSize == 0) {
+    if (prevCapacity == 0) {
       pthread_setspecific(meta.pthreadKey_, &meta);
     }
   }
@@ -340,6 +374,7 @@ struct StaticMeta {
   static ElementWrapper& get(int id) {
     if (UNLIKELY(threadEntry_.elementsCapacity <= id)) {
       reserve(id);
+      assert(threadEntry_.elementsCapacity > id);
     }
     return threadEntry_.elements[id];
   }
diff --git a/folly/test/AtomicHashArrayTest.cpp b/folly/test/AtomicHashArrayTest.cpp
index 1fa8d994..fa6b78de 100644
--- a/folly/test/AtomicHashArrayTest.cpp
+++ b/folly/test/AtomicHashArrayTest.cpp
@@ -15,7 +15,9 @@
  */
 #include
+
 #include
+#include
 #include
 
 #include "folly/AtomicHashArray.h"
diff --git a/folly/test/ThreadCachedArenaTest.cpp b/folly/test/ThreadCachedArenaTest.cpp
index bc720ed0..431a044b 100644
--- a/folly/test/ThreadCachedArenaTest.cpp
+++ b/folly/test/ThreadCachedArenaTest.cpp
@@ -17,6 +17,7 @@
 #include "folly/ThreadCachedArena.h"
 #include "folly/Memory.h"
+#include
 #include
 #include
 #include
diff --git a/folly/test/ThreadLocalTest.cpp b/folly/test/ThreadLocalTest.cpp
index 1c948257..b9f69a0f 100644
--- a/folly/test/ThreadLocalTest.cpp
+++ b/folly/test/ThreadLocalTest.cpp
@@ -18,18 +18,23 @@
 #include
 #include
-#include
-#include
-#include
+#include
+
+#include
 #include
-#include
 #include
+#include
+#include
+#include
 #include
-#include
+
 #include
-#include
 #include
 #include
+#include
+
 #include "folly/Benchmark.h"
 
 using namespace folly;
@@ -298,6 +303,80 @@ TEST(ThreadLocal, Movable2) {
   EXPECT_EQ(4, tls.size());
 }
 
+namespace {
+
+constexpr size_t kFillObjectSize = 300;
+
+std::atomic<uint64_t> gDestroyed;
+
+/**
+ * Fill a chunk of memory with a unique-ish pattern that includes the thread id
+ * (so deleting one of these from another thread would cause a failure)
+ *
+ * Verify it explicitly and on destruction.
+ */
+class FillObject {
+ public:
+  explicit FillObject(uint64_t idx) : idx_(idx) {
+    uint64_t v = val();
+    for (size_t i = 0; i < kFillObjectSize; ++i) {
+      data_[i] = v;
+    }
+  }
+
+  void check() {
+    uint64_t v = val();
+    for (size_t i = 0; i < kFillObjectSize; ++i) {
+      CHECK_EQ(v, data_[i]);
+    }
+  }
+
+  ~FillObject() {
+    ++gDestroyed;
+  }
+
+ private:
+  uint64_t val() const {
+    return (idx_ << 40) | uint64_t(pthread_self());
+  }
+
+  uint64_t idx_;
+  uint64_t data_[kFillObjectSize];
+};
+
+} // namespace
+
+TEST(ThreadLocal, Stress) {
+  constexpr size_t numFillObjects = 250;
+  std::array<ThreadLocalPtr<FillObject>, numFillObjects> objects;
+
+  constexpr size_t numThreads = 32;
+  constexpr size_t numReps = 20;
+
+  std::vector<std::thread> threads;
+  threads.reserve(numThreads);
+
+  for (size_t i = 0; i < numThreads; ++i) {
+    threads.emplace_back([&objects] {
+      for (size_t rep = 0; rep < numReps; ++rep) {
+        for (size_t i = 0; i < objects.size(); ++i) {
+          objects[i].reset(new FillObject(rep * objects.size() + i));
+          std::this_thread::sleep_for(std::chrono::microseconds(100));
+        }
+        for (size_t i = 0; i < objects.size(); ++i) {
+          objects[i]->check();
+        }
+      }
+    });
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  EXPECT_EQ(numFillObjects * numThreads * numReps, gDestroyed);
+}
+
 // Yes, threads and fork don't mix
 // (http://cppwisdom.quora.com/Why-threads-and-fork-dont-mix) but if you're
 // stupid or desperate enough to try, we shouldn't stand in your way.
-- 
2.34.1
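
The growth logic is easier to follow pulled out of its surroundings. Below is a minimal, self-contained sketch (not folly code) of the shape reserve() ends up with: tryGrowInPlace() and kMinInPlaceExpandable are hypothetical stand-ins for jemalloc's rallocm(... ALLOCM_NO_MOVE | ALLOCM_ZERO) and folly's jemallocMinInPlaceExpandable, and the allocm() fallback, the meta lock, and the ThreadEntry bookkeeping are omitted. It illustrates both fixes: the in-place-expansion threshold is compared against a byte count (bug 1), and the capacity is recomputed from the size actually allocated on every successful path (bug 2).

#include <cassert>
#include <cstdlib>
#include <cstring>
#include <new>

struct ElementWrapper { void* ptr = nullptr; };

// Hypothetical stand-in for jemalloc's
// rallocm(&elements, &realByteSize, newByteSize, 0, ALLOCM_NO_MOVE | ALLOCM_ZERO):
// try to grow the existing allocation in place and report the real (possibly
// larger) size. Always fails in this sketch.
static bool tryGrowInPlace(ElementWrapper** /*elements*/, size_t /*newByteSize*/,
                           size_t* /*realByteSize*/) {
  return false;
}

// Byte threshold below which in-place expansion is not worth attempting,
// playing the role of jemallocMinInPlaceExpandable.
constexpr size_t kMinInPlaceExpandable = 4096;

static void reserve(ElementWrapper*& elements, size_t& capacity, int id) {
  size_t prevCapacity = capacity;
  // Same growth policy as the patch: factor < 2, +5 to avoid a slow start.
  size_t newCapacity = static_cast<size_t>((id + 5) * 1.7);
  assert(newCapacity > prevCapacity);

  size_t newByteSize = newCapacity * sizeof(ElementWrapper);
  size_t realByteSize = newByteSize;

  // Bug 1's fix: compare bytes with bytes, not an element count with a byte
  // threshold.
  if (prevCapacity * sizeof(ElementWrapper) >= kMinInPlaceExpandable &&
      tryGrowInPlace(&elements, newByteSize, &realByteSize)) {
    // Grew in place; realByteSize now holds what is actually available.
  } else {
    // Could not grow in place: allocate zeroed memory and copy by hand
    // (the real code must do this under a lock, because other threads may
    // still read the old array through the meta list).
    auto* grown = static_cast<ElementWrapper*>(
        calloc(newCapacity, sizeof(ElementWrapper)));
    if (!grown) throw std::bad_alloc();
    if (elements) {
      memcpy(grown, elements, prevCapacity * sizeof(ElementWrapper));
      free(elements);
    }
    elements = grown;
  }

  // Bug 2's fix: capacity is updated on every successful path, derived from
  // the bytes that were actually allocated.
  capacity = realByteSize / sizeof(ElementWrapper);
  assert(capacity > static_cast<size_t>(id));
}

int main() {
  ElementWrapper* elements = nullptr;
  size_t capacity = 0;
  for (int id = 0; id < 10000; ++id) {
    if (capacity <= static_cast<size_t>(id)) {
      reserve(elements, capacity, id);
    }
  }
  free(elements);
  return 0;
}

Under jemalloc the real allocation can be larger than requested; deriving the new capacity from that real size rather than from newCapacity is what switching to allocm() buys in the actual patch.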