From c55ca702ba15bc9dbb7f5c18b267e97ec1d79073 Mon Sep 17 00:00:00 2001
From: Jordan DeLong
Date: Wed, 14 Aug 2013 17:03:04 -0700
Subject: [PATCH] Fix a ThreadLocal bug: hold the meta lock when resizing the
 element vector

Summary: There appears to be a race here. leizha reported issues with
a heavily recycled AtomicHashMap (ThreadCachedInt inside). It looks
like what's happening is this:

- Thread A: ~ThreadCachedInt from an AHM
  - the meta lock is taken, and the ThreadElement list is iterated
  - all entries are zeroed, and the id is marked free
  - then the lock is released

- Thread B: someone is calling get() on an unrelated id
  - hits reserve(): rallocm on the pointer, or an unsynchronized
    memcpy from the element vector
  - waits on the lock
  - when it gets the lock, it stores back the value it read, which
    was zeroed by A

Later, someone reuses the id from the freelist and reuses the
previously freed pointer, eventually double-freeing it. (nullptr is
the signifier for "this thread doesn't have an instance of the
threadlocal yet".)
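
To make the interleaving concrete, here is a minimal single-threaded
model of the lost update described above. Slot, slots, and metaLock
are invented names for this sketch; it is a model of the pre-patch
behavior, not folly code:

  #include <cassert>
  #include <mutex>

  struct Slot { void* ptr = nullptr; };

  std::mutex metaLock;   // stands in for StaticMeta's lock
  Slot slots[4];         // stands in for one thread's element vector

  int main() {
    int obj = 42;
    slots[1].ptr = &obj;  // thread B has an instance for id 1

    // Thread B, pre-patch reserve(): copies its element vector while
    // NOT holding the meta lock.
    Slot copy = slots[1];

    // Thread A, destroying the ThreadLocal: under the lock, zeroes
    // the entry and marks id 1 free.
    { std::lock_guard<std::mutex> g(metaLock); slots[1].ptr = nullptr; }

    // Thread B: under the lock, installs the copied vector, storing
    // back the stale pointer that A just cleared.
    { std::lock_guard<std::mutex> g(metaLock); slots[1] = copy; }

    // The freed slot is non-null again: when id 1 is reused, the old
    // pointer will eventually be destroyed a second time.
    assert(slots[1].ptr != nullptr);
    return 0;
  }
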
Test Plan: leizha's test case doesn't segv after this diff---it was
reliably breaking with corruption in malloc before it. I'm working on
making that test case into a unit test to add to this diff, but I'm
putting it up early in case there's something wrong with the theory
above or in case someone has an idea for a better fix.

Reviewed By: tudorb@fb.com

FB internal diff: D928534
---
 folly/detail/ThreadLocalDetail.h |  46 +++++++----
 folly/test/AHMIntStressTest.cpp  | 126 +++++++++++++++++++++++++++++++
 2 files changed, 156 insertions(+), 16 deletions(-)
 create mode 100644 folly/test/AHMIntStressTest.cpp

diff --git a/folly/detail/ThreadLocalDetail.h b/folly/detail/ThreadLocalDetail.h
index ad53b9c9..b9624389 100644
--- a/folly/detail/ThreadLocalDetail.h
+++ b/folly/detail/ThreadLocalDetail.h
@@ -245,14 +245,20 @@ struct StaticMeta {
         if (id < e->elementsCapacity && e->elements[id].ptr) {
           elements.push_back(e->elements[id]);
 
-          // Writing another thread's ThreadEntry from here is fine;
-          // the only other potential reader is the owning thread --
-          // from onThreadExit (which grabs the lock, so is properly
-          // synchronized with us) or from get() -- but using get() on a
-          // ThreadLocalPtr object that's being destroyed is a bug, so
-          // undefined behavior is fair game.
-          e->elements[id].ptr = NULL;
-          e->elements[id].deleter = NULL;
+          /*
+           * Writing another thread's ThreadEntry from here is fine;
+           * the only other potential reader is the owning thread --
+           * from onThreadExit (which grabs the lock, so is properly
+           * synchronized with us) or from get(), which also grabs
+           * the lock if it needs to resize the elements vector.
+           *
+           * We can't conflict with reads for a get(id), because
+           * it's illegal to call get on a thread local that's
+           * destructing.
+           */
+          e->elements[id].ptr = nullptr;
+          e->elements[id].deleter = nullptr;
+          e->elements[id].ownsDeleter = false;
         }
       }
       meta.freeIds_.push_back(id);
@@ -275,13 +281,14 @@ struct StaticMeta {
     size_t newSize = static_cast<size_t>((id + 5) * 1.7);
     auto& meta = instance();
     ElementWrapper* ptr = nullptr;
+
     // Rely on jemalloc to zero the memory if possible -- maybe it knows
     // it's already zeroed and saves us some work.
     if (!usingJEMalloc() ||
         prevSize < jemallocMinInPlaceExpandable ||
         (rallocm(
           static_cast<void**>(static_cast<void*>(&threadEntry_.elements)),
-          NULL, newSize * sizeof(ElementWrapper), 0,
+          nullptr, newSize * sizeof(ElementWrapper), 0,
           ALLOCM_NO_MOVE | ALLOCM_ZERO) != ALLOCM_SUCCESS)) {
       // Sigh, must realloc, but we can't call realloc here, as elements is
       // still linked in meta, so another thread might access invalid memory
@@ -295,25 +302,32 @@ struct StaticMeta {
       // is zeroed. calloc() is simpler than malloc() followed by memset(),
       // and potentially faster when dealing with a lot of memory, as
       // it can get already-zeroed pages from the kernel.
-      if ((ptr = static_cast<ElementWrapper*>(
-             calloc(newSize, sizeof(ElementWrapper)))) != nullptr) {
-        memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize);
-      } else {
-        throw std::bad_alloc();
-      }
+      ptr = static_cast<ElementWrapper*>(
+        calloc(newSize, sizeof(ElementWrapper))
+      );
+      if (!ptr) throw std::bad_alloc();
     }
 
     // Success, update the entry
     {
       boost::lock_guard<boost::mutex> g(meta.lock_);
+
       if (prevSize == 0) {
         meta.push_back(&threadEntry_);
       }
+
       if (ptr) {
+        /*
+         * Note: we need to hold the meta lock when copying data out of
+         * the old vector, because some other thread might be
+         * destructing a ThreadLocal and writing to the elements vector
+         * of this thread.
+         */
+        memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize);
         using std::swap;
         swap(ptr, threadEntry_.elements);
+        threadEntry_.elementsCapacity = newSize;
       }
-      threadEntry_.elementsCapacity = newSize;
     }
 
     free(ptr);
diff --git a/folly/test/AHMIntStressTest.cpp b/folly/test/AHMIntStressTest.cpp
new file mode 100644
index 00000000..1d90660b
--- /dev/null
+++ b/folly/test/AHMIntStressTest.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <mutex>
+#include <thread>
+
+#include "folly/AtomicHashMap.h"
+#include "folly/ScopeGuard.h"
+#include "folly/Memory.h"
+
+namespace {
+
+struct MyObject {
+  explicit MyObject(int i) : i(i) {}
+  int i;
+};
+
+typedef folly::AtomicHashMap<int,std::shared_ptr<MyObject>> MyMap;
+typedef std::lock_guard<std::mutex> Guard;
+
+std::unique_ptr<MyMap> newMap() { return folly::make_unique<MyMap>(100); }
+
+struct MyObjectDirectory {
+  MyObjectDirectory()
+    : cur_(newMap())
+    , prev_(newMap())
+  {}
+
+  std::shared_ptr<MyObject> get(int key) {
+    auto val = tryGet(key);
+    if (val) {
+      return val;
+    }
+
+    std::shared_ptr<MyMap> cur;
+    {
+      Guard g(lock_);
+      cur = cur_;
+    }
+
+    auto ret = cur->insert(key, std::make_shared<MyObject>(key));
+    return ret.first->second;
+  }
+
+  std::shared_ptr<MyObject> tryGet(int key) {
+    std::shared_ptr<MyMap> cur;
+    std::shared_ptr<MyMap> prev;
+    {
+      Guard g(lock_);
+      cur = cur_;
+      prev = prev_;
+    }
+
+    auto it = cur->find(key);
+    if (it != cur->end()) {
+      return it->second;
+    }
+
+    it = prev->find(key);
+    if (it != prev->end()) {
+      auto ret = cur->insert(key, it->second);
+      return ret.first->second;
+    }
+
+    return nullptr;
+  }
+
+  void archive() {
+    std::shared_ptr<MyMap> cur(newMap());
+
+    Guard g(lock_);
+    prev_ = cur_;
+    cur_ = cur;
+  }
+
+  std::mutex lock_;
+  std::shared_ptr<MyMap> cur_;
+  std::shared_ptr<MyMap> prev_;
+};
+
+}
+
+//////////////////////////////////////////////////////////////////////
+
+/*
+ * This test case stresses ThreadLocal allocation/deallocation heavily
+ * via ThreadCachedInt and AtomicHashMap, and a bunch of other
+ * mallocing.
+ */
+TEST(AHMIntStressTest, Test) {
+  auto const objs = new MyObjectDirectory();
+  SCOPE_EXIT { delete objs; };
+
+  std::vector<std::thread> threads;
+  for (int threadId = 0; threadId < 64; ++threadId) {
+    threads.emplace_back(
+      [objs,threadId] {
+        for (int recycles = 0; recycles < 500; ++recycles) {
+          for (int i = 0; i < 10; i++) {
+            auto val = objs->get(i);
+          }
+
+          objs->archive();
+        }
+      }
+    );
+  }
+
+  for (auto& t : threads) t.join();
+}
-- 
2.34.1