From c55ca702ba15bc9dbb7f5c18b267e97ec1d79073 Mon Sep 17 00:00:00 2001
From: Jordan DeLong
Date: Wed, 14 Aug 2013 17:03:04 -0700
Subject: [PATCH] Fix a ThreadLocal bug: hold the meta lock when resizing the
 element vector

Summary: There appears to be a race here. leizha reported issues with
a heavily recycled AtomicHashMap (ThreadCachedInt inside). It looks
like what's happening is this:

- Thread A: ~ThreadCachedInt from an AHM
  - the meta lock is taken, and the ThreadElement list is iterated
  - all entries are zeroed, and the id is marked free
  - then the lock is released

- Thread B: someone is calling get() on an unrelated id
  - hits reserve(): rallocm on the pointer, or an unsynchronized
    memcpy from the element vector
  - waits on the lock
  - when it gets the lock, it stores back the value it read, which
    was zeroed by A

Later, someone reuses the id from the freelist and reuses the
previously freed pointer, eventually double-freeing it. (nullptr is
the signifier for "this thread doesn't have an instance of the
threadlocal yet".)
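
To make the interleaving concrete, here is a minimal single-threaded
model of the lost update described above. Slot, slots, and metaLock
are invented names for this sketch; it is a model of the pre-patch
behavior, not folly code:

  #include <cassert>
  #include <mutex>

  struct Slot { void* ptr = nullptr; };

  std::mutex metaLock;   // stands in for StaticMeta's lock
  Slot slots[4];         // stands in for one thread's element vector

  int main() {
    int obj = 42;
    slots[1].ptr = &obj;  // thread B has an instance for id 1

    // Thread B, pre-patch reserve(): copies its element vector while
    // NOT holding the meta lock.
    Slot copy = slots[1];

    // Thread A, destroying the ThreadLocal: under the lock, zeroes
    // the entry and marks id 1 free.
    { std::lock_guard<std::mutex> g(metaLock); slots[1].ptr = nullptr; }

    // Thread B: under the lock, installs the copied vector, storing
    // back the stale pointer that A just cleared.
    { std::lock_guard<std::mutex> g(metaLock); slots[1] = copy; }

    // The freed slot is non-null again: when id 1 is reused, the old
    // pointer will eventually be destroyed a second time.
    assert(slots[1].ptr != nullptr);
    return 0;
  }
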
Test Plan: leizha's test case doesn't segv after this diff---it was
reliably breaking with corruption in malloc before it. I'm working on
making that test case into a unit test to add to this diff, but I'm
putting it up early in case there's something wrong with the theory
above or in case someone has an idea for a better fix.

Reviewed By: tudorb@fb.com

FB internal diff: D928534
---
 folly/detail/ThreadLocalDetail.h |  46 +++++++----
 folly/test/AHMIntStressTest.cpp  | 126 +++++++++++++++++++++++++++++++
 2 files changed, 156 insertions(+), 16 deletions(-)
 create mode 100644 folly/test/AHMIntStressTest.cpp

diff --git a/folly/detail/ThreadLocalDetail.h b/folly/detail/ThreadLocalDetail.h
index ad53b9c9..b9624389 100644
--- a/folly/detail/ThreadLocalDetail.h
+++ b/folly/detail/ThreadLocalDetail.h
@@ -245,14 +245,20 @@ struct StaticMeta {
         if (id < e->elementsCapacity && e->elements[id].ptr) {
           elements.push_back(e->elements[id]);
 
-          // Writing another thread's ThreadEntry from here is fine;
-          // the only other potential reader is the owning thread --
-          // from onThreadExit (which grabs the lock, so is properly
-          // synchronized with us) or from get() -- but using get() on a
-          // ThreadLocalPtr object that's being destroyed is a bug, so
-          // undefined behavior is fair game.
-          e->elements[id].ptr = NULL;
-          e->elements[id].deleter = NULL;
+          /*
+           * Writing another thread's ThreadEntry from here is fine;
+           * the only other potential reader is the owning thread --
+           * from onThreadExit (which grabs the lock, so is properly
+           * synchronized with us) or from get(), which also grabs
+           * the lock if it needs to resize the elements vector.
+           *
+           * We can't conflict with reads for a get(id), because
+           * it's illegal to call get on a thread local that's
+           * destructing.
+           */
+          e->elements[id].ptr = nullptr;
+          e->elements[id].deleter = nullptr;
+          e->elements[id].ownsDeleter = false;
         }
       }
       meta.freeIds_.push_back(id);
@@ -275,13 +281,14 @@ struct StaticMeta {
     size_t newSize = static_cast<size_t>((id + 5) * 1.7);
     auto& meta = instance();
     ElementWrapper* ptr = nullptr;
+
     // Rely on jemalloc to zero the memory if possible -- maybe it knows
     // it's already zeroed and saves us some work.
     if (!usingJEMalloc() ||
         prevSize < jemallocMinInPlaceExpandable ||
         (rallocm(
           static_cast<void**>(static_cast<void*>(&threadEntry_.elements)),
-          NULL, newSize * sizeof(ElementWrapper), 0,
+          nullptr, newSize * sizeof(ElementWrapper), 0,
           ALLOCM_NO_MOVE | ALLOCM_ZERO) != ALLOCM_SUCCESS)) {
       // Sigh, must realloc, but we can't call realloc here, as elements is
       // still linked in meta, so another thread might access invalid memory
@@ -295,25 +302,32 @@ struct StaticMeta {
       // is zeroed. calloc() is simpler than malloc() followed by memset(),
       // and potentially faster when dealing with a lot of memory, as
       // it can get already-zeroed pages from the kernel.
-      if ((ptr = static_cast<ElementWrapper*>(
-             calloc(newSize, sizeof(ElementWrapper)))) != nullptr) {
-        memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize);
-      } else {
-        throw std::bad_alloc();
-      }
+      ptr = static_cast<ElementWrapper*>(
+        calloc(newSize, sizeof(ElementWrapper))
+      );
+      if (!ptr) throw std::bad_alloc();
     }
 
     // Success, update the entry
     {
       boost::lock_guard<boost::mutex> g(meta.lock_);
+
       if (prevSize == 0) {
         meta.push_back(&threadEntry_);
       }
+
       if (ptr) {
+        /*
+         * Note: we need to hold the meta lock when copying data out of
+         * the old vector, because some other thread might be
+         * destructing a ThreadLocal and writing to the elements vector
+         * of this thread.
+         */
+        memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize);
         using std::swap;
         swap(ptr, threadEntry_.elements);
+        threadEntry_.elementsCapacity = newSize;
       }
-      threadEntry_.elementsCapacity = newSize;
     }
 
     free(ptr);
diff --git a/folly/test/AHMIntStressTest.cpp b/folly/test/AHMIntStressTest.cpp
new file mode 100644
index 00000000..1d90660b
--- /dev/null
+++ b/folly/test/AHMIntStressTest.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2013 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <mutex>
+#include <thread>
+
+#include "folly/AtomicHashMap.h"
+#include "folly/ScopeGuard.h"
+#include "folly/Memory.h"
+
+namespace {
+
+struct MyObject {
+  explicit MyObject(int i) : i(i) {}
+  int i;
+};
+
+typedef folly::AtomicHashMap<int,std::shared_ptr<MyObject>> MyMap;
+typedef std::lock_guard<std::mutex> Guard;
+
+std::unique_ptr<MyMap> newMap() { return folly::make_unique<MyMap>(100); }
+
+struct MyObjectDirectory {
+  MyObjectDirectory()
+    : cur_(newMap())
+    , prev_(newMap())
+  {}
+
+  std::shared_ptr<MyObject> get(int key) {
+    auto val = tryGet(key);
+    if (val) {
+      return val;
+    }
+
+    std::shared_ptr<MyMap> cur;
+    {
+      Guard g(lock_);
+      cur = cur_;
+    }
+
+    auto ret = cur->insert(key, std::make_shared<MyObject>(key));
+    return ret.first->second;
+  }
+
+  std::shared_ptr<MyObject> tryGet(int key) {
+    std::shared_ptr<MyMap> cur;
+    std::shared_ptr<MyMap> prev;
+    {
+      Guard g(lock_);
+      cur = cur_;
+      prev = prev_;
+    }
+
+    auto it = cur->find(key);
+    if (it != cur->end()) {
+      return it->second;
+    }
+
+    it = prev->find(key);
+    if (it != prev->end()) {
+      auto ret = cur->insert(key, it->second);
+      return ret.first->second;
+    }
+
+    return nullptr;
+  }
+
+  void archive() {
+    std::shared_ptr<MyMap> cur(newMap());
+
+    Guard g(lock_);
+    prev_ = cur_;
+    cur_ = cur;
+  }
+
+  std::mutex lock_;
+  std::shared_ptr<MyMap> cur_;
+  std::shared_ptr<MyMap> prev_;
+};
+
+}
+
+//////////////////////////////////////////////////////////////////////
+
+/*
+ * This test case stresses ThreadLocal allocation/deallocation heavily
+ * via ThreadCachedInt and AtomicHashMap, and a bunch of other
+ * mallocing.
+ */
+TEST(AHMIntStressTest, Test) {
+  auto const objs = new MyObjectDirectory();
+  SCOPE_EXIT { delete objs; };
+
+  std::vector<std::thread> threads;
+  for (int threadId = 0; threadId < 64; ++threadId) {
+    threads.emplace_back(
+      [objs,threadId] {
+        for (int recycles = 0; recycles < 500; ++recycles) {
+          for (int i = 0; i < 10; i++) {
+            auto val = objs->get(i);
+          }
+
+          objs->archive();
+        }
+      }
+    );
+  }
+
+  for (auto& t : threads) t.join();
+}
-- 
2.34.1