Fix bad bug in folly::ThreadLocal

author Tudor Bosman <tudorb@fb.com>

Fri, 27 Sep 2013 00:19:24 +0000 (17:19 -0700)

committer Peter Griess <pgriess@fb.com>

Tue, 15 Oct 2013 01:44:12 +0000 (18:44 -0700)
author Tudor Bosman <tudorb@fb.com>
Fri, 27 Sep 2013 00:19:24 +0000 (17:19 -0700)
committer Peter Griess <pgriess@fb.com>
Tue, 15 Oct 2013 01:44:12 +0000 (18:44 -0700)
diff --git a/folly/ThreadCachedInt.h b/folly/ThreadCachedInt.h

index 69c03d1f020ce7f5214f3eaf7d77c57c0b302125..b9d30429d3832bf2e56fe4348f45cd06dba8e81e 100644 (file)
--- a/folly/ThreadCachedInt.h
+++ b/folly/ThreadCachedInt.h
@@ -24,6 +24,9 @@
  #define FOLLY_THREADCACHEDINT_H
  
  #include <atomic>
+
+#include <boost/noncopyable.hpp>
+
  #include "folly/Likely.h"
  #include "folly/ThreadLocal.h"
  
diff --git a/folly/ThreadLocal.h b/folly/ThreadLocal.h

index b4c031e750b3d8d2b0a47e863918ade33b313b4e..e2ef58b0d8e203c4d8c507e11410593c04a7d0ae 100644 (file)
--- a/folly/ThreadLocal.h
+++ b/folly/ThreadLocal.h
@@ -196,7 +196,7 @@ class ThreadLocalPtr {
      friend class ThreadLocalPtr<T,Tag>;
  
      threadlocal_detail::StaticMeta<Tag>& meta_;
-    boost::mutex* lock_;
+    std::mutex* lock_;
      int id_;
  
     public:
diff --git a/folly/detail/ThreadLocalDetail.h b/folly/detail/ThreadLocalDetail.h

index b9624389f4ee3836b618b4a6bf659ab2e84386cd..20a74fa56f0678f2580b39d62b69c8bc4e7a606d 100644 (file)
--- a/folly/detail/ThreadLocalDetail.h
+++ b/folly/detail/ThreadLocalDetail.h
@@ -19,20 +19,22 @@
  
  #include <limits.h>
  #include <pthread.h>
-#include <list>
+
+#include <mutex>
  #include <string>
  #include <vector>
  
-#include <boost/thread/locks.hpp>
-#include <boost/thread/mutex.hpp>
-#include <boost/thread/locks.hpp>
-
  #include <glog/logging.h>
  
-#include "folly/Exception.h"
  #include "folly/Foreach.h"
+#include "folly/Exception.h"
  #include "folly/Malloc.h"
  
+// TODO(tudorb): Remove this declaration after Malloc.h is pushed to
+// third-party.
+extern "C" int allocm(void**, size_t*, size_t, int)
+__attribute__((weak));
+
  namespace folly {
  namespace threadlocal_detail {
  
@@ -154,7 +156,7 @@ struct StaticMeta {
  
    int nextId_;
    std::vector<int> freeIds_;
-  boost::mutex lock_;
+  std::mutex lock_;
    pthread_key_t pthreadKey_;
    ThreadEntry head_;
  
@@ -208,7 +210,7 @@ struct StaticMeta {
      // We wouldn't call pthread_setspecific unless we actually called get()
      DCHECK_NE(threadEntry_.elementsCapacity, 0);
      {
-      boost::lock_guard<boost::mutex> g(meta.lock_);
+      std::lock_guard<std::mutex> g(meta.lock_);
        meta.erase(&threadEntry_);
        // No need to hold the lock any longer; threadEntry_ is private to this
        // thread now that it's been removed from meta.
@@ -224,7 +226,7 @@ struct StaticMeta {
    static int create() {
      int id;
      auto & meta = instance();
-    boost::lock_guard<boost::mutex> g(meta.lock_);
+    std::lock_guard<std::mutex> g(meta.lock_);
      if (!meta.freeIds_.empty()) {
        id = meta.freeIds_.back();
        meta.freeIds_.pop_back();
@@ -240,7 +242,7 @@ struct StaticMeta {
        // Elements in other threads that use this id.
        std::vector<ElementWrapper> elements;
        {
-        boost::lock_guard<boost::mutex> g(meta.lock_);
+        std::lock_guard<std::mutex> g(meta.lock_);
          for (ThreadEntry* e = meta.head_.next; e != &meta.head_; e = e->next) {
            if (id < e->elementsCapacity && e->elements[id].ptr) {
              elements.push_back(e->elements[id]);
@@ -277,62 +279,94 @@ struct StaticMeta {
     * @id to fit in.
     */
    static void reserve(int id) {
-    size_t prevSize = threadEntry_.elementsCapacity;
-    size_t newSize = static_cast<size_t>((id + 5) * 1.7);
+    size_t prevCapacity = threadEntry_.elementsCapacity;
+    // Growth factor < 2, see folly/docs/FBVector.md; + 5 to prevent
+    // very slow start.
+    size_t newCapacity = static_cast<size_t>((id + 5) * 1.7);
+    assert(newCapacity > prevCapacity);
      auto& meta = instance();
-    ElementWrapper* ptr = nullptr;
-
-    // Rely on jemalloc to zero the memory if possible -- maybe it knows
-    // it's already zeroed and saves us some work.
-    if (!usingJEMalloc() ||
-        prevSize < jemallocMinInPlaceExpandable ||
-        (rallocm(
-          static_cast<void**>(static_cast<void*>(&threadEntry_.elements)),
-          nullptr, newSize * sizeof(ElementWrapper), 0,
-          ALLOCM_NO_MOVE | ALLOCM_ZERO) != ALLOCM_SUCCESS)) {
-      // Sigh, must realloc, but we can't call realloc here, as elements is
-      // still linked in meta, so another thread might access invalid memory
-      // after realloc succeeds.  We'll copy by hand and update threadEntry_
-      // under the lock.
+    ElementWrapper* reallocated = nullptr;
+
+    // Need to grow. Note that we can't call realloc, as elements is
+    // still linked in meta, so another thread might access invalid memory
+    // after realloc succeeds. We'll copy by hand and update threadEntry_
+    // under the lock.
+    if (usingJEMalloc()) {
+      bool success = false;
+      size_t newByteSize = newCapacity * sizeof(ElementWrapper);
+      size_t realByteSize = 0;
+
+      // Try to grow in place.
        //
-      // Note that we're using calloc instead of malloc in order to zero
-      // the entire region.  rallocm (ALLOCM_ZERO) will only zero newly
-      // allocated memory, so if a previous allocation allocated more than
-      // we requested, it's our responsibility to guarantee that the tail
-      // is zeroed.  calloc() is simpler than malloc() followed by memset(),
-      // and potentially faster when dealing with a lot of memory, as
-      // it can get already-zeroed pages from the kernel.
-      ptr = static_cast<ElementWrapper*>(
-        calloc(newSize, sizeof(ElementWrapper))
-      );
-      if (!ptr) throw std::bad_alloc();
+      // Note that rallocm(ALLOCM_ZERO) will only zero newly allocated memory,
+      // even if a previous allocation allocated more than we requested.
+      // This is fine; we always use ALLOCM_ZERO with jemalloc and we
+      // always expand our allocation to the real size.
+      if (prevCapacity * sizeof(ElementWrapper) >=
+          jemallocMinInPlaceExpandable) {
+        success = (rallocm(reinterpret_cast<void**>(&threadEntry_.elements),
+                           &realByteSize,
+                           newByteSize,
+                           0,
+                           ALLOCM_NO_MOVE | ALLOCM_ZERO) == ALLOCM_SUCCESS);
+
+      }
+
+      // In-place growth failed.
+      if (!success) {
+        // Note that, unlike calloc,allocm(... ALLOCM_ZERO) zeros all
+        // allocated bytes (*realByteSize) and not just the requested
+        // bytes (newByteSize)
+        success = (allocm(reinterpret_cast<void**>(&reallocated),
+                          &realByteSize,
+                          newByteSize,
+                          ALLOCM_ZERO) == ALLOCM_SUCCESS);
+      }
+
+      if (success) {
+        // Expand to real size
+        assert(realByteSize / sizeof(ElementWrapper) >= newCapacity);
+        newCapacity = realByteSize / sizeof(ElementWrapper);
+      } else {
+        throw std::bad_alloc();
+      }
+    } else {  // no jemalloc
+      // calloc() is simpler than malloc() followed by memset(), and
+      // potentially faster when dealing with a lot of memory, as it can get
+      // already-zeroed pages from the kernel.
+      reallocated = static_cast<ElementWrapper*>(
+          calloc(newCapacity, sizeof(ElementWrapper)));
+      if (!reallocated) {
+        throw std::bad_alloc();
+      }
      }
  
      // Success, update the entry
      {
-      boost::lock_guard<boost::mutex> g(meta.lock_);
+      std::lock_guard<std::mutex> g(meta.lock_);
  
-      if (prevSize == 0) {
+      if (prevCapacity == 0) {
          meta.push_back(&threadEntry_);
        }
  
-      if (ptr) {
+      if (reallocated) {
         /*
          * Note: we need to hold the meta lock when copying data out of
          * the old vector, because some other thread might be
          * destructing a ThreadLocal and writing to the elements vector
          * of this thread.
          */
-        memcpy(ptr, threadEntry_.elements, sizeof(ElementWrapper) * prevSize);
+        memcpy(reallocated, threadEntry_.elements,
+               sizeof(ElementWrapper) * prevCapacity);
          using std::swap;
-        swap(ptr, threadEntry_.elements);
-        threadEntry_.elementsCapacity = newSize;
+        swap(reallocated, threadEntry_.elements);
        }
+      threadEntry_.elementsCapacity = newCapacity;
      }
  
-    free(ptr);
+    free(reallocated);
  
-    if (prevSize == 0) {
+    if (prevCapacity == 0) {
        pthread_setspecific(meta.pthreadKey_, &meta);
      }
    }
@@ -340,6 +374,7 @@ struct StaticMeta {
    static ElementWrapper& get(int id) {
      if (UNLIKELY(threadEntry_.elementsCapacity <= id)) {
        reserve(id);
+      assert(threadEntry_.elementsCapacity > id);
      }
      return threadEntry_.elements[id];
    }
diff --git a/folly/test/AtomicHashArrayTest.cpp b/folly/test/AtomicHashArrayTest.cpp

index 1fa8d994c2a6caa7a21a78e4d69c297b134bdcf7..fa6b78dea614ddd8581b96304b309ff14d950a34 100644 (file)
--- a/folly/test/AtomicHashArrayTest.cpp
+++ b/folly/test/AtomicHashArrayTest.cpp
@@ -15,7 +15,9 @@
   */
  
  #include <sys/mman.h>
+
  #include <cstddef>
+#include <map>
  #include <stdexcept>
  
  #include "folly/AtomicHashArray.h"
diff --git a/folly/test/ThreadCachedArenaTest.cpp b/folly/test/ThreadCachedArenaTest.cpp

index bc720ed09d38ee43c196bb4eb385f1b6a1cf25f4..431a044b9233cab1d144fd7efc92659f75b0069f 100644 (file)
--- a/folly/test/ThreadCachedArenaTest.cpp
+++ b/folly/test/ThreadCachedArenaTest.cpp
@@ -17,6 +17,7 @@
  #include "folly/ThreadCachedArena.h"
  #include "folly/Memory.h"
  
+#include <map>
  #include <mutex>
  #include <thread>
  #include <iterator>
diff --git a/folly/test/ThreadLocalTest.cpp b/folly/test/ThreadLocalTest.cpp

index 1c948257f35e360a593d888dafc974fc94fcbc03..b9f69a0fd812e1485545be7d1c3c34b73ef76670 100644 (file)
--- a/folly/test/ThreadLocalTest.cpp
+++ b/folly/test/ThreadLocalTest.cpp
@@ -18,18 +18,23 @@
  
  #include <sys/types.h>
  #include <sys/wait.h>
-#include <map>
-#include <unordered_map>
-#include <set>
+#include <unistd.h>
+
+#include <array>
  #include <atomic>
-#include <mutex>
+#include <chrono>
  #include <condition_variable>
+#include <map>
+#include <mutex>
+#include <set>
  #include <thread>
-#include <unistd.h>
+#include <unordered_map>
+
  #include <boost/thread/tss.hpp>
-#include <gtest/gtest.h>
  #include <gflags/gflags.h>
  #include <glog/logging.h>
+#include <gtest/gtest.h>
+
  #include "folly/Benchmark.h"
  
  using namespace folly;
@@ -298,6 +303,80 @@ TEST(ThreadLocal, Movable2) {
    EXPECT_EQ(4, tls.size());
  }
  
+namespace {
+
+constexpr size_t kFillObjectSize = 300;
+
+std::atomic<uint64_t> gDestroyed;
+
+/**
+ * Fill a chunk of memory with a unique-ish pattern that includes the thread id
+ * (so deleting one of these from another thread would cause a failure)
+ *
+ * Verify it explicitly and on destruction.
+ */
+class FillObject {
+ public:
+  explicit FillObject(uint64_t idx) : idx_(idx) {
+    uint64_t v = val();
+    for (size_t i = 0; i < kFillObjectSize; ++i) {
+      data_[i] = v;
+    }
+  }
+
+  void check() {
+    uint64_t v = val();
+    for (size_t i = 0; i < kFillObjectSize; ++i) {
+      CHECK_EQ(v, data_[i]);
+    }
+  }
+
+  ~FillObject() {
+    ++gDestroyed;
+  }
+
+ private:
+  uint64_t val() const {
+    return (idx_ << 40) | uint64_t(pthread_self());
+  }
+
+  uint64_t idx_;
+  uint64_t data_[kFillObjectSize];
+};
+
+}  // namespace
+
+TEST(ThreadLocal, Stress) {
+  constexpr size_t numFillObjects = 250;
+  std::array<ThreadLocalPtr<FillObject>, numFillObjects> objects;
+
+  constexpr size_t numThreads = 32;
+  constexpr size_t numReps = 20;
+
+  std::vector<std::thread> threads;
+  threads.reserve(numThreads);
+
+  for (size_t i = 0; i < numThreads; ++i) {
+    threads.emplace_back([&objects] {
+      for (size_t rep = 0; rep < numReps; ++rep) {
+        for (size_t i = 0; i < objects.size(); ++i) {
+          objects[i].reset(new FillObject(rep * objects.size() + i));
+          std::this_thread::sleep_for(std::chrono::microseconds(100));
+        }
+        for (size_t i = 0; i < objects.size(); ++i) {
+          objects[i]->check();
+        }
+      }
+    });
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  EXPECT_EQ(numFillObjects * numThreads * numReps, gDestroyed);
+}
+
  // Yes, threads and fork don't mix
  // (http://cppwisdom.quora.com/Why-threads-and-fork-dont-mix) but if you're
  // stupid or desperate enough to try, we shouldn't stand in your way.
author	Tudor Bosman <tudorb@fb.com>
	Fri, 27 Sep 2013 00:19:24 +0000 (17:19 -0700)
committer	Peter Griess <pgriess@fb.com>
	Tue, 15 Oct 2013 01:44:12 +0000 (18:44 -0700)
folly/ThreadCachedInt.h		patch \| blob \| history
folly/ThreadLocal.h		patch \| blob \| history
folly/detail/ThreadLocalDetail.h		patch \| blob \| history
folly/test/AtomicHashArrayTest.cpp		patch \| blob \| history
folly/test/ThreadCachedArenaTest.cpp		patch \| blob \| history
folly/test/ThreadLocalTest.cpp		patch \| blob \| history