Fix data race in folly/executors/Codel.cpp exposed by TSAN
authorKenny Yu <kennyyu@fb.com>
Thu, 7 Sep 2017 17:46:48 +0000 (10:46 -0700)
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>
Thu, 7 Sep 2017 17:58:30 +0000 (10:58 -0700)
Summary:
I found a data race with TSAN while attempting to run a sanitizer version of my service:

```
WARNING: ThreadSanitizer: data race (pid=266)
  Read of size 8 at 0x7b58000c0c08 by thread T370:
    @ folly::Codel::overloaded(std::chrono::duration<long, std::ratio<1l, 1000000000l> >) at ./folly/executors/Codel.cpp:44
    @ apache::thrift::concurrency::ThreadManager::ImplT<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >::Worker<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >::run() at ./thrift/lib/cpp/concurrency/ThreadManager.tcc:119
    @ apache::thrift::concurrency::PthreadThread::threadMain(void*) at ./thrift/lib/cpp/concurrency/PosixThreadFactory.cpp:200
    @ __tsan_thread_start_func at crtstuff.c:?

  Previous write of size 8 at 0x7b58000c0c08 by thread T371:
    @ folly::Codel::overloaded(std::chrono::duration<long, std::ratio<1l, 1000000000l> >) at ./folly/executors/Codel.cpp:62
    @ apache::thrift::concurrency::ThreadManager::ImplT<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >::Worker<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >::run() at ./thrift/lib/cpp/concurrency/ThreadManager.tcc:119
    @ apache::thrift::concurrency::PthreadThread::threadMain(void*) at ./thrift/lib/cpp/concurrency/PosixThreadFactory.cpp:200
    @ __tsan_thread_start_func at crtstuff.c:?

  Location is heap block of size 744 at 0x7b58000c0c00 allocated by thread T314:
    @ operator new(unsigned long) at ??:?
    @ PriorityImplT at ./thrift/lib/cpp/concurrency/ThreadManager.tcc:826
    @ void __gnu_cxx::new_allocator<apache::thrift::concurrency::PriorityThreadManager::PriorityImplT<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > > >::construct<apache::thrift::concurrency::PriorityThreadManager::PriorityImplT<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >, std::array<std::pair<std::shared_ptr<apache::thrift::concurrency::ThreadFactory>, unsigned long>, 5ul>&, bool&, unsigned long&>(apache::thrift::concurrency::PriorityThreadManager::PriorityImplT<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >*, std::array<std::pair<std::shared_ptr<apache::thrift::concurrency::ThreadFactory>, unsigned long>, 5ul>&, bool&, unsigned long&)
    @ std::shared_ptr<apache::thrift::concurrency::PriorityThreadManager> apache::thrift::concurrency::PriorityThreadManager::newPriorityThreadManager<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >(std::array<unsigned long, 5ul> const&, bool, unsigned long) at ./thrift/lib/cpp/concurrency/ThreadManager.tcc:1090
    @ std::shared_ptr<apache::thrift::concurrency::PriorityThreadManager> apache::thrift::concurrency::PriorityThreadManager::newPriorityThreadManager<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >(unsigned long, bool, unsigned long) at ./thrift/lib/cpp/concurrency/ThreadManager.tcc:1100
    @ apache::thrift::ThriftServer::serve() at ./thrift/lib/cpp2/server/ThriftServer.cpp:475
    @ apache::thrift::server::TServer::run() at ./thrift/lib/cpp/server/TServer.h:186
    @ apache::thrift::concurrency::PthreadThread::threadMain(void*) at ./thrift/lib/cpp/concurrency/PosixThreadFactory.cpp:200
    @ __tsan_thread_start_func at crtstuff.c:?

  Thread T370 (tid=638, running) created by thread T314 at:
    @ pthread_create at ??:?
    @ apache::thrift::concurrency::PthreadThread::start() at ./thrift/lib/cpp/concurrency/PosixThreadFactory.cpp:108
    @ apache::thrift::concurrency::ThreadManager::ImplT<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >::addWorker(unsigned long) at ./thrift/lib/cpp/concurrency/ThreadManager.tcc:185
    @ apache::thrift::concurrency::PriorityThreadManager::PriorityImplT<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >::start() at ./thrift/lib/cpp/concurrency/ThreadManager.tcc:840
    @ apache::thrift::ThriftServer::setup() at ./thrift/lib/cpp2/server/ThriftServer.cpp:347
    @ apache::thrift::ThriftServer::serve() at ./thrift/lib/cpp2/server/ThriftServer.cpp:475
    @ apache::thrift::server::TServer::run() at ./thrift/lib/cpp/server/TServer.h:186
    @ apache::thrift::concurrency::PthreadThread::threadMain(void*) at ./thrift/lib/cpp/concurrency/PosixThreadFactory.cpp:200
    @ __tsan_thread_start_func at crtstuff.c:?

  Thread T371 (tid=639, running) created by thread T314 at:
    @ pthread_create at ??:?
    @ apache::thrift::concurrency::PthreadThread::start() at ./thrift/lib/cpp/concurrency/PosixThreadFactory.cpp:108
    @ apache::thrift::concurrency::ThreadManager::ImplT<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >::addWorker(unsigned long) at ./thrift/lib/cpp/concurrency/ThreadManager.tcc:185
    @ apache::thrift::concurrency::PriorityThreadManager::PriorityImplT<folly::LifoSemImpl<std::atomic, folly::Baton<std::atomic, true, true> > >::start() at ./thrift/lib/cpp/concurrency/ThreadManager.tcc:840
    @ apache::thrift::ThriftServer::setup() at ./thrift/lib/cpp2/server/ThriftServer.cpp:347
    @ apache::thrift::ThriftServer::serve() at ./thrift/lib/cpp2/server/ThriftServer.cpp:475
    @ apache::thrift::server::TServer::run() at ./thrift/lib/cpp/server/TServer.h:186
    @ apache::thrift::concurrency::PthreadThread::threadMain(void*) at ./thrift/lib/cpp/concurrency/PosixThreadFactory.cpp:200
    @ __tsan_thread_start_func at crtstuff.c:?

  Thread T314 (tid=582, running) created by main thread at:
    @ pthread_create at ??:?
    @ apache::thrift::concurrency::PthreadThread::start() at ./thrift/lib/cpp/concurrency/PosixThreadFactory.cpp:108
    ...
```

Looks like there is a data race in how `codelMinDelay_` is used. I couldn't get `std::atomic` to compile with `std::chrono::nanoseconds`,
so I used `std::atomic<uint64_t>` and converted between `uint64_t` and time types appropriately.

Reviewed By: yfeldblum

Differential Revision: D5759588

fbshipit-source-id: 8213f3789808265ddfe5ab122f0f86490d0ea6ea

folly/executors/Codel.cpp
folly/executors/Codel.h

index c84b9be3ca3e70baee6af5f8715c5886b3844499..d5da27e3ee490b3ef95412b1ab5962a6280943d7 100644 (file)
 DEFINE_int32(codel_interval, 100, "Codel default interval time in ms");
 DEFINE_int32(codel_target_delay, 5, "Target codel queueing delay in ms");
 
-using std::chrono::nanoseconds;
-using std::chrono::milliseconds;
+using namespace std::chrono;
 
 namespace folly {
 
 Codel::Codel()
-    : codelMinDelay_(0),
-      codelIntervalTime_(std::chrono::steady_clock::now()),
+    : codelMinDelayNs_(0),
+      codelIntervalTimeNs_(
+          duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
+              .count()),
       codelResetDelay_(true),
       overloaded_(false) {}
 
-bool Codel::overloaded(std::chrono::nanoseconds delay) {
+bool Codel::overloaded(nanoseconds delay) {
   bool ret = false;
-  auto now = std::chrono::steady_clock::now();
+  auto now = steady_clock::now();
 
   // Avoid another thread updating the value at the same time we are using it
   // to calculate the overloaded state
-  auto minDelay = codelMinDelay_;
+  auto minDelay = nanoseconds(codelMinDelayNs_);
 
-  if (now > codelIntervalTime_ &&
+  if (now > steady_clock::time_point(nanoseconds(codelIntervalTimeNs_)) &&
       // testing before exchanging is more cacheline-friendly
       (!codelResetDelay_.load(std::memory_order_acquire) &&
        !codelResetDelay_.exchange(true))) {
-    codelIntervalTime_ = now + getInterval();
+    codelIntervalTimeNs_ =
+        duration_cast<nanoseconds>((now + getInterval()).time_since_epoch())
+            .count();
 
     if (minDelay > getTargetDelay()) {
       overloaded_ = true;
@@ -57,12 +60,12 @@ bool Codel::overloaded(std::chrono::nanoseconds delay) {
   // and that it happens after the interval reset above
   if (codelResetDelay_.load(std::memory_order_acquire) &&
       codelResetDelay_.exchange(false)) {
-    codelMinDelay_ = delay;
+    codelMinDelayNs_ = delay.count();
     // More than one request must come in during an interval before codel
     // starts dropping requests
     return false;
-  } else if (delay < codelMinDelay_) {
-    codelMinDelay_ = delay;
+  } else if (delay < nanoseconds(codelMinDelayNs_)) {
+    codelMinDelayNs_ = delay.count();
   }
 
   // Here is where we apply different logic than codel proper. Instead of
@@ -84,7 +87,7 @@ int Codel::getLoad() {
 }
 
 nanoseconds Codel::getMinDelay() {
-  return codelMinDelay_;
+  return nanoseconds(codelMinDelayNs_);
 }
 
 milliseconds Codel::getInterval() {
index 98718bb8fc497a2ceadae18e125ed98ef1b6e370..7137057403c35163e3722a6cc6c50bddad0aaba4 100644 (file)
@@ -80,8 +80,8 @@ class Codel {
   std::chrono::milliseconds getSloughTimeout();
 
  private:
-  std::chrono::nanoseconds codelMinDelay_;
-  std::chrono::time_point<std::chrono::steady_clock> codelIntervalTime_;
+  std::atomic<uint64_t> codelMinDelayNs_;
+  std::atomic<uint64_t> codelIntervalTimeNs_;
 
   // flag to make overloaded() thread-safe, since we only want
   // to reset the delay once per time period