From 8091d7199edecafa678d009b82f04c36dd8ce9a7 Mon Sep 17 00:00:00 2001
From: Yedidya Feldblum
Date: Wed, 20 Dec 2017 11:41:29 -0800
Subject: [PATCH] Kill FOLLY_ALIGNED etc

Summary:
[Folly] Kill `FOLLY_ALIGNED` etc.

`alignas` is standardized as of C++11. Let us just use that.

Replace:
* `FOLLY_ALIGNED` with `alignas`
* `FOLLY_ALIGNED_MAX` with `alignas(folly::max_align_v)`
* `FOLLY_ALIGN_TO_AVOID_FALSE_SHARING` with `alignas(folly::hardware_destructive_interference_size)`

Because where `alignas` may be placed is more restrictive than where attributes may be placed, we also need to move these directives in some cases on top of doing the replacement.

Reviewed By: Orvid

Differential Revision: D6555167

fbshipit-source-id: 4b05b570bace3f8c0fe810b6dd58781dd45757f4
---
 folly/Conv.cpp | 8 ++++----
 folly/Conv.h | 2 +-
 folly/IndexedMemPool.h | 7 ++++---
 folly/MPMCQueue.h | 11 ++++++-----
 folly/Portability.h | 8 --------
 folly/ProducerConsumerQueue.h | 6 ++++--
 folly/SharedMutex.h | 15 +++++++--------
 folly/TokenBucket.h | 2 +-
 folly/concurrency/CacheLocality.h | 6 ------
 folly/concurrency/UnboundedQueue.h | 3 +--
 .../detail/ConcurrentHashMap-detail.h | 2 +-
 folly/executors/IOThreadPoolExecutor.h | 3 ++-
 folly/executors/ThreadPoolExecutor.h | 3 ++-
 .../experimental/flat_combining/FlatCombining.h | 17 +++++++----------
 .../flat_combining/test/FlatCombiningExamples.h | 3 +--
 folly/experimental/hazptr/hazptr-impl.h | 3 +--
 16 files changed, 42 insertions(+), 57 deletions(-)

diff --git a/folly/Conv.cpp b/folly/Conv.cpp
index 8e2f5752..b15d44d9 100644
--- a/folly/Conv.cpp
+++ b/folly/Conv.cpp
@@ -86,7 +86,7 @@ template <> const char *const MaxString<__uint128_t>::value =
 // still not overflow uint16_t.
 constexpr int32_t OOR = 10000;
 
-FOLLY_ALIGNED(16) constexpr uint16_t shift1[] = {
+alignas(16) constexpr uint16_t shift1[] = {
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 0-9
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 10
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 20
@@ -115,7 +115,7 @@ FOLLY_ALIGNED(16) constexpr uint16_t shift1[] = {
   OOR, OOR, OOR, OOR, OOR, OOR  // 250
 };
 
-FOLLY_ALIGNED(16) constexpr uint16_t shift10[] = {
+alignas(16) constexpr uint16_t shift10[] = {
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 0-9
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 10
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 20
@@ -144,7 +144,7 @@ FOLLY_ALIGNED(16) constexpr uint16_t shift10[] = {
   OOR, OOR, OOR, OOR, OOR, OOR  // 250
 };
 
-FOLLY_ALIGNED(16) constexpr uint16_t shift100[] = {
+alignas(16) constexpr uint16_t shift100[] = {
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 0-9
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 10
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 20
@@ -173,7 +173,7 @@ FOLLY_ALIGNED(16) constexpr uint16_t shift100[] = {
   OOR, OOR, OOR, OOR, OOR, OOR  // 250
 };
 
-FOLLY_ALIGNED(16) constexpr uint16_t shift1000[] = {
+alignas(16) constexpr uint16_t shift1000[] = {
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 0-9
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 10
   OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR, OOR,  // 20
diff --git a/folly/Conv.h b/folly/Conv.h
index a545816d..b37e1aa0 100644
--- a/folly/Conv.h
+++ b/folly/Conv.h
@@ -303,7 +303,7 @@ inline uint32_t digits10(uint64_t v) {
   // 10^i, defined for i 0 through 19.
   // This is 20 * 8 == 160 bytes, which fits neatly into 5 cache lines
   // (assuming a cache line size of 64).
-  static const uint64_t powersOf10[20] FOLLY_ALIGNED(64) = {
+  alignas(64) static const uint64_t powersOf10[20] = {
       1,
       10,
       100,
diff --git a/folly/IndexedMemPool.h b/folly/IndexedMemPool.h
index 82ae6295..cab5cac1 100644
--- a/folly/IndexedMemPool.h
+++ b/folly/IndexedMemPool.h
@@ -351,7 +351,7 @@ struct IndexedMemPool : boost::noncopyable {
     }
   };
 
-  struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING LocalList {
+  struct alignas(hardware_destructive_interference_size) LocalList {
     AtomicStruct<TaggedPtr, Atom> head;
 
     LocalList() : head(TaggedPtr{}) {}
@@ -377,7 +377,7 @@ struct IndexedMemPool : boost::noncopyable {
 
   /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
   /// actually constructed. Note that slots_[0] is not constructed or used
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING Slot* slots_;
+  alignas(hardware_destructive_interference_size) Slot* slots_;
 
   /// use AccessSpreader to find your list. We use stripes instead of
   /// thread-local to avoid the need to grow or shrink on thread start
@@ -386,7 +386,8 @@ struct IndexedMemPool : boost::noncopyable {
 
   /// this is the head of a list of node chained by globalNext, that are
   /// themselves each the head of a list chained by localNext
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING AtomicStruct<TaggedPtr, Atom> globalHead_;
+  alignas(hardware_destructive_interference_size)
+      AtomicStruct<TaggedPtr, Atom> globalHead_;
 
   ///////////// private methods
 
diff --git a/folly/MPMCQueue.h b/folly/MPMCQueue.h
index 0db70fa0..932ae8fc 100644
--- a/folly/MPMCQueue.h
+++ b/folly/MPMCQueue.h
@@ -981,7 +981,7 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
   };
 
   /// The maximum number of items in the queue at once
-  size_t FOLLY_ALIGN_TO_AVOID_FALSE_SHARING capacity_;
+  alignas(hardware_destructive_interference_size) size_t capacity_;
 
   /// Anonymous union for use when Dynamic = false and true, respectively
   union {
@@ -1014,18 +1014,19 @@ class MPMCQueueBase<Derived<T, Atom, Dynamic>> : boost::noncopyable {
   Atom<size_t> dcapacity_;
 
   /// Enqueuers get tickets from here
-  Atom<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushTicket_;
+  alignas(hardware_destructive_interference_size) Atom<uint64_t> pushTicket_;
 
   /// Dequeuers get tickets from here
-  Atom<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING popTicket_;
+  alignas(hardware_destructive_interference_size) Atom<uint64_t> popTicket_;
 
   /// This is how many times we will spin before using FUTEX_WAIT when
   /// the queue is full on enqueue, adaptively computed by occasionally
   /// spinning for longer and smoothing with an exponential moving average
-  Atom<uint32_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushSpinCutoff_;
+  alignas(
+      hardware_destructive_interference_size) Atom<uint32_t> pushSpinCutoff_;
 
   /// The adaptive spin cutoff when the queue is empty on dequeue
-  Atom<uint32_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING popSpinCutoff_;
+  alignas(hardware_destructive_interference_size) Atom<uint32_t> popSpinCutoff_;
 
   /// Alignment doesn't prevent false sharing at the end of the struct,
   /// so fill out the last cache line
diff --git a/folly/Portability.h b/folly/Portability.h
index 11eb8f82..b41893b5 100644
--- a/folly/Portability.h
+++ b/folly/Portability.h
@@ -34,14 +34,6 @@ constexpr bool kHasUnalignedAccess = false;
 // compiler specific attribute translation
 // msvc should come first, so if clang is in msvc mode it gets the right defines
 
-#if defined(__clang__) || defined(__GNUC__)
-# define FOLLY_ALIGNED(size) __attribute__((__aligned__(size)))
-#elif defined(_MSC_VER)
-# define FOLLY_ALIGNED(size) __declspec(align(size))
-#else
-# error Cannot define FOLLY_ALIGNED on this platform
-#endif
-
 // NOTE: this will only do checking in msvc with versions that support /analyze
 #if _MSC_VER
 # ifdef _USE_ATTRIBUTES_FOR_SAL
diff --git a/folly/ProducerConsumerQueue.h b/folly/ProducerConsumerQueue.h
index 2a8f04c6..b020da84 100644
--- a/folly/ProducerConsumerQueue.h
+++ b/folly/ProducerConsumerQueue.h
@@ -177,8 +177,10 @@ struct ProducerConsumerQueue {
   const uint32_t size_;
   T* const records_;
 
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> readIndex_;
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned int> writeIndex_;
+  alignas(hardware_destructive_interference_size)
+      std::atomic<unsigned int> readIndex_;
+  alignas(hardware_destructive_interference_size)
+      std::atomic<unsigned int> writeIndex_;
 
   char pad1_[hardware_destructive_interference_size - sizeof(writeIndex_)];
 };
diff --git a/folly/SharedMutex.h b/folly/SharedMutex.h
index b6a1a263..ce160f80 100644
--- a/folly/SharedMutex.h
+++ b/folly/SharedMutex.h
@@ -738,9 +738,8 @@ class SharedMutexImpl {
   typedef Atom<uintptr_t> DeferredReaderSlot;
 
  private:
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING static DeferredReaderSlot deferredReaders
-      [kMaxDeferredReaders *
-       kDeferredSeparationFactor];
+  alignas(hardware_destructive_interference_size) static DeferredReaderSlot
+      deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor];
 
   // Performs an exclusive lock, waiting for state_ & waitMask to be
   // zero first
@@ -1350,11 +1349,11 @@ template <
     typename Tag_,
     template <typename> class Atom,
     bool BlockImmediately>
-typename SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
-    DeferredReaderSlot
-    SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
-        deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor] =
-        {};
+alignas(hardware_destructive_interference_size)
+    typename SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
+        DeferredReaderSlot
+        SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
+            deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor] = {};
 
 template <
     bool ReaderPriority,
diff --git a/folly/TokenBucket.h b/folly/TokenBucket.h
index 2abbfe37..7c18ea2c 100644
--- a/folly/TokenBucket.h
+++ b/folly/TokenBucket.h
@@ -221,7 +221,7 @@ class ParameterizedDynamicTokenBucket {
     return true;
   }
 
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<double> zeroTime_;
+  alignas(hardware_destructive_interference_size) std::atomic<double> zeroTime_;
 };
 
 /**
diff --git a/folly/concurrency/CacheLocality.h b/folly/concurrency/CacheLocality.h
index 420f5334..cd6c06e2 100644
--- a/folly/concurrency/CacheLocality.h
+++ b/folly/concurrency/CacheLocality.h
@@ -118,12 +118,6 @@ struct CacheLocality {
   static CacheLocality uniform(size_t numCpus);
 };
 
-// TODO replace with alignas(hardware_destructive_interference_size)
-
-/// An attribute that will cause a variable or field to be aligned so that
-/// it doesn't have false sharing with anything at a smaller memory address.
-#define FOLLY_ALIGN_TO_AVOID_FALSE_SHARING FOLLY_ALIGNED(128)
-
 /// Knows how to derive a function pointer to the VDSO implementation of
 /// getcpu(2), if available
 struct Getcpu {
diff --git a/folly/concurrency/UnboundedQueue.h b/folly/concurrency/UnboundedQueue.h
index 221e73ce..1aa2f416 100644
--- a/folly/concurrency/UnboundedQueue.h
+++ b/folly/concurrency/UnboundedQueue.h
@@ -648,8 +648,7 @@ class UnboundedQueue {
     Atom<Segment*> next_;
     const Ticket min_;
     bool marked_; // used for iterative deletion
-    FOLLY_ALIGNED(Align)
-    Entry b_[SegmentSize];
+    alignas(Align) Entry b_[SegmentSize];
 
    public:
     explicit Segment(const Ticket t)
diff --git a/folly/concurrency/detail/ConcurrentHashMap-detail.h b/folly/concurrency/detail/ConcurrentHashMap-detail.h
index 51c1375b..99b1ee49 100644
--- a/folly/concurrency/detail/ConcurrentHashMap-detail.h
+++ b/folly/concurrency/detail/ConcurrentHashMap-detail.h
@@ -197,7 +197,7 @@ template <
     typename Allocator = std::allocator<uint8_t>,
     template <typename> class Atom = std::atomic,
     class Mutex = std::mutex>
-class FOLLY_ALIGNED(64) ConcurrentHashMapSegment {
+class alignas(64) ConcurrentHashMapSegment {
   enum class InsertType {
     DOES_NOT_EXIST, // insert/emplace operations. If key exists, return false.
     MUST_EXIST, // assign operations. If key does not exist, return false.
diff --git a/folly/executors/IOThreadPoolExecutor.h b/folly/executors/IOThreadPoolExecutor.h
index b7c6688d..1d270cf5 100644
--- a/folly/executors/IOThreadPoolExecutor.h
+++ b/folly/executors/IOThreadPoolExecutor.h
@@ -73,7 +73,8 @@ class IOThreadPoolExecutor : public ThreadPoolExecutor, public IOExecutor {
   folly::EventBaseManager* getEventBaseManager();
 
  private:
-  struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING IOThread : public Thread {
+  struct alignas(hardware_destructive_interference_size) IOThread
+      : public Thread {
     IOThread(IOThreadPoolExecutor* pool)
         : Thread(pool), shouldRun(true), pendingTasks(0) {}
     std::atomic<bool> shouldRun;
diff --git a/folly/executors/ThreadPoolExecutor.h b/folly/executors/ThreadPoolExecutor.h
index 0e5bcc17..38154a86 100644
--- a/folly/executors/ThreadPoolExecutor.h
+++ b/folly/executors/ThreadPoolExecutor.h
@@ -129,7 +129,8 @@ class ThreadPoolExecutor : public virtual folly::Executor {
 
   struct TaskStatsCallbackRegistry;
 
-  struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING Thread : public ThreadHandle {
+  struct alignas(hardware_destructive_interference_size) Thread
+      : public ThreadHandle {
     explicit Thread(ThreadPoolExecutor* pool)
         : id(nextId++),
           handle(),
diff --git a/folly/experimental/flat_combining/FlatCombining.h b/folly/experimental/flat_combining/FlatCombining.h
index bf19e235..bf6cc4e3 100644
--- a/folly/experimental/flat_combining/FlatCombining.h
+++ b/folly/experimental/flat_combining/FlatCombining.h
@@ -112,8 +112,8 @@ class FlatCombining {
  public:
   /// Combining request record.
   class Rec {
-    FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
-    folly::SaturatingSemaphore<false, Atom> valid_;
+    alignas(hardware_destructive_interference_size)
+        folly::SaturatingSemaphore<false, Atom> valid_;
     folly::SaturatingSemaphore<false, Atom> done_;
     folly::SaturatingSemaphore<false, Atom> disconnected_;
     size_t index_;
@@ -421,23 +421,20 @@ class FlatCombining {
   const uint64_t kDefaultNumRecs = 64;
   const uint64_t kIdleThreshold = 10;
 
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
-  Mutex m_;
+  alignas(hardware_destructive_interference_size) Mutex m_;
 
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
-  folly::SaturatingSemaphore<true, Atom> pending_;
+  alignas(hardware_destructive_interference_size)
+      folly::SaturatingSemaphore<true, Atom> pending_;
   Atom<bool> shutdown_{false};
 
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
-  uint32_t numRecs_;
+  alignas(hardware_destructive_interference_size) uint32_t numRecs_;
   uint32_t maxOps_;
   Atom<size_t> recs_;
   bool dedicated_;
   std::thread combiner_;
   Pool recsPool_;
 
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
-  uint64_t uncombined_ = 0;
+  alignas(hardware_destructive_interference_size) uint64_t uncombined_ = 0;
   uint64_t combined_ = 0;
   uint64_t passes_ = 0;
   uint64_t sessions_ = 0;
diff --git a/folly/experimental/flat_combining/test/FlatCombiningExamples.h b/folly/experimental/flat_combining/test/FlatCombiningExamples.h
index fe6ab184..9e4b1f52 100644
--- a/folly/experimental/flat_combining/test/FlatCombiningExamples.h
+++ b/folly/experimental/flat_combining/test/FlatCombiningExamples.h
@@ -25,8 +25,7 @@
 
 namespace folly {
 
-struct Line {
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
+struct alignas(hardware_destructive_interference_size) Line {
   uint64_t val_;
 };
 
diff --git a/folly/experimental/hazptr/hazptr-impl.h b/folly/experimental/hazptr/hazptr-impl.h
index 07564a43..07e6891d 100644
--- a/folly/experimental/hazptr/hazptr-impl.h
+++ b/folly/experimental/hazptr/hazptr-impl.h
@@ -265,12 +265,11 @@ inline bool hazptr_obj_base_refcounted<T, D>::release_ref() {
  *  hazptr_rec
  */
 
-class hazptr_rec {
+class alignas(hardware_destructive_interference_size) hazptr_rec {
   friend class hazptr_domain;
   friend class hazptr_holder;
   friend struct hazptr_tc_entry;
 
-  FOLLY_ALIGN_TO_AVOID_FALSE_SHARING
   std::atomic<const void*> hazptr_{nullptr};
   hazptr_rec* next_{nullptr};
   std::atomic<bool> active_{false};
-- 
2.34.1
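
For context, the sketch below (not part of the patch) shows the shape of the replacement at a typical use site: the old attribute-style macro could trail the type, whereas `alignas` is more constrained in where it may appear, so the patch moves the specifier to the front of each declaration. The struct, its member names, and the local `hardware_destructive_interference_size` constant are made up for illustration; folly supplies the real constant as `folly::hardware_destructive_interference_size`, and the removed macro hard-coded the same idea via `FOLLY_ALIGNED(128)`.

#include <atomic>
#include <cstddef>
#include <cstdint>

// Stand-in for folly::hardware_destructive_interference_size (illustrative
// value only; the removed FOLLY_ALIGN_TO_AVOID_FALSE_SHARING used 128).
constexpr std::size_t hardware_destructive_interference_size = 128;

// Before (attribute-style macro, allowed to trail the type):
//   std::atomic<uint64_t> FOLLY_ALIGN_TO_AVOID_FALSE_SHARING pushCount_;
// After (alignas leads the member declaration, as in the diffs above):
struct Counters {
  alignas(hardware_destructive_interference_size) std::atomic<uint64_t> pushCount_{0};
  alignas(hardware_destructive_interference_size) std::atomic<uint64_t> popCount_{0};
};

// Each member now starts on its own 128-byte boundary, so the two counters
// cannot land on the same cache line and the alignment propagates to the
// enclosing type.
static_assert(alignof(Counters) == hardware_destructive_interference_size,
              "alignas on members raises the struct's alignment");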