From: Ananth Jasty Date: Fri, 19 Jun 2015 02:16:22 +0000 (-0700) Subject: ARM64 assembler fixes for Folly. X-Git-Tag: v0.47.0~2 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=6457c4429379011d12ee841e8325dddeebec538e;p=folly.git ARM64 assembler fixes for Folly. Summary: Wrap asm("pause") in an inline so that it becomes asm("wfe") on aarch64. Closes #187 Closes #190 Reviewed By: @yfeldblum Differential Revision: D2152868 Pulled By: @sgolemon --- diff --git a/folly/Baton.h b/folly/Baton.h index 97c0d1bf..7922ffd0 100644 --- a/folly/Baton.h +++ b/folly/Baton.h @@ -273,13 +273,11 @@ struct Baton : boost::noncopyable { // hooray! return true; } -#if FOLLY_X64 // The pause instruction is the polite way to spin, but it doesn't // actually affect correctness to omit it if we don't have it. // Pausing donates the full capabilities of the current core to // its other hyperthreads for a dozen cycles or so - asm volatile ("pause"); -#endif + asm_volatile_pause(); } return false; diff --git a/folly/Portability.h b/folly/Portability.h index d42f0031..fbd8e3ab 100644 --- a/folly/Portability.h +++ b/folly/Portability.h @@ -120,6 +120,12 @@ struct MaxAlign { char c; } __attribute__((__aligned__)); # define FOLLY_X64 0 #endif +#if defined(__aarch64__) +# define FOLLY_A64 1 +#else +# define FOLLY_A64 0 +#endif + // packing is very ugly in msvc #ifdef _MSC_VER # define FOLLY_PACK_ATTR /**/ @@ -278,4 +284,23 @@ inline size_t malloc_usable_size(void* ptr) { # define FOLLY_HAS_RTTI 1 #endif +namespace folly { + +inline void asm_volatile_pause() { +#if defined(__i386__) || FOLLY_X64 + asm volatile ("pause"); +#elif FOLLY_A64 + asm volatile ("wfe"); +#endif +} +inline void asm_pause() { +#if defined(__i386__) || FOLLY_X64 + asm ("pause"); +#elif FOLLY_A64 + asm ("wfe"); +#endif +} + +} + #endif // FOLLY_PORTABILITY_H_ diff --git a/folly/RWSpinLock.h b/folly/RWSpinLock.h index 392b8a04..8a7a8410 100644 --- a/folly/RWSpinLock.h +++ b/folly/RWSpinLock.h @@ -587,7 +587,7 @@ 
class RWTicketSpinLockT : boost::noncopyable { int count = 0; QuarterInt val = __sync_fetch_and_add(&ticket.users, 1); while (val != load_acquire(&ticket.write)) { - asm volatile("pause"); + asm_volatile_pause(); if (UNLIKELY(++count > 1000)) sched_yield(); } } @@ -636,7 +636,7 @@ class RWTicketSpinLockT : boost::noncopyable { // need to let threads that already have a shared lock complete int count = 0; while (!LIKELY(try_lock_shared())) { - asm volatile("pause"); + asm_volatile_pause(); if (UNLIKELY((++count & 1023) == 0)) sched_yield(); } } diff --git a/folly/SharedMutex.h b/folly/SharedMutex.h index 8bfd3262..69748161 100644 --- a/folly/SharedMutex.h +++ b/folly/SharedMutex.h @@ -796,9 +796,7 @@ class SharedMutexImpl { if ((state & goal) == 0) { return true; } -#if FOLLY_X64 - asm volatile("pause"); -#endif + asm_volatile_pause(); ++spinCount; if (UNLIKELY(spinCount >= kMaxSpinCount)) { return ctx.canBlock() && @@ -956,9 +954,7 @@ class SharedMutexImpl { return; } } -#if FOLLY_X64 - asm("pause"); -#endif + asm_pause(); if (UNLIKELY(++spinCount >= kMaxSpinCount)) { applyDeferredReaders(state, ctx, slot); return; diff --git a/folly/SmallLocks.h b/folly/SmallLocks.h index 6e7d68d1..0624e8a4 100644 --- a/folly/SmallLocks.h +++ b/folly/SmallLocks.h @@ -47,8 +47,8 @@ #include #include -#if !FOLLY_X64 -# error "SmallLocks.h is currently x64-only." +#if !FOLLY_X64 && !FOLLY_A64 +# error "SmallLocks.h is currently x64 and aarch64 only." 
#endif namespace folly { @@ -72,7 +72,7 @@ namespace detail { void wait() { if (spinCount < kMaxActiveSpin) { ++spinCount; - asm volatile("pause"); + asm_volatile_pause(); } else { /* * Always sleep 0.5ms, assuming this will make the kernel put @@ -217,6 +217,7 @@ struct PicoSpinLock { bool try_lock() const { bool ret = false; +#if FOLLY_X64 #define FB_DOBTS(size) \ asm volatile("lock; bts" #size " %1, (%2); setnc %0" \ : "=r" (ret) \ @@ -231,6 +232,11 @@ } #undef FB_DOBTS +#elif FOLLY_A64 + ret = !(__atomic_fetch_or(&lock_, 1 << Bit, __ATOMIC_SEQ_CST) & (1 << Bit)); +#else +#error "x64 aarch64 only" +#endif return ret; } @@ -250,6 +256,7 @@ * integer. */ void unlock() const { +#if FOLLY_X64 #define FB_DOBTR(size) \ asm volatile("lock; btr" #size " %0, (%1)" \ : \ @@ -267,6 +274,11 @@ } #undef FB_DOBTR +#elif FOLLY_A64 + __atomic_fetch_and(&lock_, ~(1 << Bit), __ATOMIC_SEQ_CST); +#else +# error "x64 aarch64 only" +#endif } }; diff --git a/folly/detail/TurnSequencer.h b/folly/detail/TurnSequencer.h index 9fdb5b1d..c3dd4a43 100644 --- a/folly/detail/TurnSequencer.h +++ b/folly/detail/TurnSequencer.h @@ -124,9 +124,7 @@ struct TurnSequencer { // the first effectSpinCutoff tries are spins, after that we will // record ourself as a waiter and block with futexWait if (tries < effectiveSpinCutoff) { -#if defined(__i386__) || FOLLY_X64 - asm volatile ("pause"); -#endif + asm_volatile_pause(); continue; } diff --git a/folly/experimental/fibers/Baton.cpp b/folly/experimental/fibers/Baton.cpp index a33f8560..263dc7d2 100644 --- a/folly/experimental/fibers/Baton.cpp +++ b/folly/experimental/fibers/Baton.cpp @@ -65,13 +65,11 @@ bool Baton::spinWaitForEarlyPost() { // hooray! return true; } -#if FOLLY_X64 // The pause instruction is the polite way to spin, but it doesn't // actually affect correctness to omit it if we don't have it.
// Pausing donates the full capabilities of the current core to // its other hyperthreads for a dozen cycles or so - asm volatile ("pause"); -#endif + asm_volatile_pause(); } return false; diff --git a/folly/test/SmallLocksTest.cpp b/folly/test/SmallLocksTest.cpp index f556886e..059e41c0 100644 --- a/folly/test/SmallLocksTest.cpp +++ b/folly/test/SmallLocksTest.cpp @@ -60,7 +60,7 @@ void splock_test() { const int max = 1000; unsigned int seed = (uintptr_t)pthread_self(); for (int i = 0; i < max; i++) { - asm("pause"); + folly::asm_pause(); MSLGuard g(v.lock); int first = v.ar[0]; @@ -84,7 +84,7 @@ template struct PslTest { std::lock_guard> guard(lock); lock.setData(ourVal); for (int n = 0; n < 10; ++n) { - asm volatile("pause"); + folly::asm_volatile_pause(); EXPECT_EQ(lock.getData(), ourVal); } } diff --git a/folly/test/SpinLockTest.cpp b/folly/test/SpinLockTest.cpp index b5a2b715..3be763d6 100644 --- a/folly/test/SpinLockTest.cpp +++ b/folly/test/SpinLockTest.cpp @@ -37,7 +37,7 @@ void spinlockTestThread(LockedVal* v) { const int max = 1000; unsigned int seed = (uintptr_t)pthread_self(); for (int i = 0; i < max; i++) { - asm("pause"); + folly::asm_pause(); SpinLockGuardImpl g(v->lock); int first = v->ar[0]; @@ -62,7 +62,7 @@ struct TryLockState { template void trylockTestThread(TryLockState* state, size_t count) { while (true) { - asm("pause"); + folly::asm_pause(); SpinLockGuardImpl g(state->lock1); if (state->obtained >= count) { break; @@ -81,7 +81,7 @@ void trylockTestThread(TryLockState* state, size_t count) { auto oldFailed = state->failed; while (state->failed == oldFailed && state->obtained < count) { state->lock1.unlock(); - asm("pause"); + folly::asm_pause(); state->lock1.lock(); }