From 10001685609b018c9d3864f924616450c59942fe Mon Sep 17 00:00:00 2001 From: Christopher Dykes Date: Thu, 31 Mar 2016 10:12:30 -0700 Subject: [PATCH] Support SSE 4.2 qfind under MSVC Summary: MSVC has support in the compiler for the intrinsics required, but it both refuses to tell us that and gives them proper names. The code already checks for runtime support; this just enables compiling the SSE 4.2 version in the first place. Reviewed By: yfeldblum Differential Revision: D3104296 fb-gh-sync-id: 9143240bede9b756817691fdd86818001267dac1 fbshipit-source-id: 9143240bede9b756817691fdd86818001267dac1 --- folly/Portability.h | 7 +++-- folly/detail/RangeSse42.cpp | 51 +++++++++++++------------------------ 2 files changed, 21 insertions(+), 37 deletions(-) diff --git a/folly/Portability.h b/folly/Portability.h index fb4a6996..5bca1f54 100644 --- a/folly/Portability.h +++ b/folly/Portability.h @@ -285,10 +285,9 @@ namespace std { typedef ::max_align_t max_align_t; } // Hide a GCC specific thing that breaks MSVC if left alone. # define __extension__ -#ifdef _M_IX86_FP -# define FOLLY_SSE _M_IX86_FP -# define FOLLY_SSE_MINOR 0 -#endif +// We have compiler support for the newest of the new, but +// MSVC doesn't tell us that. +#define __SSE4_2__ 1 #endif diff --git a/folly/detail/RangeSse42.cpp b/folly/detail/RangeSse42.cpp index a7aeb8d4..b963e5e7 100644 --- a/folly/detail/RangeSse42.cpp +++ b/folly/detail/RangeSse42.cpp @@ -14,49 +14,35 @@ * limitations under the License. */ - - #include "RangeSse42.h" #include #include - - // Essentially, two versions of this file: one with an SSE42 implementation // and one with a fallback implementation. We determine which version to use by // testing for the presence of the required headers. // // TODO: Maybe this should be done by the build system.... 
#if !FOLLY_SSE_PREREQ(4, 2) - - - namespace folly { - namespace detail { - size_t qfind_first_byte_of_sse42(const StringPieceLite haystack, const StringPieceLite needles) { CHECK(false) << "Function " << __func__ << " only works with SSE42!"; return qfind_first_byte_of_nosse(haystack, needles); } - } - } - - - # else - - - #include #include #include + #include +#include #include + #include // GCC 4.9 with ASAN has a problem: a function with no_sanitize_address calling @@ -68,10 +54,14 @@ size_t qfind_first_byte_of_sse42(const StringPieceLite haystack, __GNUC_PREREQ(4, 9) # define _mm_load_si128(p) (*(p)) # define _mm_loadu_si128(p) ((__m128i)__builtin_ia32_loaddqu((const char*)(p))) +# ifdef _mm_cmpestri +# undef _mm_cmpestri +# endif +# define _mm_cmpestri(a, b, c, d, e) \ + __builtin_ia32_pcmpestri128((__v16qi)(a), b, (__v16qi)(c), d, e) #endif namespace folly { - namespace detail { // It's okay if pages are bigger than this (as powers of two), but they should @@ -116,8 +106,8 @@ size_t qfind_first_byte_of_needles16(const StringPieceLite haystack, // do an unaligned load for first block of haystack auto arr1 = _mm_loadu_si128( reinterpret_cast(haystack.data())); - auto index = __builtin_ia32_pcmpestri128((__v16qi)arr2, needles.size(), - (__v16qi)arr1, haystack.size(), 0); + auto index = _mm_cmpestri(arr2, needles.size(), + arr1, haystack.size(), 0); if (index < 16) { return index; } @@ -127,9 +117,9 @@ size_t qfind_first_byte_of_needles16(const StringPieceLite haystack, for (; i < haystack.size(); i+= 16) { auto arr1 = _mm_load_si128( reinterpret_cast(haystack.data() + i)); - auto index = __builtin_ia32_pcmpestri128( - (__v16qi)arr2, needles.size(), - (__v16qi)arr1, haystack.size() - i, 0); + auto index = _mm_cmpestri( + arr2, needles.size(), + arr1, haystack.size() - i, 0); if (index < 16) { return i + index; } @@ -172,17 +162,17 @@ size_t scanHaystackBlock(const StringPieceLite haystack, // This load is safe because needles.size() >= 16 auto arr2 = 
_mm_loadu_si128( reinterpret_cast(needles.data())); - size_t b = __builtin_ia32_pcmpestri128( - (__v16qi)arr2, 16, (__v16qi)arr1, haystack.size() - blockStartIdx, 0); + size_t b = _mm_cmpestri( + arr2, 16, arr1, haystack.size() - blockStartIdx, 0); size_t j = nextAlignedIndex(needles.data()); for (; j < needles.size(); j += 16) { arr2 = _mm_load_si128( reinterpret_cast(needles.data() + j)); - auto index = __builtin_ia32_pcmpestri128( - (__v16qi)arr2, needles.size() - j, - (__v16qi)arr1, haystack.size() - blockStartIdx, 0); + auto index = _mm_cmpestri( + arr2, needles.size() - j, + arr1, haystack.size() - blockStartIdx, 0); b = std::min(index, b); } @@ -229,11 +219,6 @@ size_t qfind_first_byte_of_sse42(const StringPieceLite haystack, return std::string::npos; } - } - } - - - #endif -- 2.34.1