/*
 * Copyright 2017 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 #include <folly/detail/RangeSse42.h>
19 #include <glog/logging.h>
21 #include <folly/Portability.h>
//  Essentially, there are two versions of this file: one with an SSE4.2
//  implementation and one with a fallback implementation. We determine which
//  version to use by testing for the presence of the required headers.
//
//  TODO: Maybe this should be done by the build system....
28 #if !FOLLY_SSE_PREREQ(4, 2)
31 size_t qfind_first_byte_of_sse42(
32 const StringPieceLite haystack,
33 const StringPieceLite needles) {
34 return qfind_first_byte_of_nosse(haystack, needles);
#include <algorithm>
#include <cstdint>
#include <string>

#include <emmintrin.h>
#include <nmmintrin.h>
#include <smmintrin.h>

#include <folly/Likely.h>
49 // GCC 4.9 with ASAN has a problem: a function with no_sanitize_address calling
50 // a function with always_inline fails to build. The _mm_* functions are marked
52 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
53 #if defined FOLLY_SANITIZE_ADDRESS && FOLLY_SANITIZE_ADDRESS == 1 && \
55 #define _mm_load_si128(p) (*(p))
56 #define _mm_loadu_si128(p) ((__m128i)__builtin_ia32_loaddqu((const char*)(p)))
60 #define _mm_cmpestri(a, b, c, d, e) \
61 __builtin_ia32_pcmpestri128((__v16qi)(a), b, (__v16qi)(c), d, e)
// It's okay if pages are bigger than this (as powers of two), but they should
// not be smaller: the page_for() safety checks below assume that any 16-byte
// span within one kMinPageSize-aligned region is readable if the span's first
// byte is.
static constexpr size_t kMinPageSize = 4096;
static_assert(
    kMinPageSize >= 16, "kMinPageSize must be at least SSE register size");

// Returns the (conservative) page number containing addr, assuming pages of
// kMinPageSize bytes. Two addresses with the same page number are guaranteed
// to live on the same real page, so an SSE load spanning them cannot fault.
template <typename T>
static inline uintptr_t page_for(T* addr) {
  return reinterpret_cast<uintptr_t>(addr) / kMinPageSize;
}
// Returns the smallest index i > 0 such that arr + i is 16-byte aligned,
// i.e. the offset of the first haystack block that can be read with an
// aligned SSE load after the initial unaligned one.
static inline size_t nextAlignedIndex(const char* arr) {
  auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
  return 1 + // add 1 because the index starts at 'arr'
      ((firstPossible + 15) & ~0xF) // round up to next multiple of 16
      - firstPossible;
}
86 static size_t qfind_first_byte_of_needles16(
87 const StringPieceLite haystack,
88 const StringPieceLite needles) FOLLY_DISABLE_ADDRESS_SANITIZER;
90 // helper method for case where needles.size() <= 16
91 size_t qfind_first_byte_of_needles16(
92 const StringPieceLite haystack,
93 const StringPieceLite needles) {
94 DCHECK_GT(haystack.size(), 0u);
95 DCHECK_GT(needles.size(), 0u);
96 DCHECK_LE(needles.size(), 16u);
97 if ((needles.size() <= 2 && haystack.size() >= 256) ||
98 // must bail if we can't even SSE-load a single segment of haystack
99 (haystack.size() < 16 &&
100 page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) ||
101 // can't load needles into SSE register if it could cross page boundary
102 page_for(needles.end() - 1) != page_for(needles.data() + 15)) {
103 return detail::qfind_first_byte_of_nosse(haystack, needles);
106 auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.data()));
107 // do an unaligned load for first block of haystack
109 _mm_loadu_si128(reinterpret_cast<const __m128i*>(haystack.data()));
111 _mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0);
113 return size_t(index);
116 // Now, we can do aligned loads hereafter...
117 size_t i = nextAlignedIndex(haystack.data());
118 for (; i < haystack.size(); i += 16) {
120 _mm_load_si128(reinterpret_cast<const __m128i*>(haystack.data() + i));
121 index = _mm_cmpestri(
122 arr2, int(needles.size()), arr1, int(haystack.size() - i), 0);
127 return std::string::npos;
130 template <bool HAYSTACK_ALIGNED>
131 size_t scanHaystackBlock(
132 const StringPieceLite haystack,
133 const StringPieceLite needles,
135 // Turn off ASAN because the "arr2 = ..." assignment in the loop below reads
136 // up to 15 bytes beyond end of the buffer in #needles#. That is ok because
137 // ptr2 is always 16-byte aligned, so the read can never span a page
138 // boundary. Also, the extra data that may be read is never actually used.
139 FOLLY_DISABLE_ADDRESS_SANITIZER;
141 // Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
142 // needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
143 // If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
145 template <bool HAYSTACK_ALIGNED>
146 size_t scanHaystackBlock(
147 const StringPieceLite haystack,
148 const StringPieceLite needles,
149 uint64_t blockStartIdx) {
150 DCHECK_GT(needles.size(), 16u); // should handled by *needles16() method
152 blockStartIdx + 16 <= haystack.size() ||
153 (page_for(haystack.data() + blockStartIdx) ==
154 page_for(haystack.data() + blockStartIdx + 15)));
157 if (HAYSTACK_ALIGNED) {
158 arr1 = _mm_load_si128(
159 reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
161 arr1 = _mm_loadu_si128(
162 reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
165 // This load is safe because needles.size() >= 16
166 auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.data()));
168 _mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);
170 size_t j = nextAlignedIndex(needles.data());
171 for (; j < needles.size(); j += 16) {
172 arr2 = _mm_load_si128(reinterpret_cast<const __m128i*>(needles.data() + j));
174 auto index = _mm_cmpestri(
176 int(needles.size() - j),
178 int(haystack.size() - blockStartIdx),
180 b = std::min(index, b);
184 return blockStartIdx + b;
186 return std::string::npos;
189 size_t qfind_first_byte_of_sse42(
190 const StringPieceLite haystack,
191 const StringPieceLite needles);
193 size_t qfind_first_byte_of_sse42(
194 const StringPieceLite haystack,
195 const StringPieceLite needles) {
196 if (UNLIKELY(needles.empty() || haystack.empty())) {
197 return std::string::npos;
198 } else if (needles.size() <= 16) {
199 // we can save some unnecessary load instructions by optimizing for
200 // the common case of needles.size() <= 16
201 return qfind_first_byte_of_needles16(haystack, needles);
204 if (haystack.size() < 16 &&
205 page_for(haystack.end() - 1) != page_for(haystack.data() + 16)) {
206 // We can't safely SSE-load haystack. Use a different approach.
207 if (haystack.size() <= 2) {
208 return qfind_first_byte_of_std(haystack, needles);
210 return qfind_first_byte_of_byteset(haystack, needles);
213 auto ret = scanHaystackBlock<false>(haystack, needles, 0);
214 if (ret != std::string::npos) {
218 size_t i = nextAlignedIndex(haystack.data());
219 for (; i < haystack.size(); i += 16) {
220 ret = scanHaystackBlock<true>(haystack, needles, i);
221 if (ret != std::string::npos) {
226 return std::string::npos;