/*
 * Copyright 2013 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// @author Mark Rabkin (mrabkin@fb.com)
// @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
#include "folly/Range.h"

#include <emmintrin.h>  // __v16qi

#include <algorithm>    // std::min
#include <cstring>      // memchr
#include <ostream>
// Predicates that can be used with qfind and startsWith
30 const AsciiCaseSensitive asciiCaseSensitive = AsciiCaseSensitive();
31 const AsciiCaseInsensitive asciiCaseInsensitive = AsciiCaseInsensitive();
33 std::ostream& operator<<(std::ostream& os, const StringPiece& piece) {
34 os.write(piece.start(), piece.size());
40 size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
41 const StringPiece& needles) {
42 size_t best = haystack.size();
43 for (char needle: needles) {
44 const void* ptr = memchr(haystack.data(), needle, best);
46 auto found = static_cast<const char*>(ptr) - haystack.data();
47 best = std::min<size_t>(best, found);
50 if (best == haystack.size()) {
51 return StringPiece::npos;
// It's okay if pages are bigger than this (as powers of two), but they should
// not be smaller.
62 constexpr size_t kMinPageSize = 4096;
63 static_assert(kMinPageSize >= 16,
64 "kMinPageSize must be at least SSE register size");
65 #define PAGE_FOR(addr) \
66 (reinterpret_cast<uintptr_t>(addr) / kMinPageSize)
#if FOLLY_HAVE_EMMINTRIN_H
70 inline size_t nextAlignedIndex(const char* arr) {
71 auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
72 return 1 + // add 1 because the index starts at 'arr'
73 ((firstPossible + 15) & ~0xF) // round up to next multiple of 16
77 // build sse4.2-optimized version even if -msse4.2 is not passed to GCC
78 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
79 const StringPiece& needles)
80 __attribute__ ((__target__("sse4.2"), noinline))
81 FOLLY_DISABLE_ADDRESS_SANITIZER;
83 // helper method for case where needles.size() <= 16
84 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
85 const StringPiece& needles) {
86 DCHECK(!haystack.empty());
87 DCHECK(!needles.empty());
88 DCHECK_LE(needles.size(), 16);
89 // benchmarking shows that memchr beats out SSE for small needle-sets
90 // with large haystacks.
91 if ((needles.size() <= 2 && haystack.size() >= 256) ||
92 // must bail if we can't even SSE-load a single segment of haystack
93 (haystack.size() < 16 &&
94 PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 15)) ||
95 // can't load needles into SSE register if it could cross page boundary
96 PAGE_FOR(needles.end() - 1) != PAGE_FOR(needles.data() + 15)) {
97 return detail::qfind_first_byte_of_memchr(haystack, needles);
100 auto arr2 = __builtin_ia32_loaddqu(needles.data());
101 // do an unaligned load for first block of haystack
102 auto arr1 = __builtin_ia32_loaddqu(haystack.data());
103 auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
104 arr1, haystack.size(), 0);
109 // Now, we can do aligned loads hereafter...
110 size_t i = nextAlignedIndex(haystack.data());
111 for (; i < haystack.size(); i+= 16) {
112 void* ptr1 = __builtin_assume_aligned(haystack.data() + i, 16);
113 auto arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
114 auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
115 arr1, haystack.size() - i, 0);
120 return StringPiece::npos;
#endif // FOLLY_HAVE_EMMINTRIN_H
124 // Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
125 // of Computer Algorithms" (1974), but the best description is here:
126 // http://research.swtch.com/sparse
129 FastByteSet() : size_(0) { } // no init of arrays required!
131 inline void add(uint8_t i) {
138 inline bool contains(uint8_t i) const {
139 DCHECK_LE(size_, 256);
140 return sparse_[i] < size_ && dense_[sparse_[i]] == i;
144 uint16_t size_; // can't use uint8_t because it would overflow if all
145 // possible values were inserted.
146 uint8_t sparse_[256];
154 size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
155 const StringPiece& needles) {
157 for (auto needle: needles) {
160 for (size_t index = 0; index < haystack.size(); ++index) {
161 if (s.contains(haystack[index])) {
165 return StringPiece::npos;
#if FOLLY_HAVE_EMMINTRIN_H
170 template <bool HAYSTACK_ALIGNED>
171 inline size_t scanHaystackBlock(const StringPiece& haystack,
172 const StringPiece& needles,
174 // inline is okay because it's only called from other sse4.2 functions
175 __attribute__ ((__target__("sse4.2")));
177 // Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
178 // needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
179 // If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
181 template <bool HAYSTACK_ALIGNED>
182 inline size_t scanHaystackBlock(const StringPiece& haystack,
183 const StringPiece& needles,
184 int64_t blockStartIdx) {
185 DCHECK_GT(needles.size(), 16); // should handled by *needles16() method
186 DCHECK(blockStartIdx + 16 <= haystack.size() ||
187 (PAGE_FOR(haystack.data() + blockStartIdx) ==
188 PAGE_FOR(haystack.data() + blockStartIdx + 15)));
191 if (HAYSTACK_ALIGNED) {
192 void* ptr1 = __builtin_assume_aligned(haystack.data() + blockStartIdx, 16);
193 arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
195 arr1 = __builtin_ia32_loaddqu(haystack.data() + blockStartIdx);
198 // This load is safe because needles.size() >= 16
199 auto arr2 = __builtin_ia32_loaddqu(needles.data());
200 size_t b = __builtin_ia32_pcmpestri128(
201 arr2, 16, arr1, haystack.size() - blockStartIdx, 0);
203 size_t j = nextAlignedIndex(needles.data());
204 for (; j < needles.size(); j += 16) {
205 void* ptr2 = __builtin_assume_aligned(needles.data() + j, 16);
206 arr2 = *reinterpret_cast<const __v16qi*>(ptr2);
208 auto index = __builtin_ia32_pcmpestri128(
209 arr2, needles.size() - j, arr1, haystack.size() - blockStartIdx, 0);
210 b = std::min<size_t>(index, b);
214 return blockStartIdx + b;
216 return StringPiece::npos;
219 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
220 const StringPiece& needles)
221 __attribute__ ((__target__("sse4.2"), noinline));
223 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
224 const StringPiece& needles) {
225 if (UNLIKELY(needles.empty() || haystack.empty())) {
226 return StringPiece::npos;
227 } else if (needles.size() <= 16) {
228 // we can save some unnecessary load instructions by optimizing for
229 // the common case of needles.size() <= 16
230 return qfind_first_byte_of_needles16(haystack, needles);
233 if (haystack.size() < 16 &&
234 PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 16)) {
235 // We can't safely SSE-load haystack. Use a different approach.
236 if (haystack.size() <= 2) {
237 return qfind_first_of(haystack, needles, asciiCaseSensitive);
239 return qfind_first_byte_of_byteset(haystack, needles);
242 auto ret = scanHaystackBlock<false>(haystack, needles, 0);
243 if (ret != StringPiece::npos) {
247 size_t i = nextAlignedIndex(haystack.data());
248 for (; i < haystack.size(); i += 16) {
249 auto ret = scanHaystackBlock<true>(haystack, needles, i);
250 if (ret != StringPiece::npos) {
255 return StringPiece::npos;
#endif // FOLLY_HAVE_EMMINTRIN_H
259 size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
260 const StringPiece& needles) {
261 if (UNLIKELY(needles.empty() || haystack.empty())) {
262 return StringPiece::npos;
264 // The thresholds below were empirically determined by benchmarking.
265 // This is not an exact science since it depends on the CPU, the size of
266 // needles, and the size of haystack.
267 if (haystack.size() == 1 ||
268 (haystack.size() < 4 && needles.size() <= 16)) {
269 return qfind_first_of(haystack, needles, asciiCaseSensitive);
270 } else if ((needles.size() >= 4 && haystack.size() <= 10) ||
271 (needles.size() >= 16 && haystack.size() <= 64) ||
272 needles.size() >= 32) {
273 return qfind_first_byte_of_byteset(haystack, needles);
276 return qfind_first_byte_of_memchr(haystack, needles);
} // namespace detail