2 * Copyright 2013 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 // @author Mark Rabkin (mrabkin@fb.com)
19 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
22 #include "folly/Range.h"
24 #include "folly/CpuId.h"
25 #include "folly/Likely.h"
30 * Predicates that can be used with qfind and startsWith
32 const AsciiCaseSensitive asciiCaseSensitive = AsciiCaseSensitive();
33 const AsciiCaseInsensitive asciiCaseInsensitive = AsciiCaseInsensitive();
35 std::ostream& operator<<(std::ostream& os, const StringPiece& piece) {
36 os.write(piece.start(), piece.size());
42 size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
43 const StringPiece& needles) {
44 size_t best = haystack.size();
45 for (char needle: needles) {
46 const void* ptr = memchr(haystack.data(), needle, best);
48 auto found = static_cast<const char*>(ptr) - haystack.data();
49 best = std::min<size_t>(best, found);
52 if (best == haystack.size()) {
53 return StringPiece::npos;
62 // build sse4.2-optimized version even if -msse4.2 is not passed to GCC
63 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
64 const StringPiece& needles)
65 __attribute__ ((__target__("sse4.2"), noinline));
67 // helper method for case where needles.size() <= 16
68 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
69 const StringPiece& needles) {
70 DCHECK_LE(needles.size(), 16);
71 if (needles.size() <= 2 && haystack.size() >= 256) {
72 // benchmarking shows that memchr beats out SSE for small needle-sets
73 // with large haystacks.
74 // TODO(mcurtiss): could this be because of unaligned SSE loads?
75 return detail::qfind_first_byte_of_memchr(haystack, needles);
77 auto arr2 = __builtin_ia32_loaddqu(needles.data());
78 for (size_t i = 0; i < haystack.size(); i+= 16) {
79 auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
80 auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
81 arr1, haystack.size() - i, 0);
86 return StringPiece::npos;
89 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
90 const StringPiece& needles)
91 __attribute__ ((__target__("sse4.2"), noinline));
93 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
94 const StringPiece& needles) {
95 if (UNLIKELY(needles.empty() || haystack.empty())) {
96 return StringPiece::npos;
97 } else if (needles.size() <= 16) {
98 // we can save some unnecessary load instructions by optimizing for
99 // the common case of needles.size() <= 16
100 return qfind_first_byte_of_needles16(haystack, needles);
103 size_t index = haystack.size();
104 for (size_t i = 0; i < haystack.size(); i += 16) {
106 auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
107 for (size_t j = 0; j < needles.size(); j += 16) {
108 auto arr2 = __builtin_ia32_loaddqu(needles.data() + j);
109 auto index = __builtin_ia32_pcmpestri128(arr2, needles.size() - j,
110 arr1, haystack.size() - i, 0);
111 b = std::min<size_t>(index, b);
117 return StringPiece::npos;
120 typedef decltype(qfind_first_byte_of_sse42) Type_qfind_first_byte_of;
122 // Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
123 // of Computer Algorithms" (1974), but the best description is here:
124 // http://research.swtch.com/sparse
127 FastByteSet() : size_(0) { } // no init of arrays required!
129 inline void add(uint8_t i) {
136 inline bool contains(uint8_t i) const {
137 DCHECK_LE(size_, 256);
138 return sparse_[i] < size_ && dense_[sparse_[i]] == i;
142 uint16_t size_; // can't use uint8_t because it would overflow if all
143 // possible values were inserted.
144 uint8_t sparse_[256];
152 size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
153 const StringPiece& needles) {
155 for (auto needle: needles) {
158 for (size_t index = 0; index < haystack.size(); ++index) {
159 if (s.contains(haystack[index])) {
163 return StringPiece::npos;
166 size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
167 const StringPiece& needles) {
168 if (UNLIKELY(needles.empty() || haystack.empty())) {
169 return StringPiece::npos;
171 // The thresholds below were empirically determined by benchmarking.
172 // This is not an exact science since it depends on the CPU, the size of
173 // needles, and the size of haystack.
174 if (haystack.size() == 1 ||
175 (haystack.size() < 4 && needles.size() <= 16)) {
176 return qfind_first_of(haystack, needles, asciiCaseSensitive);
177 } else if ((needles.size() >= 4 && haystack.size() <= 10) ||
178 (needles.size() >= 16 && haystack.size() <= 64) ||
179 needles.size() >= 32) {
180 return qfind_first_byte_of_byteset(haystack, needles);
183 return qfind_first_byte_of_memchr(haystack, needles);
186 auto const qfind_first_byte_of_fn =
187 folly::CpuId().sse42() ? qfind_first_byte_of_sse42
188 : qfind_first_byte_of_nosse;
190 size_t qfind_first_byte_of(const StringPiece& haystack,
191 const StringPiece& needles) {
192 return qfind_first_byte_of_fn(haystack, needles);
195 } // namespace detail