// It's okay if pages are bigger than this (as powers of two), but they should
// not be smaller.
constexpr size_t kMinPageSize = 4096;
+static_assert(kMinPageSize >= 16,
+ "kMinPageSize must be at least SSE register size");
#define PAGE_FOR(addr) \
- (reinterpret_cast<intptr_t>(addr) / kMinPageSize)
+ (reinterpret_cast<uintptr_t>(addr) / kMinPageSize)
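+// Addresses with the same PAGE_FOR value lie on the same memory page, since
+// pages are at least kMinPageSize bytes. The page-boundary checks below use
+// this to ensure that a 16-byte load reading past the end of a range only
+// touches a page that already holds valid bytes, so it cannot fault.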
-// Rounds up to the next multiple of 16
-#define ROUND_UP_16(val) \
- ((val + 15) & ~0xF)
+inline size_t nextAlignedIndex(const char* arr) {
+ auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
+ return 1 + // add 1 because the index starts at 'arr'
+ ((firstPossible + 15) & ~0xF) // round up to next multiple of 16
+ - firstPossible;
+}
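+// e.g. for arr == (const char*)0x1003 this returns 13 (0x1003 + 13 == 0x1010);
+// for an already 16-byte-aligned arr it returns 16. The result is always in
+// [1, 16], so the bytes it skips are covered by the caller's initial
+// unaligned 16-byte load.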
// build sse4.2-optimized version even if -msse4.2 is not passed to GCC
size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
                                     const StringPiece& needles) {
DCHECK(!haystack.empty());
DCHECK(!needles.empty());
DCHECK_LE(needles.size(), 16);
+ // benchmarking shows that memchr beats out SSE for small needle-sets
+ // with large haystacks.
if ((needles.size() <= 2 && haystack.size() >= 256) ||
- // we can't load needles into SSE register if it could cross page boundary
- (PAGE_FOR(needles.end() - 1) != PAGE_FOR(needles.data() + 15))) {
- // benchmarking shows that memchr beats out SSE for small needle-sets
- // with large haystacks.
- // TODO(mcurtiss): could this be because of unaligned SSE loads?
+ // must bail if we can't even SSE-load a single segment of haystack
+ (haystack.size() < 16 &&
+ PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 15)) ||
+ // can't load needles into SSE register if it could cross page boundary
+ PAGE_FOR(needles.end() - 1) != PAGE_FOR(needles.data() + 15)) {
return detail::qfind_first_byte_of_memchr(haystack, needles);
}
- __v16qi arr2 = __builtin_ia32_loaddqu(needles.data());
-
- // If true, the last byte we want to load into the SSE register is on the
- // same page as the last byte of the actual Range. No risk of segfault.
- bool canSseLoadLastBlock =
- (PAGE_FOR(haystack.end() - 1) ==
- PAGE_FOR(haystack.data() + ROUND_UP_16(haystack.size()) - 1));
- int64_t lastSafeBlockIdx = canSseLoadLastBlock ?
- haystack.size() : static_cast<int64_t>(haystack.size()) - 16;
+ auto arr2 = __builtin_ia32_loaddqu(needles.data());
+ // do an unaligned load for first block of haystack
+ auto arr1 = __builtin_ia32_loaddqu(haystack.data());
+ auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
+ arr1, haystack.size(), 0);
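+  // With mode 0 (unsigned bytes, "equal any", least-significant index),
+  // pcmpestri returns the offset of the first haystack byte that matches any
+  // needle byte, or 16 if no byte in this block matches.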
+ if (index < 16) {
+ return index;
+ }
- int64_t i = 0;
- for (; i < lastSafeBlockIdx; i+= 16) {
- auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
+ // Now, we can do aligned loads hereafter...
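+  // An aligned 16-byte load never straddles a page boundary, so even when the
+  // last block extends past haystack.end() it stays on a page that holds
+  // valid haystack bytes and cannot fault.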
+ size_t i = nextAlignedIndex(haystack.data());
+ for (; i < haystack.size(); i+= 16) {
+ void* ptr1 = __builtin_assume_aligned(haystack.data() + i, 16);
+ auto arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
arr1, haystack.size() - i, 0);
if (index < 16) {
return i + index;
}
}
-
- if (!canSseLoadLastBlock) {
- StringPiece tmp(haystack);
- tmp.advance(i);
- auto ret = detail::qfind_first_byte_of_memchr(tmp, needles);
- if (ret != StringPiece::npos) {
- return ret + i;
- }
- }
return StringPiece::npos;
}
+template <bool HAYSTACK_ALIGNED>
inline size_t scanHaystackBlock(const StringPiece& haystack,
const StringPiece& needles,
int64_t idx)
-// inlining is okay because it's only called from other sse4.2 functions
+// inline is okay because it's only called from other sse4.2 functions
__attribute__ ((__target__("sse4.2")));
// Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
-// needle. If blockStartIdx is near the end of haystack, it may read a few bytes
-// past the end; it is the caller's responsibility to ensure this is safe.
+// needle. If HAYSTACK_ALIGNED, then (haystack.data() + blockStartIdx) must be
+// 16-byte aligned. If !HAYSTACK_ALIGNED, then the caller must ensure that it
+// is safe to load the block.
+template <bool HAYSTACK_ALIGNED>
inline size_t scanHaystackBlock(const StringPiece& haystack,
const StringPiece& needles,
int64_t blockStartIdx) {
- // small needle sets should be handled by qfind_first_byte_of_needles16()
- DCHECK_GT(needles.size(), 16);
+  DCHECK_GT(needles.size(), 16); // should be handled by *needles16() method
DCHECK(blockStartIdx + 16 <= haystack.size() ||
(PAGE_FOR(haystack.data() + blockStartIdx) ==
PAGE_FOR(haystack.data() + blockStartIdx + 15)));
- size_t b = 16;
- auto arr1 = __builtin_ia32_loaddqu(haystack.data() + blockStartIdx);
- int64_t j = 0;
- for (; j < static_cast<int64_t>(needles.size()) - 16; j += 16) {
- auto arr2 = __builtin_ia32_loaddqu(needles.data() + j);
- auto index = __builtin_ia32_pcmpestri128(
- arr2, 16, arr1, haystack.size() - blockStartIdx, 0);
- b = std::min<size_t>(index, b);
+
+ __v16qi arr1;
+ if (HAYSTACK_ALIGNED) {
+ void* ptr1 = __builtin_assume_aligned(haystack.data() + blockStartIdx, 16);
+ arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
+ } else {
+ arr1 = __builtin_ia32_loaddqu(haystack.data() + blockStartIdx);
}
- // Avoid reading any bytes past the end needles by just reading the last
- // 16 bytes of needles. We know this is safe because needles.size() > 16.
- auto arr2 = __builtin_ia32_loaddqu(needles.end() - 16);
- auto index = __builtin_ia32_pcmpestri128(
+ // This load is safe because needles.size() >= 16
+ auto arr2 = __builtin_ia32_loaddqu(needles.data());
+ size_t b = __builtin_ia32_pcmpestri128(
arr2, 16, arr1, haystack.size() - blockStartIdx, 0);
- b = std::min<size_t>(index, b);
+
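+  // Compare the same haystack block against each remaining 16-byte chunk of
+  // needles, keeping the smallest match index so that the earliest haystack
+  // byte matching any needle wins.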
+ size_t j = nextAlignedIndex(needles.data());
+ for (; j < needles.size(); j += 16) {
+ void* ptr2 = __builtin_assume_aligned(needles.data() + j, 16);
+ arr2 = *reinterpret_cast<const __v16qi*>(ptr2);
+
+ auto index = __builtin_ia32_pcmpestri128(
+ arr2, needles.size() - j, arr1, haystack.size() - blockStartIdx, 0);
+ b = std::min<size_t>(index, b);
+ }
  if (b < 16) {
    return blockStartIdx + b;
  }
  return StringPiece::npos;
}

    return qfind_first_byte_of_needles16(haystack, needles);
  }
- int64_t i = 0;
- for (; i < static_cast<int64_t>(haystack.size()) - 16; i += 16) {
- auto ret = scanHaystackBlock(haystack, needles, i);
- if (ret != StringPiece::npos) {
- return ret;
+ if (haystack.size() < 16 &&
+ PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 16)) {
+ // We can't safely SSE-load haystack. Use a different approach.
+ if (haystack.size() <= 2) {
+ return qfind_first_of(haystack, needles, asciiCaseSensitive);
}
- };
+ return qfind_first_byte_of_byteset(haystack, needles);
+ }
- if (i == haystack.size() - 16 ||
- PAGE_FOR(haystack.end() - 1) == PAGE_FOR(haystack.data() + i + 15)) {
- return scanHaystackBlock(haystack, needles, i);
- } else {
- auto ret = qfind_first_byte_of_nosse(StringPiece(haystack.data() + i,
- haystack.end()),
- needles);
+ auto ret = scanHaystackBlock<false>(haystack, needles, 0);
+ if (ret != StringPiece::npos) {
+ return ret;
+ }
+
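+  // Same strategy as qfind_first_byte_of_needles16: one unaligned scan of the
+  // first block above, then 16-byte-aligned scans of the rest, which are safe
+  // to read past haystack.end().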
+ size_t i = nextAlignedIndex(haystack.data());
+ for (; i < haystack.size(); i += 16) {
+ auto ret = scanHaystackBlock<true>(haystack, needles, i);
if (ret != StringPiece::npos) {
- return i + ret;
+ return ret;
}
}