folly/Range.cpp

   1 /*
   2  * Copyright 2014 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author Mark Rabkin (mrabkin@fb.com)
  18 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
  19
  20 #include "folly/Range.h"
  21
  22 #if FOLLY_HAVE_EMMINTRIN_H
  23 #include <emmintrin.h>  // __v16qi
  24 #endif
  25 #include <iostream>
  26
  27 namespace folly {
  28
  29 /**
  30  * Predicates that can be used with qfind and startsWith
  31  */
  32 const AsciiCaseSensitive asciiCaseSensitive = AsciiCaseSensitive();
  33 const AsciiCaseInsensitive asciiCaseInsensitive = AsciiCaseInsensitive();
  34
  35 std::ostream& operator<<(std::ostream& os, const StringPiece& piece) {
  36   os.write(piece.start(), piece.size());
  37   return os;
  38 }
  39
  40 namespace detail {
  41
  42 size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
  43                                   const StringPiece& needles) {
  44   size_t best = haystack.size();
  45   for (char needle: needles) {
  46     const void* ptr = memchr(haystack.data(), needle, best);
  47     if (ptr) {
  48       auto found = static_cast<const char*>(ptr) - haystack.data();
  49       best = std::min<size_t>(best, found);
  50     }
  51   }
  52   if (best == haystack.size()) {
  53     return StringPiece::npos;
  54   }
  55   return best;
  56 }
  57
  58 }  // namespace detail
  59
  60 namespace {
  61
  62 // It's okay if pages are bigger than this (as powers of two), but they should
  63 // not be smaller.
  64 constexpr size_t kMinPageSize = 4096;
  65 static_assert(kMinPageSize >= 16,
  66               "kMinPageSize must be at least SSE register size");
  67 #define PAGE_FOR(addr) \
  68   (reinterpret_cast<uintptr_t>(addr) / kMinPageSize)
  69
  70
  71 // Earlier versions of GCC (for example, Clang on Mac OS X, which is based on
  72 // GCC 4.2) do not have a full compliment of SSE builtins.
  73 #if FOLLY_HAVE_EMMINTRIN_H && __GNUC_PREREQ(4, 6)
  74 inline size_t nextAlignedIndex(const char* arr) {
  75    auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
  76    return 1 +                       // add 1 because the index starts at 'arr'
  77      ((firstPossible + 15) & ~0xF)  // round up to next multiple of 16
  78      - firstPossible;
  79 }
  80
  81 // build sse4.2-optimized version even if -msse4.2 is not passed to GCC
  82 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
  83                                      const StringPiece& needles)
  84   __attribute__ ((__target__("sse4.2"), noinline))
  85   FOLLY_DISABLE_ADDRESS_SANITIZER;
  86
  87 // helper method for case where needles.size() <= 16
  88 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
  89                                      const StringPiece& needles) {
  90   DCHECK(!haystack.empty());
  91   DCHECK(!needles.empty());
  92   DCHECK_LE(needles.size(), 16);
  93   // benchmarking shows that memchr beats out SSE for small needle-sets
  94   // with large haystacks.
  95   if ((needles.size() <= 2 && haystack.size() >= 256) ||
  96       // must bail if we can't even SSE-load a single segment of haystack
  97       (haystack.size() < 16 &&
  98        PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 15)) ||
  99       // can't load needles into SSE register if it could cross page boundary
 100       PAGE_FOR(needles.end() - 1) != PAGE_FOR(needles.data() + 15)) {
 101     return detail::qfind_first_byte_of_memchr(haystack, needles);
 102   }
 103
 104   auto arr2 = __builtin_ia32_loaddqu(needles.data());
 105   // do an unaligned load for first block of haystack
 106   auto arr1 = __builtin_ia32_loaddqu(haystack.data());
 107   auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
 108                                            arr1, haystack.size(), 0);
 109   if (index < 16) {
 110     return index;
 111   }
 112
 113   // Now, we can do aligned loads hereafter...
 114   size_t i = nextAlignedIndex(haystack.data());
 115   for (; i < haystack.size(); i+= 16) {
 116     void* ptr1 = __builtin_assume_aligned(haystack.data() + i, 16);
 117     auto arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
 118     auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
 119                                              arr1, haystack.size() - i, 0);
 120     if (index < 16) {
 121       return i + index;
 122     }
 123   }
 124   return StringPiece::npos;
 125 }
 126 #endif // FOLLY_HAVE_EMMINTRIN_H && GCC 4.6+
 127
 128 // Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
 129 // of Computer Algorithms" (1974), but the best description is here:
 130 // http://research.swtch.com/sparse
 131 class FastByteSet {
 132  public:
 133   FastByteSet() : size_(0) { }  // no init of arrays required!
 134
 135   inline void add(uint8_t i) {
 136     if (!contains(i)) {
 137       dense_[size_] = i;
 138       sparse_[i] = size_;
 139       size_++;
 140     }
 141   }
 142   inline bool contains(uint8_t i) const {
 143     DCHECK_LE(size_, 256);
 144     return sparse_[i] < size_ && dense_[sparse_[i]] == i;
 145   }
 146
 147  private:
 148   uint16_t size_;  // can't use uint8_t because it would overflow if all
 149                    // possible values were inserted.
 150   uint8_t sparse_[256];
 151   uint8_t dense_[256];
 152 };
 153
 154 }  // namespace
 155
 156 namespace detail {
 157
 158 size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
 159                                    const StringPiece& needles) {
 160   FastByteSet s;
 161   for (auto needle: needles) {
 162     s.add(needle);
 163   }
 164   for (size_t index = 0; index < haystack.size(); ++index) {
 165     if (s.contains(haystack[index])) {
 166       return index;
 167     }
 168   }
 169   return StringPiece::npos;
 170 }
 171
 172 #if FOLLY_HAVE_EMMINTRIN_H && __GNUC_PREREQ(4, 6)
 173
 174 template <bool HAYSTACK_ALIGNED>
 175 inline size_t scanHaystackBlock(const StringPiece& haystack,
 176                                 const StringPiece& needles,
 177                                 int64_t idx)
 178 // inline is okay because it's only called from other sse4.2 functions
 179   __attribute__ ((__target__("sse4.2")));
 180
 181 // Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
 182 // needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
 183 // If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
 184 // block.
 185 template <bool HAYSTACK_ALIGNED>
 186 inline size_t scanHaystackBlock(const StringPiece& haystack,
 187                                 const StringPiece& needles,
 188                                 int64_t blockStartIdx) {
 189   DCHECK_GT(needles.size(), 16);  // should handled by *needles16() method
 190   DCHECK(blockStartIdx + 16 <= haystack.size() ||
 191          (PAGE_FOR(haystack.data() + blockStartIdx) ==
 192           PAGE_FOR(haystack.data() + blockStartIdx + 15)));
 193
 194   __v16qi arr1;
 195   if (HAYSTACK_ALIGNED) {
 196     void* ptr1 = __builtin_assume_aligned(haystack.data() + blockStartIdx, 16);
 197     arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
 198   } else {
 199     arr1 = __builtin_ia32_loaddqu(haystack.data() + blockStartIdx);
 200   }
 201
 202   // This load is safe because needles.size() >= 16
 203   auto arr2 = __builtin_ia32_loaddqu(needles.data());
 204   size_t b = __builtin_ia32_pcmpestri128(
 205     arr2, 16, arr1, haystack.size() - blockStartIdx, 0);
 206
 207   size_t j = nextAlignedIndex(needles.data());
 208   for (; j < needles.size(); j += 16) {
 209     void* ptr2 = __builtin_assume_aligned(needles.data() + j, 16);
 210     arr2 = *reinterpret_cast<const __v16qi*>(ptr2);
 211
 212     auto index = __builtin_ia32_pcmpestri128(
 213       arr2, needles.size() - j, arr1, haystack.size() - blockStartIdx, 0);
 214     b = std::min<size_t>(index, b);
 215   }
 216
 217   if (b < 16) {
 218     return blockStartIdx + b;
 219   }
 220   return StringPiece::npos;
 221 }
 222
 223 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
 224                                  const StringPiece& needles)
 225   __attribute__ ((__target__("sse4.2"), noinline));
 226
 227 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
 228                                  const StringPiece& needles) {
 229   if (UNLIKELY(needles.empty() || haystack.empty())) {
 230     return StringPiece::npos;
 231   } else if (needles.size() <= 16) {
 232     // we can save some unnecessary load instructions by optimizing for
 233     // the common case of needles.size() <= 16
 234     return qfind_first_byte_of_needles16(haystack, needles);
 235   }
 236
 237   if (haystack.size() < 16 &&
 238       PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 16)) {
 239     // We can't safely SSE-load haystack. Use a different approach.
 240     if (haystack.size() <= 2) {
 241       return qfind_first_of(haystack, needles, asciiCaseSensitive);
 242     }
 243     return qfind_first_byte_of_byteset(haystack, needles);
 244   }
 245
 246   auto ret = scanHaystackBlock<false>(haystack, needles, 0);
 247   if (ret != StringPiece::npos) {
 248     return ret;
 249   }
 250
 251   size_t i = nextAlignedIndex(haystack.data());
 252   for (; i < haystack.size(); i += 16) {
 253     auto ret = scanHaystackBlock<true>(haystack, needles, i);
 254     if (ret != StringPiece::npos) {
 255       return ret;
 256     }
 257   }
 258
 259   return StringPiece::npos;
 260 }
 261 #endif // FOLLY_HAVE_EMMINTRIN_H && GCC 4.6+
 262
 263 size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
 264                                  const StringPiece& needles) {
 265   if (UNLIKELY(needles.empty() || haystack.empty())) {
 266     return StringPiece::npos;
 267   }
 268   // The thresholds below were empirically determined by benchmarking.
 269   // This is not an exact science since it depends on the CPU, the size of
 270   // needles, and the size of haystack.
 271   if (haystack.size() == 1 ||
 272       (haystack.size() < 4 && needles.size() <= 16)) {
 273     return qfind_first_of(haystack, needles, asciiCaseSensitive);
 274   } else if ((needles.size() >= 4 && haystack.size() <= 10) ||
 275              (needles.size() >= 16 && haystack.size() <= 64) ||
 276              needles.size() >= 32) {
 277     return qfind_first_byte_of_byteset(haystack, needles);
 278   }
 279
 280   return qfind_first_byte_of_memchr(haystack, needles);
 281 }
 282
 283 }  // namespace detail
 284 }  // namespace folly