folly/Range.cpp

   1 /*
   2  * Copyright 2013 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 // @author Mark Rabkin (mrabkin@fb.com)
  18 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
  19
  20 #include "folly/Range.h"
  21
  22 #include <emmintrin.h>  // __v16qi
  23 #include "folly/Likely.h"
  24
  25 namespace folly {
  26
  27 /**
  28  * Predicates that can be used with qfind and startsWith
  29  */
  30 const AsciiCaseSensitive asciiCaseSensitive = AsciiCaseSensitive();
  31 const AsciiCaseInsensitive asciiCaseInsensitive = AsciiCaseInsensitive();
  32
  33 std::ostream& operator<<(std::ostream& os, const StringPiece& piece) {
  34   os.write(piece.start(), piece.size());
  35   return os;
  36 }
  37
  38 namespace detail {
  39
  40 size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
  41                                   const StringPiece& needles) {
  42   size_t best = haystack.size();
  43   for (char needle: needles) {
  44     const void* ptr = memchr(haystack.data(), needle, best);
  45     if (ptr) {
  46       auto found = static_cast<const char*>(ptr) - haystack.data();
  47       best = std::min<size_t>(best, found);
  48     }
  49   }
  50   if (best == haystack.size()) {
  51     return StringPiece::npos;
  52   }
  53   return best;
  54 }
  55
  56 }  // namespace detail
  57
  58 namespace {
  59
  60 // It's okay if pages are bigger than this (as powers of two), but they should
  61 // not be smaller.
  62 constexpr size_t kMinPageSize = 4096;
  63 static_assert(kMinPageSize >= 16,
  64               "kMinPageSize must be at least SSE register size");
  65 #define PAGE_FOR(addr) \
  66   (reinterpret_cast<uintptr_t>(addr) / kMinPageSize)
  67
  68 inline size_t nextAlignedIndex(const char* arr) {
  69   auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
  70   return 1 +                       // add 1 because the index starts at 'arr'
  71     ((firstPossible + 15) & ~0xF)  // round up to next multiple of 16
  72     - firstPossible;
  73 }
  74
  75 #if FOLLY_HAVE_EMMINTRIN_H
  76 // build sse4.2-optimized version even if -msse4.2 is not passed to GCC
  77 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
  78                                      const StringPiece& needles)
  79   __attribute__ ((__target__("sse4.2"), noinline));
  80
  81 // helper method for case where needles.size() <= 16
  82 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
  83                                      const StringPiece& needles) {
  84   DCHECK(!haystack.empty());
  85   DCHECK(!needles.empty());
  86   DCHECK_LE(needles.size(), 16);
  87   // benchmarking shows that memchr beats out SSE for small needle-sets
  88   // with large haystacks.
  89   if ((needles.size() <= 2 && haystack.size() >= 256) ||
  90       // must bail if we can't even SSE-load a single segment of haystack
  91       (haystack.size() < 16 &&
  92        PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 15)) ||
  93       // can't load needles into SSE register if it could cross page boundary
  94       PAGE_FOR(needles.end() - 1) != PAGE_FOR(needles.data() + 15)) {
  95     return detail::qfind_first_byte_of_memchr(haystack, needles);
  96   }
  97
  98   auto arr2 = __builtin_ia32_loaddqu(needles.data());
  99   // do an unaligned load for first block of haystack
 100   auto arr1 = __builtin_ia32_loaddqu(haystack.data());
 101   auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
 102                                            arr1, haystack.size(), 0);
 103   if (index < 16) {
 104     return index;
 105   }
 106
 107   // Now, we can do aligned loads hereafter...
 108   size_t i = nextAlignedIndex(haystack.data());
 109   for (; i < haystack.size(); i+= 16) {
 110     void* ptr1 = __builtin_assume_aligned(haystack.data() + i, 16);
 111     auto arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
 112     auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
 113                                              arr1, haystack.size() - i, 0);
 114     if (index < 16) {
 115       return i + index;
 116     }
 117   }
 118   return StringPiece::npos;
 119 }
 120 #endif // FOLLY_HAVE_EMMINTRIN_H
 121
 122 // Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
 123 // of Computer Algorithms" (1974), but the best description is here:
 124 // http://research.swtch.com/sparse
 125 class FastByteSet {
 126  public:
 127   FastByteSet() : size_(0) { }  // no init of arrays required!
 128
 129   inline void add(uint8_t i) {
 130     if (!contains(i)) {
 131       dense_[size_] = i;
 132       sparse_[i] = size_;
 133       size_++;
 134     }
 135   }
 136   inline bool contains(uint8_t i) const {
 137     DCHECK_LE(size_, 256);
 138     return sparse_[i] < size_ && dense_[sparse_[i]] == i;
 139   }
 140
 141  private:
 142   uint16_t size_;  // can't use uint8_t because it would overflow if all
 143                    // possible values were inserted.
 144   uint8_t sparse_[256];
 145   uint8_t dense_[256];
 146 };
 147
 148 }  // namespace
 149
 150 namespace detail {
 151
 152 size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
 153                                    const StringPiece& needles) {
 154   FastByteSet s;
 155   for (auto needle: needles) {
 156     s.add(needle);
 157   }
 158   for (size_t index = 0; index < haystack.size(); ++index) {
 159     if (s.contains(haystack[index])) {
 160       return index;
 161     }
 162   }
 163   return StringPiece::npos;
 164 }
 165
 166 #if FOLLY_HAVE_EMMINTRIN_H
 167
 168 template <bool HAYSTACK_ALIGNED>
 169 inline size_t scanHaystackBlock(const StringPiece& haystack,
 170                                 const StringPiece& needles,
 171                                 int64_t idx)
 172 // inline is okay because it's only called from other sse4.2 functions
 173   __attribute__ ((__target__("sse4.2")));
 174
 175 // Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
 176 // needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
 177 // If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
 178 // block.
 179 template <bool HAYSTACK_ALIGNED>
 180 inline size_t scanHaystackBlock(const StringPiece& haystack,
 181                                 const StringPiece& needles,
 182                                 int64_t blockStartIdx) {
 183   DCHECK_GT(needles.size(), 16);  // should handled by *needles16() method
 184   DCHECK(blockStartIdx + 16 <= haystack.size() ||
 185          (PAGE_FOR(haystack.data() + blockStartIdx) ==
 186           PAGE_FOR(haystack.data() + blockStartIdx + 15)));
 187
 188   __v16qi arr1;
 189   if (HAYSTACK_ALIGNED) {
 190     void* ptr1 = __builtin_assume_aligned(haystack.data() + blockStartIdx, 16);
 191     arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
 192   } else {
 193     arr1 = __builtin_ia32_loaddqu(haystack.data() + blockStartIdx);
 194   }
 195
 196   // This load is safe because needles.size() >= 16
 197   auto arr2 = __builtin_ia32_loaddqu(needles.data());
 198   size_t b = __builtin_ia32_pcmpestri128(
 199     arr2, 16, arr1, haystack.size() - blockStartIdx, 0);
 200
 201   size_t j = nextAlignedIndex(needles.data());
 202   for (; j < needles.size(); j += 16) {
 203     void* ptr2 = __builtin_assume_aligned(needles.data() + j, 16);
 204     arr2 = *reinterpret_cast<const __v16qi*>(ptr2);
 205
 206     auto index = __builtin_ia32_pcmpestri128(
 207       arr2, needles.size() - j, arr1, haystack.size() - blockStartIdx, 0);
 208     b = std::min<size_t>(index, b);
 209   }
 210
 211   if (b < 16) {
 212     return blockStartIdx + b;
 213   }
 214   return StringPiece::npos;
 215 }
 216
 217 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
 218                                  const StringPiece& needles)
 219   __attribute__ ((__target__("sse4.2"), noinline));
 220
 221 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
 222                                  const StringPiece& needles) {
 223   if (UNLIKELY(needles.empty() || haystack.empty())) {
 224     return StringPiece::npos;
 225   } else if (needles.size() <= 16) {
 226     // we can save some unnecessary load instructions by optimizing for
 227     // the common case of needles.size() <= 16
 228     return qfind_first_byte_of_needles16(haystack, needles);
 229   }
 230
 231   if (haystack.size() < 16 &&
 232       PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 16)) {
 233     // We can't safely SSE-load haystack. Use a different approach.
 234     if (haystack.size() <= 2) {
 235       return qfind_first_of(haystack, needles, asciiCaseSensitive);
 236     }
 237     return qfind_first_byte_of_byteset(haystack, needles);
 238   }
 239
 240   auto ret = scanHaystackBlock<false>(haystack, needles, 0);
 241   if (ret != StringPiece::npos) {
 242     return ret;
 243   }
 244
 245   size_t i = nextAlignedIndex(haystack.data());
 246   for (; i < haystack.size(); i += 16) {
 247     auto ret = scanHaystackBlock<true>(haystack, needles, i);
 248     if (ret != StringPiece::npos) {
 249       return ret;
 250     }
 251   }
 252
 253   return StringPiece::npos;
 254 }
 255 #endif // FOLLY_HAVE_EMMINTRIN_H
 256
 257 size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
 258                                  const StringPiece& needles) {
 259   if (UNLIKELY(needles.empty() || haystack.empty())) {
 260     return StringPiece::npos;
 261   }
 262   // The thresholds below were empirically determined by benchmarking.
 263   // This is not an exact science since it depends on the CPU, the size of
 264   // needles, and the size of haystack.
 265   if (haystack.size() == 1 ||
 266       (haystack.size() < 4 && needles.size() <= 16)) {
 267     return qfind_first_of(haystack, needles, asciiCaseSensitive);
 268   } else if ((needles.size() >= 4 && haystack.size() <= 10) ||
 269              (needles.size() >= 16 && haystack.size() <= 64) ||
 270              needles.size() >= 32) {
 271     return qfind_first_byte_of_byteset(haystack, needles);
 272   }
 273
 274   return qfind_first_byte_of_memchr(haystack, needles);
 275 }
 276
 277 }  // namespace detail
 278 }  // namespace folly