From: Tom Jackson Date: Wed, 4 Feb 2015 20:51:11 +0000 (-0800) Subject: Fixing find_first_of O(n) case X-Git-Tag: v0.25.0~23 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=c7e6c224e1ca2fa9e27344acce8fe156c3435ed0;p=folly.git Fixing find_first_of O(n) case Summary: The `memchr()`-based `find_first_of()` behaves extremely badly when it's used in a loop and the input string doesn't contain all the needles. This was discovered when a reasonable line-breaking routine tried to use it to break lines by a mix of '\r' and '\n'. The entire remainder of the file was scanned each time a line was read. Before: ``` CountDelimsBase 1.26s 794.86m CountDelimsNoSSE 100.03% 1.26s 795.12m CountDelimsStd 40501.63% 3.11ms 321.93 CountDelimsMemchr 98.31% 1.28s 781.41m CountDelimsByteSet 23162.41% 5.43ms 184.11 ``` After: ``` CountDelimsBase 3.20ms 312.08 <-- Base impl no longer considers memchr CountDelimsNoSSE 102.37% 3.13ms 319.47 CountDelimsStd 103.19% 3.11ms 322.05 CountDelimsMemchr 0.25% 1.27s 788.39m CountDelimsByteSet 59.68% 5.37ms 186.27 ``` Test Plan: Benchmarks Reviewed By: njormrod@fb.com Subscribers: folly-diffs@, yfeldblum FB internal diff: D1823536 Signature: t1:1823536:1423081687:bb2ec8cdea477ee9b9c8d8ad2bfdecdc52657515 --- diff --git a/folly/Range.cpp b/folly/Range.cpp index 343ba728..9e8944f9 100644 --- a/folly/Range.cpp +++ b/folly/Range.cpp @@ -42,26 +42,6 @@ std::ostream& operator<<(std::ostream& os, const MutableStringPiece piece) { return os; } -namespace detail { - -size_t qfind_first_byte_of_memchr(const StringPiece haystack, - const StringPiece needles) { - size_t best = haystack.size(); - for (char needle: needles) { - const void* ptr = memchr(haystack.data(), needle, best); - if (ptr) { - auto found = static_cast(ptr) - haystack.data(); - best = std::min(best, found); - } - } - if (best == haystack.size()) { - return StringPiece::npos; - } - return best; -} - -} // namespace detail - namespace { // It's okay if pages are bigger than this (as powers of two), but they should @@ -95,15 +75,13 @@ size_t qfind_first_byte_of_needles16(const StringPiece haystack, DCHECK(!haystack.empty()); DCHECK(!needles.empty()); DCHECK_LE(needles.size(), 16); - // benchmarking shows that memchr beats out SSE for small needle-sets - // with large haystacks. if ((needles.size() <= 2 && haystack.size() >= 256) || // must bail if we can't even SSE-load a single segment of haystack (haystack.size() < 16 && PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 15)) || // can't load needles into SSE register if it could cross page boundary PAGE_FOR(needles.end() - 1) != PAGE_FOR(needles.data() + 15)) { - return detail::qfind_first_byte_of_memchr(haystack, needles); + return detail::qfind_first_byte_of_nosse(haystack, needles); } auto arr2 = __builtin_ia32_loaddqu(needles.data()); @@ -278,16 +256,12 @@ size_t qfind_first_byte_of_nosse(const StringPiece haystack, // The thresholds below were empirically determined by benchmarking. // This is not an exact science since it depends on the CPU, the size of // needles, and the size of haystack. - if (haystack.size() == 1 || - (haystack.size() < 4 && needles.size() <= 16)) { - return qfind_first_of(haystack, needles, asciiCaseSensitive); - } else if ((needles.size() >= 4 && haystack.size() <= 10) || + if ((needles.size() >= 4 && haystack.size() <= 10) || (needles.size() >= 16 && haystack.size() <= 64) || needles.size() >= 32) { return qfind_first_byte_of_byteset(haystack, needles); } - - return qfind_first_byte_of_memchr(haystack, needles); + return qfind_first_of(haystack, needles, asciiCaseSensitive); } } // namespace detail diff --git a/folly/test/RangeFindBenchmark.cpp b/folly/test/RangeFindBenchmark.cpp index 306aa7ab..9aacc6a0 100644 --- a/folly/test/RangeFindBenchmark.cpp +++ b/folly/test/RangeFindBenchmark.cpp @@ -24,9 +24,6 @@ namespace folly { namespace detail { // declaration of functions in Range.cpp -size_t qfind_first_byte_of_memchr(const StringPiece haystack, - const StringPiece needles); - size_t qfind_first_byte_of_byteset(const StringPiece haystack, const StringPiece needles); @@ -43,6 +40,7 @@ std::string str; constexpr int kVstrSize = 16; std::vector vstr; std::vector vstrp; +std::string file; void initStr(int len) { cout << "string length " << len << ':' << endl; @@ -74,6 +72,19 @@ string ffoTestString; const size_t ffoDelimSize = 128; vector ffoDelim; +void initFile(int len) { + std::uniform_int_distribution validChar(1, 64); + file.clear(); + while (len--) { + char ch = validChar(rnd); + if (ch == '\r') { + ch = '\n'; + } + file.push_back(ch); + } +} + + string generateString(int len) { std::uniform_int_distribution validChar(1, 255); // no null-char string ret; @@ -136,6 +147,20 @@ inline size_t qfind_first_byte_of_std(const StringPiece haystack, return qfind_first_of(haystack, needles, asciiCaseSensitive); } +template +void countHits(Func func, size_t n) { + StringPiece needles = "\r\n\1"; + FOR_EACH_RANGE (i, 0, n) { + size_t p, n = 0; + for (StringPiece left = file; + (p = func(left, needles)) != StringPiece::npos; + left.advance(p + 1)) { + ++n; + } + doNotOptimizeAway(n); + } +} + template void findFirstOfRange(StringPiece needles, Func func, size_t n) { FOR_EACH_RANGE (i, 0, n) { @@ -146,6 +171,26 @@ void findFirstOfRange(StringPiece needles, Func func, size_t n) { } } +const string delims1 = "b"; + +BENCHMARK(FindFirstOf1NeedlesBase, n) { + findFirstOfRange(delims1, detail::qfind_first_byte_of, n); +} + +BENCHMARK_RELATIVE(FindFirstOf1NeedlesNoSSE, n) { + findFirstOfRange(delims1, detail::qfind_first_byte_of_nosse, n); +} + +BENCHMARK_RELATIVE(FindFirstOf1NeedlesStd, n) { + findFirstOfRange(delims1, qfind_first_byte_of_std, n); +} + +BENCHMARK_RELATIVE(FindFirstOf1NeedlesByteSet, n) { + findFirstOfRange(delims1, detail::qfind_first_byte_of_byteset, n); +} + +BENCHMARK_DRAW_LINE(); + const string delims2 = "bc"; BENCHMARK(FindFirstOf2NeedlesBase, n) { @@ -160,10 +205,6 @@ BENCHMARK_RELATIVE(FindFirstOf2NeedlesStd, n) { findFirstOfRange(delims2, qfind_first_byte_of_std, n); } -BENCHMARK_RELATIVE(FindFirstOf2NeedlesMemchr, n) { - findFirstOfRange(delims2, detail::qfind_first_byte_of_memchr, n); -} - BENCHMARK_RELATIVE(FindFirstOf2NeedlesByteSet, n) { findFirstOfRange(delims2, detail::qfind_first_byte_of_byteset, n); } @@ -184,10 +225,6 @@ BENCHMARK_RELATIVE(FindFirstOf4NeedlesStd, n) { findFirstOfRange(delims4, qfind_first_byte_of_std, n); } -BENCHMARK_RELATIVE(FindFirstOf4NeedlesMemchr, n) { - findFirstOfRange(delims4, detail::qfind_first_byte_of_memchr, n); -} - BENCHMARK_RELATIVE(FindFirstOf4NeedlesByteSet, n) { findFirstOfRange(delims4, detail::qfind_first_byte_of_byteset, n); } @@ -208,10 +245,6 @@ BENCHMARK_RELATIVE(FindFirstOf8NeedlesStd, n) { findFirstOfRange(delims8, qfind_first_byte_of_std, n); } -BENCHMARK_RELATIVE(FindFirstOf8NeedlesMemchr, n) { - findFirstOfRange(delims8, detail::qfind_first_byte_of_memchr, n); -} - BENCHMARK_RELATIVE(FindFirstOf8NeedlesByteSet, n) { findFirstOfRange(delims8, detail::qfind_first_byte_of_byteset, n); } @@ -232,10 +265,6 @@ BENCHMARK_RELATIVE(FindFirstOf16NeedlesStd, n) { findFirstOfRange(delims16, qfind_first_byte_of_std, n); } -BENCHMARK_RELATIVE(FindFirstOf16NeedlesMemchr, n) { - findFirstOfRange(delims16, detail::qfind_first_byte_of_memchr, n); -} - BENCHMARK_RELATIVE(FindFirstOf16NeedlesByteSet, n) { findFirstOfRange(delims16, detail::qfind_first_byte_of_byteset, n); } @@ -256,10 +285,6 @@ BENCHMARK_RELATIVE(FindFirstOf32NeedlesStd, n) { findFirstOfRange(delims32, qfind_first_byte_of_std, n); } -BENCHMARK_RELATIVE(FindFirstOf32NeedlesMemchr, n) { - findFirstOfRange(delims32, detail::qfind_first_byte_of_memchr, n); -} - BENCHMARK_RELATIVE(FindFirstOf32NeedlesByteSet, n) { findFirstOfRange(delims32, detail::qfind_first_byte_of_byteset, n); } @@ -281,10 +306,6 @@ BENCHMARK_RELATIVE(FindFirstOf64NeedlesStd, n) { findFirstOfRange(delims64, qfind_first_byte_of_std, n); } -BENCHMARK_RELATIVE(FindFirstOf64NeedlesMemchr, n) { - findFirstOfRange(delims64, detail::qfind_first_byte_of_memchr, n); -} - BENCHMARK_RELATIVE(FindFirstOf64NeedlesByteSet, n) { findFirstOfRange(delims64, detail::qfind_first_byte_of_byteset, n); } @@ -312,16 +333,30 @@ BENCHMARK_RELATIVE(FindFirstOfRandomStd, n) { findFirstOfRandom(qfind_first_byte_of_std, n); } -BENCHMARK_RELATIVE(FindFirstOfRandomMemchr, n) { - findFirstOfRandom(detail::qfind_first_byte_of_memchr, n); -} - BENCHMARK_RELATIVE(FindFirstOfRandomByteSet, n) { findFirstOfRandom(detail::qfind_first_byte_of_byteset, n); } BENCHMARK_DRAW_LINE(); +BENCHMARK(CountDelimsBase, n) { + countHits(detail::qfind_first_byte_of, n); +} + +BENCHMARK_RELATIVE(CountDelimsNoSSE, n) { + countHits(detail::qfind_first_byte_of_nosse, n); +} + +BENCHMARK_RELATIVE(CountDelimsStd, n) { + countHits(qfind_first_byte_of_std, n); +} + +BENCHMARK_RELATIVE(CountDelimsByteSet, n) { + countHits(detail::qfind_first_byte_of_byteset, n); +} + +BENCHMARK_DRAW_LINE(); + BENCHMARK(FindFirstOfOffsetRange, n) { StringPiece haystack(str); folly::StringPiece needles("bc"); @@ -342,6 +377,7 @@ int main(int argc, char** argv) { for (int len : {1, 8, 10, 16, 32, 64, 128, 256, 10*1024, 1024*1024}) { initStr(len); initDelims(len); + initFile(len); runBenchmarks(); } return 0; diff --git a/folly/test/RangeTest.cpp b/folly/test/RangeTest.cpp index 850a9a67..a66e65fb 100644 --- a/folly/test/RangeTest.cpp +++ b/folly/test/RangeTest.cpp @@ -34,9 +34,6 @@ namespace folly { namespace detail { // declaration of functions in Range.cpp -size_t qfind_first_byte_of_memchr(const StringPiece haystack, - const StringPiece needles); - size_t qfind_first_byte_of_byteset(const StringPiece haystack, const StringPiece needles); @@ -850,19 +847,14 @@ struct NoSseNeedleFinder { } }; -struct MemchrNeedleFinder { - static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) { - return detail::qfind_first_byte_of_memchr(haystack, needles); - } -}; - struct ByteSetNeedleFinder { static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) { return detail::qfind_first_byte_of_byteset(haystack, needles); } }; -typedef ::testing::Types NeedleFinders; TYPED_TEST_CASE(NeedleFinderTest, NeedleFinders);