From: Mike Curtiss Date: Fri, 23 Nov 2012 21:44:46 +0000 (-0800) Subject: Speed-up StringPiece::find_first_of() X-Git-Tag: v0.22.0~1093 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=06719763136dac97720baefbca7b4045afc8386a;p=folly.git Speed-up StringPiece::find_first_of() Summary: Wrote an SSE4.2-optimized version of find_first_of (>10x faster in some cases). For cases where SSE4.2 is not supported, rewrote find_first_of to use Aho/Hopcroft/Ullman's "sparse, lazy" set (which is faster than std::find_first_of in most cases). Note that the overhead of ifunc (especially the lack of inlining) means that the new implementations could be slightly slower for super-tiny strings, but the inflection point is around 3-4 characters in haystack, which seems reasonable. Test Plan: Added tests and benchmarks: string length 1: ============================================================================ folly/test/RangeFindBenchmark.cpp relative time/iter iters/s ============================================================================ FindSingleCharMemchr 5.91ns 169.16M FindSingleCharRange 130.02% 4.55ns 219.95M ---------------------------------------------------------------------------- FindFirstOf2NeedlesBase 11.37ns 87.98M FindFirstOf2NeedlesNoSSE 108.69% 10.46ns 95.63M FindFirstOf2NeedlesStd 147.04% 7.73ns 129.37M FindFirstOf2NeedlesMemchr 57.66% 19.71ns 50.73M FindFirstOf2NeedlesByteSet 83.32% 13.64ns 73.30M ---------------------------------------------------------------------------- FindFirstOf4NeedlesBase 10.91ns 91.64M FindFirstOf4NeedlesNoSSE 88.87% 12.28ns 81.45M FindFirstOf4NeedlesStd 114.28% 9.55ns 104.73M FindFirstOf4NeedlesMemchr 34.77% 31.38ns 31.87M FindFirstOf4NeedlesByteSet 60.00% 18.19ns 54.98M ---------------------------------------------------------------------------- FindFirstOf8NeedlesBase 10.91ns 91.64M FindFirstOf8NeedlesNoSSE 48.00% 22.73ns 43.99M FindFirstOf8NeedlesStd 54.54% 20.01ns 49.99M FindFirstOf8NeedlesMemchr 16.27% 67.06ns 14.91M FindFirstOf8NeedlesByteSet 39.99% 27.28ns 36.65M ---------------------------------------------------------------------------- FindFirstOf16NeedlesBase 10.91ns 91.64M FindFirstOf16NeedlesNoSSE 33.33% 32.74ns 30.54M FindFirstOf16NeedlesStd 36.36% 30.01ns 33.32M FindFirstOf16NeedlesMemchr 10.25% 106.42ns 9.40M FindFirstOf16NeedlesByteSet 24.00% 45.46ns 22.00M ---------------------------------------------------------------------------- FindFirstOf32NeedlesBase 18.91ns 52.89M FindFirstOf32NeedlesNoSSE 21.00% 90.02ns 11.11M FindFirstOf32NeedlesStd 39.99% 47.28ns 21.15M FindFirstOf32NeedlesMemchr 8.48% 223.04ns 4.48M FindFirstOf32NeedlesByteSet 22.35% 84.60ns 11.82M ---------------------------------------------------------------------------- FindFirstOf64NeedlesBase 25.92ns 38.58M FindFirstOf64NeedlesNoSSE 17.45% 148.51ns 6.73M FindFirstOf64NeedlesStd 33.93% 76.39ns 13.09M FindFirstOf64NeedlesMemchr 6.07% 426.94ns 2.34M FindFirstOf64NeedlesByteSet 18.10% 143.22ns 6.98M ---------------------------------------------------------------------------- FindFirstOfRandomBase 23.28ns 42.95M FindFirstOfRandomNoSSE 88.96% 26.17ns 38.21M FindFirstOfRandomStd 112.78% 20.64ns 48.44M FindFirstOfRandomMemchr 35.68% 65.24ns 15.33M FindFirstOfRandomByteSet 62.62% 37.18ns 26.90M ---------------------------------------------------------------------------- FindFirstOfOffsetRange 12.73ns 78.54M ---------------------------------------------------------------------------- ============================================================================ string length 8: ============================================================================ folly/test/RangeFindBenchmark.cpp relative time/iter iters/s ============================================================================ FindSingleCharMemchr 7.05ns 141.75M FindSingleCharRange 50.05% 14.10ns 70.95M ---------------------------------------------------------------------------- FindFirstOf2NeedlesBase 11.37ns 87.98M FindFirstOf2NeedlesNoSSE 53.04% 21.43ns 46.67M FindFirstOf2NeedlesStd 37.87% 30.01ns 33.32M FindFirstOf2NeedlesMemchr 54.81% 20.74ns 48.22M FindFirstOf2NeedlesByteSet 33.78% 33.65ns 29.72M ---------------------------------------------------------------------------- FindFirstOf4NeedlesBase 10.91ns 91.64M FindFirstOf4NeedlesNoSSE 25.53% 42.74ns 23.40M FindFirstOf4NeedlesStd 24.49% 44.56ns 22.44M FindFirstOf4NeedlesMemchr 33.66% 32.42ns 30.85M FindFirstOf4NeedlesByteSet 28.57% 38.19ns 26.18M ---------------------------------------------------------------------------- FindFirstOf8NeedlesBase 10.91ns 91.64M FindFirstOf8NeedlesNoSSE 21.05% 51.84ns 19.29M FindFirstOf8NeedlesStd 13.56% 80.48ns 12.43M FindFirstOf8NeedlesMemchr 17.32% 62.99ns 15.88M FindFirstOf8NeedlesByteSet 23.08% 47.28ns 21.15M ---------------------------------------------------------------------------- FindFirstOf16NeedlesBase 10.91ns 91.64M FindFirstOf16NeedlesNoSSE 15.58% 70.02ns 14.28M FindFirstOf16NeedlesStd 7.23% 150.84ns 6.63M FindFirstOf16NeedlesMemchr 9.52% 114.63ns 8.72M FindFirstOf16NeedlesByteSet 16.67% 65.47ns 15.27M ---------------------------------------------------------------------------- FindFirstOf32NeedlesBase 18.91ns 52.89M FindFirstOf32NeedlesNoSSE 18.42% 102.62ns 9.74M FindFirstOf32NeedlesStd 7.08% 266.97ns 3.75M FindFirstOf32NeedlesMemchr 8.43% 224.41ns 4.46M FindFirstOf32NeedlesByteSet 19.29% 98.00ns 10.20M ---------------------------------------------------------------------------- FindFirstOf64NeedlesBase 25.92ns 38.58M FindFirstOf64NeedlesNoSSE 16.13% 160.73ns 6.22M FindFirstOf64NeedlesStd 4.58% 565.53ns 1.77M FindFirstOf64NeedlesMemchr 6.05% 428.22ns 2.34M FindFirstOf64NeedlesByteSet 16.58% 156.33ns 6.40M ---------------------------------------------------------------------------- FindFirstOfRandomBase 23.28ns 42.96M FindFirstOfRandomNoSSE 44.00% 52.91ns 18.90M FindFirstOfRandomStd 24.62% 94.56ns 10.58M FindFirstOfRandomMemchr 30.88% 75.38ns 13.27M FindFirstOfRandomByteSet 43.33% 53.72ns 18.62M ---------------------------------------------------------------------------- FindFirstOfOffsetRange 12.73ns 78.54M ---------------------------------------------------------------------------- ============================================================================ string length 10: ============================================================================ folly/test/RangeFindBenchmark.cpp relative time/iter iters/s ============================================================================ FindSingleCharMemchr 7.06ns 141.61M FindSingleCharRange 41.98% 16.82ns 59.44M ---------------------------------------------------------------------------- FindFirstOf2NeedlesBase 11.37ns 87.98M FindFirstOf2NeedlesNoSSE 52.05% 21.84ns 45.79M FindFirstOf2NeedlesStd 31.25% 36.37ns 27.49M FindFirstOf2NeedlesMemchr 52.48% 21.66ns 46.17M FindFirstOf2NeedlesByteSet 29.07% 39.10ns 25.57M ---------------------------------------------------------------------------- FindFirstOf4NeedlesBase 10.91ns 91.64M FindFirstOf4NeedlesNoSSE 28.93% 37.71ns 26.52M FindFirstOf4NeedlesStd 20.00% 54.57ns 18.33M FindFirstOf4NeedlesMemchr 30.39% 35.91ns 27.85M FindFirstOf4NeedlesByteSet 25.00% 43.65ns 22.91M ---------------------------------------------------------------------------- FindFirstOf8NeedlesBase 10.91ns 91.64M FindFirstOf8NeedlesNoSSE 17.02% 64.12ns 15.60M FindFirstOf8NeedlesStd 11.16% 97.77ns 10.23M FindFirstOf8NeedlesMemchr 17.52% 62.30ns 16.05M FindFirstOf8NeedlesByteSet 25.00% 43.65ns 22.91M ---------------------------------------------------------------------------- FindFirstOf16NeedlesBase 10.91ns 91.64M FindFirstOf16NeedlesNoSSE 16.28% 67.02ns 14.92M FindFirstOf16NeedlesStd 5.98% 182.32ns 5.48M FindFirstOf16NeedlesMemchr 9.09% 120.06ns 8.33M FindFirstOf16NeedlesByteSet 17.65% 61.84ns 16.17M ---------------------------------------------------------------------------- FindFirstOf32NeedlesBase 19.10ns 52.36M FindFirstOf32NeedlesNoSSE 17.91% 106.63ns 9.38M FindFirstOf32NeedlesStd 5.79% 329.70ns 3.03M FindFirstOf32NeedlesMemchr 7.89% 241.91ns 4.13M FindFirstOf32NeedlesByteSet 18.92% 100.95ns 9.91M ---------------------------------------------------------------------------- FindFirstOf64NeedlesBase 26.15ns 38.24M FindFirstOf64NeedlesNoSSE 15.84% 165.05ns 6.06M FindFirstOf64NeedlesStd 3.71% 704.28ns 1.42M FindFirstOf64NeedlesMemchr 5.49% 476.63ns 2.10M FindFirstOf64NeedlesByteSet 16.48% 158.68ns 6.30M ---------------------------------------------------------------------------- FindFirstOfRandomBase 22.83ns 43.81M FindFirstOfRandomNoSSE 43.25% 52.78ns 18.95M FindFirstOfRandomStd 22.33% 102.23ns 9.78M FindFirstOfRandomMemchr 31.61% 72.23ns 13.85M FindFirstOfRandomByteSet 41.64% 54.82ns 18.24M ---------------------------------------------------------------------------- FindFirstOfOffsetRange 12.73ns 78.54M ---------------------------------------------------------------------------- ============================================================================ string length 16: ============================================================================ folly/test/RangeFindBenchmark.cpp relative time/iter iters/s ============================================================================ FindSingleCharMemchr 7.06ns 141.72M FindSingleCharRange 28.21% 25.01ns 39.98M ---------------------------------------------------------------------------- FindFirstOf2NeedlesBase 15.91ns 62.84M FindFirstOf2NeedlesNoSSE 72.89% 21.84ns 45.80M FindFirstOf2NeedlesStd 28.68% 55.48ns 18.02M FindFirstOf2NeedlesMemchr 74.47% 21.37ns 46.79M FindFirstOf2NeedlesByteSet 23.34% 68.19ns 14.66M ---------------------------------------------------------------------------- FindFirstOf4NeedlesBase 15.46ns 64.68M FindFirstOf4NeedlesNoSSE 40.77% 37.92ns 26.37M FindFirstOf4NeedlesStd 18.28% 84.59ns 11.82M FindFirstOf4NeedlesMemchr 42.97% 35.97ns 27.80M FindFirstOf4NeedlesByteSet 25.76% 60.02ns 16.66M ---------------------------------------------------------------------------- FindFirstOf8NeedlesBase 15.46ns 64.68M FindFirstOf8NeedlesNoSSE 24.03% 64.34ns 15.54M FindFirstOf8NeedlesStd 9.74% 158.74ns 6.30M FindFirstOf8NeedlesMemchr 24.55% 62.98ns 15.88M FindFirstOf8NeedlesByteSet 28.33% 54.57ns 18.33M ---------------------------------------------------------------------------- FindFirstOf16NeedlesBase 15.46ns 64.68M FindFirstOf16NeedlesNoSSE 19.83% 77.98ns 12.82M FindFirstOf16NeedlesStd 5.56% 277.82ns 3.60M FindFirstOf16NeedlesMemchr 12.95% 119.35ns 8.38M FindFirstOf16NeedlesByteSet 21.25% 72.75ns 13.75M ---------------------------------------------------------------------------- FindFirstOf32NeedlesBase 32.80ns 30.49M FindFirstOf32NeedlesNoSSE 27.86% 117.69ns 8.50M FindFirstOf32NeedlesStd 6.33% 517.97ns 1.93M FindFirstOf32NeedlesMemchr 13.72% 239.09ns 4.18M FindFirstOf32NeedlesByteSet 29.06% 112.85ns 8.86M ---------------------------------------------------------------------------- FindFirstOf64NeedlesBase 46.83ns 21.35M FindFirstOf64NeedlesNoSSE 26.68% 175.50ns 5.70M FindFirstOf64NeedlesStd 4.20% 1.11us 897.48K FindFirstOf64NeedlesMemchr 10.04% 466.39ns 2.14M FindFirstOf64NeedlesByteSet 27.47% 170.50ns 5.87M ---------------------------------------------------------------------------- FindFirstOfRandomBase 23.41ns 42.72M FindFirstOfRandomNoSSE 38.00% 61.61ns 16.23M FindFirstOfRandomStd 13.91% 168.34ns 5.94M FindFirstOfRandomMemchr 29.03% 80.64ns 12.40M FindFirstOfRandomByteSet 33.31% 70.28ns 14.23M ---------------------------------------------------------------------------- FindFirstOfOffsetRange 15.12ns 66.15M ---------------------------------------------------------------------------- ============================================================================ string length 32: ============================================================================ folly/test/RangeFindBenchmark.cpp relative time/iter iters/s ============================================================================ FindSingleCharMemchr 8.23ns 121.52M FindSingleCharRange 17.57% 46.83ns 21.35M ---------------------------------------------------------------------------- FindFirstOf2NeedlesBase 20.46ns 48.88M FindFirstOf2NeedlesNoSSE 82.29% 24.86ns 40.22M FindFirstOf2NeedlesStd 17.69% 115.65ns 8.65M FindFirstOf2NeedlesMemchr 85.17% 24.02ns 41.63M FindFirstOf2NeedlesByteSet 28.19% 72.58ns 13.78M ---------------------------------------------------------------------------- FindFirstOf4NeedlesBase 20.01ns 49.99M FindFirstOf4NeedlesNoSSE 48.57% 41.19ns 24.28M FindFirstOf4NeedlesStd 11.52% 173.72ns 5.76M FindFirstOf4NeedlesMemchr 50.55% 39.58ns 25.27M FindFirstOf4NeedlesByteSet 26.33% 75.99ns 13.16M ---------------------------------------------------------------------------- FindFirstOf8NeedlesBase 20.01ns 49.99M FindFirstOf8NeedlesNoSSE 26.94% 74.27ns 13.46M FindFirstOf8NeedlesStd 6.73% 297.31ns 3.36M FindFirstOf8NeedlesMemchr 27.44% 72.90ns 13.72M FindFirstOf8NeedlesByteSet 23.91% 83.66ns 11.95M ---------------------------------------------------------------------------- FindFirstOf16NeedlesBase 20.01ns 49.99M FindFirstOf16NeedlesNoSSE 18.37% 108.92ns 9.18M FindFirstOf16NeedlesStd 3.75% 532.80ns 1.88M FindFirstOf16NeedlesMemchr 14.53% 137.71ns 7.26M FindFirstOf16NeedlesByteSet 19.55% 102.32ns 9.77M ---------------------------------------------------------------------------- FindFirstOf32NeedlesBase 45.92ns 21.78M FindFirstOf32NeedlesNoSSE 31.17% 147.32ns 6.79M FindFirstOf32NeedlesStd 4.50% 1.02us 980.43K FindFirstOf32NeedlesMemchr 16.13% 284.64ns 3.51M FindFirstOf32NeedlesByteSet 32.63% 140.73ns 7.11M ---------------------------------------------------------------------------- FindFirstOf64NeedlesBase 68.20ns 14.66M FindFirstOf64NeedlesNoSSE 29.97% 227.55ns 4.39M FindFirstOf64NeedlesStd 3.08% 2.21us 452.08K FindFirstOf64NeedlesMemchr 12.51% 545.17ns 1.83M FindFirstOf64NeedlesByteSet 30.74% 221.86ns 4.51M ---------------------------------------------------------------------------- FindFirstOfRandomBase 29.99ns 33.35M FindFirstOfRandomNoSSE 45.10% 66.49ns 15.04M FindFirstOfRandomStd 10.28% 291.67ns 3.43M FindFirstOfRandomMemchr 34.56% 86.76ns 11.53M FindFirstOfRandomByteSet 28.64% 104.72ns 9.55M ---------------------------------------------------------------------------- FindFirstOfOffsetRange 19.55ns 51.15M ---------------------------------------------------------------------------- ============================================================================ string length 64: ============================================================================ folly/test/RangeFindBenchmark.cpp relative time/iter iters/s ============================================================================ FindSingleCharMemchr 10.91ns 91.65M FindSingleCharRange 13.26% 82.29ns 12.15M ---------------------------------------------------------------------------- FindFirstOf2NeedlesBase 29.56ns 33.83M FindFirstOf2NeedlesNoSSE 100.77% 29.33ns 34.09M FindFirstOf2NeedlesStd 13.59% 217.44ns 4.60M FindFirstOf2NeedlesMemchr 104.83% 28.19ns 35.47M FindFirstOf2NeedlesByteSet 22.01% 134.28ns 7.45M ---------------------------------------------------------------------------- FindFirstOf4NeedlesBase 29.10ns 34.36M FindFirstOf4NeedlesNoSSE 56.14% 51.84ns 19.29M FindFirstOf4NeedlesStd 8.72% 333.84ns 3.00M FindFirstOf4NeedlesMemchr 58.18% 50.02ns 19.99M FindFirstOf4NeedlesByteSet 19.73% 147.48ns 6.78M ---------------------------------------------------------------------------- FindFirstOf8NeedlesBase 29.10ns 34.36M FindFirstOf8NeedlesNoSSE 30.48% 95.48ns 10.47M FindFirstOf8NeedlesStd 5.07% 573.76ns 1.74M FindFirstOf8NeedlesMemchr 30.92% 94.11ns 10.63M FindFirstOf8NeedlesByteSet 19.26% 151.13ns 6.62M ---------------------------------------------------------------------------- FindFirstOf16NeedlesBase 29.10ns 34.36M FindFirstOf16NeedlesNoSSE 15.84% 183.68ns 5.44M FindFirstOf16NeedlesStd 2.79% 1.04us 959.63K FindFirstOf16NeedlesMemchr 16.04% 181.41ns 5.51M FindFirstOf16NeedlesByteSet 16.54% 175.95ns 5.68M ---------------------------------------------------------------------------- FindFirstOf32NeedlesBase 73.21ns 13.66M FindFirstOf32NeedlesNoSSE 32.76% 223.49ns 4.47M FindFirstOf32NeedlesStd 3.62% 2.02us 494.08K FindFirstOf32NeedlesMemchr 19.49% 375.70ns 2.66M FindFirstOf32NeedlesByteSet 33.45% 218.87ns 4.57M ---------------------------------------------------------------------------- FindFirstOf64NeedlesBase 109.95ns 9.09M FindFirstOf64NeedlesNoSSE 38.99% 282.01ns 3.55M FindFirstOf64NeedlesStd 2.49% 4.41us 226.78K FindFirstOf64NeedlesMemchr 15.21% 723.03ns 1.38M FindFirstOf64NeedlesByteSet 39.68% 277.13ns 3.61M ---------------------------------------------------------------------------- FindFirstOfRandomBase 40.57ns 24.65M FindFirstOfRandomNoSSE 47.65% 85.15ns 11.74M FindFirstOfRandomStd 7.62% 532.10ns 1.88M FindFirstOfRandomMemchr 39.23% 103.43ns 9.67M FindFirstOfRandomByteSet 22.95% 176.82ns 5.66M ---------------------------------------------------------------------------- FindFirstOfOffsetRange 28.65ns 34.91M ---------------------------------------------------------------------------- ============================================================================ string length 128: ============================================================================ folly/test/RangeFindBenchmark.cpp relative time/iter iters/s ============================================================================ FindSingleCharMemchr 16.37ns 61.09M FindSingleCharRange 11.62% 140.85ns 7.10M ---------------------------------------------------------------------------- FindFirstOf2NeedlesBase 47.74ns 20.95M FindFirstOf2NeedlesNoSSE 118.64% 40.24ns 24.85M FindFirstOf2NeedlesStd 11.33% 421.18ns 2.37M FindFirstOf2NeedlesMemchr 120.68% 39.56ns 25.28M FindFirstOf2NeedlesByteSet 21.47% 222.36ns 4.50M ---------------------------------------------------------------------------- FindFirstOf4NeedlesBase 47.28ns 21.15M FindFirstOf4NeedlesNoSSE 63.80% 74.11ns 13.49M FindFirstOf4NeedlesStd 7.23% 653.94ns 1.53M FindFirstOf4NeedlesMemchr 65.40% 72.30ns 13.83M FindFirstOf4NeedlesByteSet 19.96% 236.85ns 4.22M ---------------------------------------------------------------------------- FindFirstOf8NeedlesBase 47.28ns 21.15M FindFirstOf8NeedlesNoSSE 33.87% 139.59ns 7.16M FindFirstOf8NeedlesStd 4.20% 1.13us 887.82K FindFirstOf8NeedlesMemchr 34.43% 137.32ns 7.28M FindFirstOf8NeedlesByteSet 18.98% 249.17ns 4.01M ---------------------------------------------------------------------------- FindFirstOf16NeedlesBase 47.28ns 21.15M FindFirstOf16NeedlesNoSSE 16.83% 281.00ns 3.56M FindFirstOf16NeedlesStd 2.30% 2.06us 485.36K FindFirstOf16NeedlesMemchr 16.98% 278.50ns 3.59M FindFirstOf16NeedlesByteSet 15.75% 300.13ns 3.33M ---------------------------------------------------------------------------- FindFirstOf32NeedlesBase 128.45ns 7.79M FindFirstOf32NeedlesNoSSE 37.09% 346.28ns 2.89M FindFirstOf32NeedlesStd 3.19% 4.03us 248.02K FindFirstOf32NeedlesMemchr 23.13% 555.26ns 1.80M FindFirstOf32NeedlesByteSet 37.74% 340.32ns 2.94M ---------------------------------------------------------------------------- FindFirstOf64NeedlesBase 193.23ns 5.18M FindFirstOf64NeedlesNoSSE 47.76% 404.60ns 2.47M FindFirstOf64NeedlesStd 2.20% 8.80us 113.61K FindFirstOf64NeedlesMemchr 17.91% 1.08us 926.70K FindFirstOf64NeedlesByteSet 48.35% 399.64ns 2.50M ---------------------------------------------------------------------------- FindFirstOfRandomBase 59.66ns 16.76M FindFirstOfRandomNoSSE 53.67% 111.17ns 9.00M FindFirstOfRandomStd 6.41% 930.67ns 1.07M FindFirstOfRandomMemchr 46.01% 129.68ns 7.71M FindFirstOfRandomByteSet 19.80% 301.38ns 3.32M ---------------------------------------------------------------------------- FindFirstOfOffsetRange 46.83ns 21.35M ---------------------------------------------------------------------------- ============================================================================ string length 256: ============================================================================ folly/test/RangeFindBenchmark.cpp relative time/iter iters/s ============================================================================ FindSingleCharMemchr 27.28ns 36.65M FindSingleCharRange 10.62% 256.90ns 3.89M ---------------------------------------------------------------------------- FindFirstOf2NeedlesBase 61.39ns 16.29M FindFirstOf2NeedlesNoSSE 99.28% 61.84ns 16.17M FindFirstOf2NeedlesStd 7.41% 828.62ns 1.21M FindFirstOf2NeedlesMemchr 100.01% 61.39ns 16.29M FindFirstOf2NeedlesByteSet 15.36% 399.65ns 2.50M ---------------------------------------------------------------------------- FindFirstOf4NeedlesBase 83.65ns 11.95M FindFirstOf4NeedlesNoSSE 71.03% 117.77ns 8.49M FindFirstOf4NeedlesStd 6.46% 1.29us 772.77K FindFirstOf4NeedlesMemchr 72.14% 115.95ns 8.62M FindFirstOf4NeedlesByteSet 20.66% 404.81ns 2.47M ---------------------------------------------------------------------------- FindFirstOf8NeedlesBase 83.66ns 11.95M FindFirstOf8NeedlesNoSSE 35.38% 236.46ns 4.23M FindFirstOf8NeedlesStd 3.75% 2.23us 447.99K FindFirstOf8NeedlesMemchr 35.71% 234.26ns 4.27M FindFirstOf8NeedlesByteSet 20.13% 415.56ns 2.41M ---------------------------------------------------------------------------- FindFirstOf16NeedlesBase 83.66ns 11.95M FindFirstOf16NeedlesNoSSE 18.04% 463.82ns 2.16M FindFirstOf16NeedlesStd 2.04% 4.10us 244.06K FindFirstOf16NeedlesMemchr 18.14% 461.09ns 2.17M FindFirstOf16NeedlesByteSet 14.81% 564.87ns 1.77M ---------------------------------------------------------------------------- FindFirstOf32NeedlesBase 237.14ns 4.22M FindFirstOf32NeedlesNoSSE 38.92% 609.24ns 1.64M FindFirstOf32NeedlesStd 2.95% 8.05us 124.26K FindFirstOf32NeedlesMemchr 25.90% 915.44ns 1.09M FindFirstOf32NeedlesByteSet 39.21% 604.86ns 1.65M ---------------------------------------------------------------------------- FindFirstOf64NeedlesBase 360.78ns 2.77M FindFirstOf64NeedlesNoSSE 54.03% 667.71ns 1.50M FindFirstOf64NeedlesStd 2.05% 17.59us 56.86K FindFirstOf64NeedlesMemchr 20.04% 1.80us 555.45K FindFirstOf64NeedlesByteSet 54.61% 660.63ns 1.51M ---------------------------------------------------------------------------- FindFirstOfRandomBase 98.24ns 10.18M FindFirstOfRandomNoSSE 47.37% 207.40ns 4.82M FindFirstOfRandomStd 5.24% 1.88us 533.28K FindFirstOfRandomMemchr 39.75% 247.14ns 4.05M FindFirstOfRandomByteSet 17.69% 555.45ns 1.80M ---------------------------------------------------------------------------- FindFirstOfOffsetRange 62.75ns 15.94M ---------------------------------------------------------------------------- ============================================================================ string length 10240: ============================================================================ folly/test/RangeFindBenchmark.cpp relative time/iter iters/s ============================================================================ FindSingleCharMemchr 613.80ns 1.63M FindSingleCharRange 6.57% 9.34us 107.12K ---------------------------------------------------------------------------- FindFirstOf2NeedlesBase 1.23us 813.01K FindFirstOf2NeedlesNoSSE 100.01% 1.23us 813.07K FindFirstOf2NeedlesStd 3.77% 32.61us 30.67K FindFirstOf2NeedlesMemchr 100.08% 1.23us 813.67K FindFirstOf2NeedlesByteSet 8.65% 14.21us 70.37K ---------------------------------------------------------------------------- FindFirstOf4NeedlesBase 2.94us 340.63K FindFirstOf4NeedlesNoSSE 119.61% 2.45us 407.44K FindFirstOf4NeedlesStd 5.73% 51.23us 19.52K FindFirstOf4NeedlesMemchr 119.77% 2.45us 407.97K FindFirstOf4NeedlesByteSet 20.66% 14.21us 70.38K ---------------------------------------------------------------------------- FindFirstOf8NeedlesBase 2.94us 340.63K FindFirstOf8NeedlesNoSSE 59.95% 4.90us 204.21K FindFirstOf8NeedlesStd 3.32% 88.48us 11.30K FindFirstOf8NeedlesMemchr 59.96% 4.90us 204.25K FindFirstOf8NeedlesByteSet 20.68% 14.20us 70.43K ---------------------------------------------------------------------------- FindFirstOf16NeedlesBase 2.94us 340.63K FindFirstOf16NeedlesNoSSE 29.98% 9.79us 102.13K FindFirstOf16NeedlesStd 1.80% 162.97us 6.14K FindFirstOf16NeedlesMemchr 29.98% 9.79us 102.11K FindFirstOf16NeedlesByteSet 20.65% 14.22us 70.33K ---------------------------------------------------------------------------- FindFirstOf32NeedlesBase 8.77us 114.07K FindFirstOf32NeedlesNoSSE 44.71% 19.61us 51.00K FindFirstOf32NeedlesStd 2.73% 321.22us 3.11K FindFirstOf32NeedlesMemchr 43.44% 20.18us 49.55K FindFirstOf32NeedlesByteSet 44.67% 19.63us 50.95K ---------------------------------------------------------------------------- FindFirstOf64NeedlesBase 13.43us 74.44K FindFirstOf64NeedlesNoSSE 68.26% 19.68us 50.81K FindFirstOf64NeedlesStd 1.91% 702.62us 1.42K FindFirstOf64NeedlesMemchr 33.81% 39.74us 25.17K FindFirstOf64NeedlesByteSet 68.25% 19.68us 50.81K ---------------------------------------------------------------------------- FindFirstOfRandomBase 3.01us 331.81K FindFirstOfRandomNoSSE 75.38% 4.00us 250.10K FindFirstOfRandomStd 6.81% 44.25us 22.60K FindFirstOfRandomMemchr 76.46% 3.94us 253.71K FindFirstOfRandomByteSet 15.01% 20.08us 49.81K ---------------------------------------------------------------------------- FindFirstOfOffsetRange 1.23us 811.29K ---------------------------------------------------------------------------- ============================================================================ string length 1048576: ============================================================================ folly/test/RangeFindBenchmark.cpp relative time/iter iters/s ============================================================================ FindSingleCharMemchr 85.07us 11.76K FindSingleCharRange 8.92% 953.48us 1.05K ---------------------------------------------------------------------------- FindFirstOf2NeedlesBase 170.23us 5.87K FindFirstOf2NeedlesNoSSE 100.01% 170.21us 5.87K FindFirstOf2NeedlesStd 5.09% 3.34ms 299.18 FindFirstOf2NeedlesMemchr 100.02% 170.20us 5.88K FindFirstOf2NeedlesByteSet 11.64% 1.46ms 683.69 ---------------------------------------------------------------------------- FindFirstOf4NeedlesBase 298.04us 3.36K FindFirstOf4NeedlesNoSSE 87.48% 340.68us 2.94K FindFirstOf4NeedlesStd 5.68% 5.25ms 190.41 FindFirstOf4NeedlesMemchr 87.53% 340.51us 2.94K FindFirstOf4NeedlesByteSet 20.37% 1.46ms 683.55 ---------------------------------------------------------------------------- FindFirstOf8NeedlesBase 298.04us 3.36K FindFirstOf8NeedlesNoSSE 43.75% 681.27us 1.47K FindFirstOf8NeedlesStd 3.29% 9.07ms 110.24 FindFirstOf8NeedlesMemchr 43.74% 681.36us 1.47K FindFirstOf8NeedlesByteSet 20.37% 1.46ms 683.55 ---------------------------------------------------------------------------- FindFirstOf16NeedlesBase 298.03us 3.36K FindFirstOf16NeedlesNoSSE 21.83% 1.37ms 732.40 FindFirstOf16NeedlesStd 1.78% 16.72ms 59.81 FindFirstOf16NeedlesMemchr 21.83% 1.37ms 732.49 FindFirstOf16NeedlesByteSet 20.37% 1.46ms 683.60 ---------------------------------------------------------------------------- FindFirstOf32NeedlesBase 896.95us 1.11K FindFirstOf32NeedlesNoSSE 44.21% 2.03ms 492.89 FindFirstOf32NeedlesStd 2.67% 33.53ms 29.82 FindFirstOf32NeedlesMemchr 31.84% 2.82ms 354.97 FindFirstOf32NeedlesByteSet 44.25% 2.03ms 493.31 ---------------------------------------------------------------------------- FindFirstOf64NeedlesBase 1.38ms 725.72 FindFirstOf64NeedlesNoSSE 67.96% 2.03ms 493.18 FindFirstOf64NeedlesStd 1.90% 72.34ms 13.82 FindFirstOf64NeedlesMemchr 24.82% 5.55ms 180.11 FindFirstOf64NeedlesByteSet 67.97% 2.03ms 493.30 ---------------------------------------------------------------------------- FindFirstOfRandomBase 657.10us 1.52K FindFirstOfRandomNoSSE 31.60% 2.08ms 480.94 FindFirstOfRandomStd 2.05% 32.07ms 31.18 FindFirstOfRandomMemchr 24.06% 2.73ms 366.13 FindFirstOfRandomByteSet 31.56% 2.08ms 480.22 ---------------------------------------------------------------------------- FindFirstOfOffsetRange 170.28us 5.87K ---------------------------------------------------------------------------- ============================================================================ Reviewed By: philipp@fb.com FB internal diff: D638500 --- diff --git a/folly/Range.cpp b/folly/Range.cpp index 45ffd39d..4aee27de 100644 --- a/folly/Range.cpp +++ b/folly/Range.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2012 Facebook, Inc. + * Copyright 2013 Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,9 @@ #include "folly/Range.h" +#include "folly/CpuId.h" +#include "folly/Likely.h" + namespace folly { /** @@ -34,4 +37,156 @@ std::ostream& operator<<(std::ostream& os, const StringPiece& piece) { return os; } -} // namespace folly +namespace detail { +size_t qfind_first_byte_of_memchr(const StringPiece& haystack, + const StringPiece& needles) { + size_t best = haystack.size(); + for (char needle: needles) { + const void* ptr = memchr(haystack.data(), needle, best); + if (ptr) { + auto found = static_cast(ptr) - haystack.data(); + best = std::min(best, found); + } + } + if (best == haystack.size()) { + return StringPiece::npos; + } + return best; +} +} // namespace detail + +namespace { +// build sse4.2-optimized version even if -msse4.2 is not passed to GCC +size_t qfind_first_byte_of_needles16(const StringPiece& haystack, + const StringPiece& needles) + __attribute__ ((__target__("sse4.2"))); + +// helper method for case where needles.size() <= 16 +size_t qfind_first_byte_of_needles16(const StringPiece& haystack, + const StringPiece& needles) { + DCHECK_LE(needles.size(), 16); + if (needles.size() <= 2 && haystack.size() >= 256) { + // benchmarking shows that memchr beats out SSE for small needle-sets + // with large haystacks. + // TODO(mcurtiss): could this be because of unaligned SSE loads? + return detail::qfind_first_byte_of_memchr(haystack, needles); + } + auto arr2 = __builtin_ia32_loaddqu(needles.data()); + for (size_t i = 0; i < haystack.size(); i+= 16) { + auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i); + auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(), + arr1, haystack.size() - i, 0); + if (index < 16) { + return i + index; + } + } + return StringPiece::npos; +} + +size_t qfind_first_byte_of_sse42(const StringPiece& haystack, + const StringPiece& needles) + __attribute__ ((__target__("sse4.2"))); + +size_t qfind_first_byte_of_sse42(const StringPiece& haystack, + const StringPiece& needles) { + if (UNLIKELY(needles.empty() || haystack.empty())) { + return StringPiece::npos; + } else if (needles.size() <= 16) { + // we can save some unnecessary load instructions by optimizing for + // the common case of needles.size() <= 16 + return qfind_first_byte_of_needles16(haystack, needles); + } + + size_t index = haystack.size(); + for (size_t i = 0; i < haystack.size(); i += 16) { + size_t b = 16; + auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i); + for (size_t j = 0; j < needles.size(); j += 16) { + auto arr2 = __builtin_ia32_loaddqu(needles.data() + j); + auto index = __builtin_ia32_pcmpestri128(arr2, needles.size() - j, + arr1, haystack.size() - i, 0); + b = std::min(index, b); + } + if (b < 16) { + return i + b; + } + }; + return StringPiece::npos; +} + +typedef decltype(qfind_first_byte_of_sse42) Type_qfind_first_byte_of; + +// Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis +// of Computer Algorithms" (1974), but the best description is here: +// http://research.swtch.com/sparse +class FastByteSet { + public: + FastByteSet() : size_(0) { } // no init of arrays required! + + inline void add(uint8_t i) { + if (!contains(i)) { + dense_[size_] = i; + sparse_[i] = size_; + size_++; + } + } + inline bool contains(uint8_t i) const { + DCHECK_LE(size_, 256); + return sparse_[i] < size_ && dense_[sparse_[i]] == i; + } + + private: + uint16_t size_; // can't use uint8_t because it would overflow if all + // possible values were inserted. + uint8_t sparse_[256]; + uint8_t dense_[256]; +}; +} // namespace + +namespace detail { +size_t qfind_first_byte_of_byteset(const StringPiece& haystack, + const StringPiece& needles) { + FastByteSet s; + for (auto needle: needles) { + s.add(needle); + } + for (size_t index = 0; index < haystack.size(); ++index) { + if (s.contains(haystack[index])) { + return index; + } + } + return StringPiece::npos; +} + +size_t qfind_first_byte_of_nosse(const StringPiece& haystack, + const StringPiece& needles) { + if (UNLIKELY(needles.empty() || haystack.empty())) { + return StringPiece::npos; + } + // The thresholds below were empirically determined by benchmarking. + // This is not an exact science since it depends on the CPU, the size of + // needles, and the size of haystack. + if (haystack.size() == 1 || + (haystack.size() < 4 && needles.size() <= 16)) { + return qfind_first_of(haystack, needles, asciiCaseSensitive); + } else if ((needles.size() >= 4 && haystack.size() <= 10) || + (needles.size() >= 16 && haystack.size() <= 64) || + needles.size() >= 32) { + return qfind_first_byte_of_byteset(haystack, needles); + } + + return qfind_first_byte_of_memchr(haystack, needles); +} + +// This function is called on startup to resolve folly::qfind_first_byte_of +extern "C" Type_qfind_first_byte_of* qfind_first_byte_of_ifunc() { + return folly::CpuId().sse42() ? qfind_first_byte_of_sse42 : + qfind_first_byte_of_nosse; +} + +size_t qfind_first_byte_of(const StringPiece& haystack, + const StringPiece& needles) + __attribute__((ifunc("qfind_first_byte_of_ifunc"))); + +} // namespace detail +} // namespace folly diff --git a/folly/Range.h b/folly/Range.h index e214bee9..196e8d3d 100644 --- a/folly/Range.h +++ b/folly/Range.h @@ -1,5 +1,5 @@ /* - * Copyright 2012 Facebook, Inc. + * Copyright 2013 Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -592,12 +592,17 @@ size_t qfind(const Range& haystack, return std::string::npos; } +namespace detail { +size_t qfind_first_byte_of(const StringPiece& haystack, + const StringPiece& needles); +} // namespace detail + template size_t qfind_first_of(const Range & haystack, - const Range & needle, + const Range & needles, Comp eq) { auto ret = std::find_first_of(haystack.begin(), haystack.end(), - needle.begin(), needle.end(), + needles.begin(), needles.end(), eq); return ret == haystack.end() ? std::string::npos : ret - haystack.begin(); } @@ -649,10 +654,24 @@ inline size_t qfind(const Range& haystack, template size_t qfind_first_of(const Range& haystack, - const Range& needle) { - return qfind_first_of(haystack, needle, asciiCaseSensitive); + const Range& needles) { + return qfind_first_of(haystack, needles, asciiCaseSensitive); +} + +// specialization for StringPiece +template <> +inline size_t qfind_first_of(const Range& haystack, + const Range& needles) { + return detail::qfind_first_byte_of(haystack, needles); } +// specialization for ByteRange +template <> +inline size_t qfind_first_of(const Range& haystack, + const Range& needles) { + return detail::qfind_first_byte_of(StringPiece(haystack), + StringPiece(needles)); +} } // !namespace folly FOLLY_ASSUME_FBVECTOR_COMPATIBLE_1(folly::Range); diff --git a/folly/test/RangeFindBenchmark.cpp b/folly/test/RangeFindBenchmark.cpp index 654e00b2..031c67b8 100644 --- a/folly/test/RangeFindBenchmark.cpp +++ b/folly/test/RangeFindBenchmark.cpp @@ -19,8 +19,21 @@ #include "folly/Foreach.h" #include #include +#include #include +namespace folly { namespace detail { +// declaration of functions in Range.cpp +size_t qfind_first_byte_of_memchr(const StringPiece& haystack, + const StringPiece& needles); + +size_t qfind_first_byte_of_byteset(const StringPiece& haystack, + const StringPiece& needles); + +size_t qfind_first_byte_of_nosse(const StringPiece& haystack, + const StringPiece& needles); +}} + using namespace folly; using namespace std; @@ -36,6 +49,43 @@ void initStr(int len) { str.append(1, 'b'); } +std::mt19937 rnd; +string ffoTestString; +const size_t ffoDelimSize = 128; +vector ffoDelim; + +string generateString(int len) { + std::uniform_int_distribution validChar(1, 255); // no null-char + string ret; + while (len--) { + ret.push_back(validChar(rnd)); + } + return ret; +} + +void initDelims(int len) { + ffoDelim.clear(); + + string s(len - 1, '\0'); // find_first_of won't finish until last char + s.push_back('a'); + ffoTestString = s; + + for (int i = 0; i < ffoDelimSize; ++i) { + // most delimiter sets are pretty small, but occasionally there could + // be a big one. + auto n = rnd() % 8 + 1; + if (n == 8) { + n = 32; + } + auto s = generateString(n); + if (rnd() % 2) { + // ~half of tests will find a hit + s[rnd() % s.size()] = 'a'; // yes, this could mean 'a' is a duplicate + } + ffoDelim.push_back(s); + } +} + } // anonymous namespace BENCHMARK(FindSingleCharMemchr, n) { @@ -60,17 +110,198 @@ BENCHMARK_RELATIVE(FindSingleCharRange, n) { BENCHMARK_DRAW_LINE(); -BENCHMARK(FindFirstOfRange, n) { +// it's useful to compare our custom implementations vs. the standard library +inline size_t qfind_first_byte_of_std(const StringPiece& haystack, + const StringPiece& needles) { + return qfind_first_of(haystack, needles, asciiCaseSensitive); +} + +template +void findFirstOfRange(StringPiece needles, Func func, size_t n) { StringPiece haystack(str); - folly::StringPiece needles("bc"); - DCHECK_EQ(haystack.size() - 1, haystack.find_first_of(needles)); // it works! FOR_EACH_RANGE (i, 0, n) { - doNotOptimizeAway(haystack.find_first_of(needles)); + doNotOptimizeAway(func(haystack, needles)); char x = haystack[0]; doNotOptimizeAway(&x); } } +const string delims2 = "bc"; + +BENCHMARK(FindFirstOf2NeedlesBase, n) { + findFirstOfRange(delims2, detail::qfind_first_byte_of, n); +} + +BENCHMARK_RELATIVE(FindFirstOf2NeedlesNoSSE, n) { + findFirstOfRange(delims2, detail::qfind_first_byte_of_nosse, n); +} + +BENCHMARK_RELATIVE(FindFirstOf2NeedlesStd, n) { + findFirstOfRange(delims2, qfind_first_byte_of_std, n); +} + +BENCHMARK_RELATIVE(FindFirstOf2NeedlesMemchr, n) { + findFirstOfRange(delims2, detail::qfind_first_byte_of_memchr, n); +} + +BENCHMARK_RELATIVE(FindFirstOf2NeedlesByteSet, n) { + findFirstOfRange(delims2, detail::qfind_first_byte_of_byteset, n); +} + +BENCHMARK_DRAW_LINE(); + +const string delims4 = "bcde"; + +BENCHMARK(FindFirstOf4NeedlesBase, n) { + findFirstOfRange(delims4, detail::qfind_first_byte_of, n); +} + +BENCHMARK_RELATIVE(FindFirstOf4NeedlesNoSSE, n) { + findFirstOfRange(delims4, detail::qfind_first_byte_of_nosse, n); +} + +BENCHMARK_RELATIVE(FindFirstOf4NeedlesStd, n) { + findFirstOfRange(delims4, qfind_first_byte_of_std, n); +} + +BENCHMARK_RELATIVE(FindFirstOf4NeedlesMemchr, n) { + findFirstOfRange(delims4, detail::qfind_first_byte_of_memchr, n); +} + +BENCHMARK_RELATIVE(FindFirstOf4NeedlesByteSet, n) { + findFirstOfRange(delims4, detail::qfind_first_byte_of_byteset, n); +} + +BENCHMARK_DRAW_LINE(); + +const string delims8 = "0123456b"; + +BENCHMARK(FindFirstOf8NeedlesBase, n) { + findFirstOfRange(delims8, detail::qfind_first_byte_of, n); +} + +BENCHMARK_RELATIVE(FindFirstOf8NeedlesNoSSE, n) { + findFirstOfRange(delims8, detail::qfind_first_byte_of_nosse, n); +} + +BENCHMARK_RELATIVE(FindFirstOf8NeedlesStd, n) { + findFirstOfRange(delims8, qfind_first_byte_of_std, n); +} + +BENCHMARK_RELATIVE(FindFirstOf8NeedlesMemchr, n) { + findFirstOfRange(delims8, detail::qfind_first_byte_of_memchr, n); +} + +BENCHMARK_RELATIVE(FindFirstOf8NeedlesByteSet, n) { + findFirstOfRange(delims8, detail::qfind_first_byte_of_byteset, n); +} + +BENCHMARK_DRAW_LINE(); + +const string delims16 = "0123456789bcdefg"; + +BENCHMARK(FindFirstOf16NeedlesBase, n) { + findFirstOfRange(delims16, detail::qfind_first_byte_of, n); +} + +BENCHMARK_RELATIVE(FindFirstOf16NeedlesNoSSE, n) { + findFirstOfRange(delims16, detail::qfind_first_byte_of_nosse, n); +} + +BENCHMARK_RELATIVE(FindFirstOf16NeedlesStd, n) { + findFirstOfRange(delims16, qfind_first_byte_of_std, n); +} + +BENCHMARK_RELATIVE(FindFirstOf16NeedlesMemchr, n) { + findFirstOfRange(delims16, detail::qfind_first_byte_of_memchr, n); +} + +BENCHMARK_RELATIVE(FindFirstOf16NeedlesByteSet, n) { + findFirstOfRange(delims16, detail::qfind_first_byte_of_byteset, n); +} + +BENCHMARK_DRAW_LINE(); + +const string delims32 = "!bcdefghijklmnopqrstuvwxyz_012345"; + +BENCHMARK(FindFirstOf32NeedlesBase, n) { + findFirstOfRange(delims32, detail::qfind_first_byte_of, n); +} + +BENCHMARK_RELATIVE(FindFirstOf32NeedlesNoSSE, n) { + findFirstOfRange(delims32, detail::qfind_first_byte_of_nosse, n); +} + +BENCHMARK_RELATIVE(FindFirstOf32NeedlesStd, n) { + findFirstOfRange(delims32, qfind_first_byte_of_std, n); +} + +BENCHMARK_RELATIVE(FindFirstOf32NeedlesMemchr, n) { + findFirstOfRange(delims32, detail::qfind_first_byte_of_memchr, n); +} + +BENCHMARK_RELATIVE(FindFirstOf32NeedlesByteSet, n) { + findFirstOfRange(delims32, detail::qfind_first_byte_of_byteset, n); +} + +BENCHMARK_DRAW_LINE(); + +const string delims64 = "!bcdefghijklmnopqrstuvwxyz_" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ-0123456789$"; + +BENCHMARK(FindFirstOf64NeedlesBase, n) { + findFirstOfRange(delims64, detail::qfind_first_byte_of, n); +} + +BENCHMARK_RELATIVE(FindFirstOf64NeedlesNoSSE, n) { + findFirstOfRange(delims64, detail::qfind_first_byte_of_nosse, n); +} + +BENCHMARK_RELATIVE(FindFirstOf64NeedlesStd, n) { + findFirstOfRange(delims64, qfind_first_byte_of_std, n); +} + +BENCHMARK_RELATIVE(FindFirstOf64NeedlesMemchr, n) { + findFirstOfRange(delims64, detail::qfind_first_byte_of_memchr, n); +} + +BENCHMARK_RELATIVE(FindFirstOf64NeedlesByteSet, n) { + findFirstOfRange(delims64, detail::qfind_first_byte_of_byteset, n); +} + +BENCHMARK_DRAW_LINE(); + +template +void findFirstOfRandom(Func func, size_t iters) { + for (int i = 0; i < iters; ++i) { + auto test = i % ffoDelim.size(); + auto p = func(ffoTestString, ffoDelim[test]); + doNotOptimizeAway(p); + } +} + +BENCHMARK(FindFirstOfRandomBase, n) { + findFirstOfRandom(detail::qfind_first_byte_of, n); +} + +BENCHMARK_RELATIVE(FindFirstOfRandomNoSSE, n) { + findFirstOfRandom(detail::qfind_first_byte_of_nosse, n); +} + +BENCHMARK_RELATIVE(FindFirstOfRandomStd, n) { + findFirstOfRandom(qfind_first_byte_of_std, n); +} + +BENCHMARK_RELATIVE(FindFirstOfRandomMemchr, n) { + findFirstOfRandom(detail::qfind_first_byte_of_memchr, n); +} + +BENCHMARK_RELATIVE(FindFirstOfRandomByteSet, n) { + findFirstOfRandom(detail::qfind_first_byte_of_byteset, n); +} + +BENCHMARK_DRAW_LINE(); + BENCHMARK(FindFirstOfOffsetRange, n) { StringPiece haystack(str); folly::StringPiece needles("bc"); @@ -83,11 +314,14 @@ BENCHMARK(FindFirstOfOffsetRange, n) { } } +BENCHMARK_DRAW_LINE(); + int main(int argc, char** argv) { google::ParseCommandLineFlags(&argc, &argv, true); - for (int len : {1, 10, 256, 10*1024, 10*1024*1024}) { + for (int len : {1, 8, 10, 16, 32, 64, 128, 256, 10*1024, 1024*1024}) { initStr(len); + initDelims(len); runBenchmarks(); } return 0; diff --git a/folly/test/RangeTest.cpp b/folly/test/RangeTest.cpp index e1b055ef..cb8aec8a 100644 --- a/folly/test/RangeTest.cpp +++ b/folly/test/RangeTest.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2012 Facebook, Inc. + * Copyright 2013 Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,10 +18,24 @@ // @author Kristina Holst (kholst@fb.com) // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com) +#include +#include #include #include #include "folly/Range.h" +namespace folly { namespace detail { +// declaration of functions in Range.cpp +size_t qfind_first_byte_of_memchr(const StringPiece& haystack, + const StringPiece& needles); + +size_t qfind_first_byte_of_byteset(const StringPiece& haystack, + const StringPiece& needles); + +size_t qfind_first_byte_of_nosse(const StringPiece& haystack, + const StringPiece& needles); +}} + using namespace folly; using namespace std; @@ -199,4 +213,128 @@ TEST(StringPiece, ToByteRange) { static_cast(b.begin())); EXPECT_EQ(static_cast(a.end()), static_cast(b.end())); + + // and convert back again + StringPiece c(b); + EXPECT_EQ(a.begin(), c.begin()); + EXPECT_EQ(a.end(), c.end()); +} + +template +class NeedleFinderTest : public ::testing::Test { + public: + static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) { + return NeedleFinder::find_first_byte_of(haystack, needles); + } +}; + +struct SseNeedleFinder { + static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) { + // This will only use the SSE version if it is supported on this CPU + // (selected using ifunc). + return detail::qfind_first_byte_of(haystack, needles); + } +}; + +struct NoSseNeedleFinder { + static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) { + return detail::qfind_first_byte_of_nosse(haystack, needles); + } +}; + +struct MemchrNeedleFinder { + static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) { + return detail::qfind_first_byte_of_memchr(haystack, needles); + } +}; + +struct ByteSetNeedleFinder { + static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) { + return detail::qfind_first_byte_of_byteset(haystack, needles); + } +}; + +typedef ::testing::Types NeedleFinders; +TYPED_TEST_CASE(NeedleFinderTest, NeedleFinders); + +TYPED_TEST(NeedleFinderTest, Null) { + { // null characters in the string + string s(10, char(0)); + s[5] = 'b'; + string delims("abc"); + EXPECT_EQ(5, this->find_first_byte_of(s, delims)); + } + { // null characters in delim + string s("abc"); + string delims(10, char(0)); + delims[3] = 'c'; + delims[7] = 'b'; + EXPECT_EQ(1, this->find_first_byte_of(s, delims)); + } + { // range not terminated by null character + string buf = "abcdefghijklmnopqrstuvwxyz"; + StringPiece s(buf.data() + 5, 3); + StringPiece delims("z"); + EXPECT_EQ(string::npos, this->find_first_byte_of(s, delims)); + } +} + +TYPED_TEST(NeedleFinderTest, DelimDuplicates) { + string delims(1000, 'b'); + EXPECT_EQ(1, this->find_first_byte_of("abc", delims)); + EXPECT_EQ(string::npos, this->find_first_byte_of("ac", delims)); +} + +TYPED_TEST(NeedleFinderTest, Empty) { + string a = "abc"; + string b = ""; + EXPECT_EQ(string::npos, this->find_first_byte_of(a, b)); + EXPECT_EQ(string::npos, this->find_first_byte_of(b, a)); + EXPECT_EQ(string::npos, this->find_first_byte_of(b, b)); +} + +TYPED_TEST(NeedleFinderTest, Unaligned) { + // works correctly even if input buffers are not 16-byte aligned + string s = "0123456789ABCDEFGH"; + for (int i = 0; i < s.size(); ++i) { + StringPiece a(s.c_str() + i); + for (int j = 0; j < s.size(); ++j) { + StringPiece b(s.c_str() + j); + EXPECT_EQ((i > j) ? 0 : j - i, this->find_first_byte_of(a, b)); + } + } +} + +// for some algorithms (specifically those that create a set of needles), +// we check for the edge-case of _all_ possible needles being sought. +TYPED_TEST(NeedleFinderTest, Needles256) { + string needles; + const auto minValue = std::numeric_limits::min(); + const auto maxValue = std::numeric_limits::max(); + // make the size ~big to avoid any edge-case branches for tiny haystacks + const int haystackSize = 50; + for (int i = minValue; i <= maxValue; i++) { // <= + needles.push_back(i); + } + EXPECT_EQ(StringPiece::npos, this->find_first_byte_of("", needles)); + for (int i = minValue; i <= maxValue; i++) { + EXPECT_EQ(0, this->find_first_byte_of(string(haystackSize, i), needles)); + } + + needles.append("these are redundant characters"); + EXPECT_EQ(StringPiece::npos, this->find_first_byte_of("", needles)); + for (int i = minValue; i <= maxValue; i++) { + EXPECT_EQ(0, this->find_first_byte_of(string(haystackSize, i), needles)); + } +} + +TYPED_TEST(NeedleFinderTest, Base) { + for (int i = 0; i < 32; ++i) { + for (int j = 0; j < 32; ++j) { + string s = string(i, 'X') + "abca" + string(i, 'X'); + string delims = string(j, 'Y') + "a" + string(j, 'Y'); + EXPECT_EQ(i, this->find_first_byte_of(s, delims)); + } + } }