Speed up StringPiece::find_first_of()
author: Mike Curtiss <mcurtiss@fb.com>
Fri, 23 Nov 2012 21:44:46 +0000 (13:44 -0800)
committer: Jordan DeLong <jdelong@fb.com>
Sat, 19 Jan 2013 00:37:53 +0000 (16:37 -0800)
Summary:
Wrote an SSE4.2-optimized version of find_first_of (>10x faster in
some cases).  For cases where SSE4.2 is not supported, rewrote
find_first_of to use Aho/Hopcroft/Ullman's "sparse, lazy" set (which
is faster than std::find_first_of in most cases).

Note that the overhead of ifunc (especially the lack of inlining)
means that the new implementations could be slightly slower for
super-tiny strings, but the inflection point is around 3-4 characters
in the haystack, which seems reasonable.

Test Plan:
Added tests and benchmarks:

string length 1:
============================================================================
folly/test/RangeFindBenchmark.cpp               relative  time/iter  iters/s
============================================================================
FindSingleCharMemchr                                         5.91ns  169.16M
FindSingleCharRange                              130.02%     4.55ns  219.95M
----------------------------------------------------------------------------
FindFirstOf2NeedlesBase                                     11.37ns   87.98M
FindFirstOf2NeedlesNoSSE                         108.69%    10.46ns   95.63M
FindFirstOf2NeedlesStd                           147.04%     7.73ns  129.37M
FindFirstOf2NeedlesMemchr                         57.66%    19.71ns   50.73M
FindFirstOf2NeedlesByteSet                        83.32%    13.64ns   73.30M
----------------------------------------------------------------------------
FindFirstOf4NeedlesBase                                     10.91ns   91.64M
FindFirstOf4NeedlesNoSSE                          88.87%    12.28ns   81.45M
FindFirstOf4NeedlesStd                           114.28%     9.55ns  104.73M
FindFirstOf4NeedlesMemchr                         34.77%    31.38ns   31.87M
FindFirstOf4NeedlesByteSet                        60.00%    18.19ns   54.98M
----------------------------------------------------------------------------
FindFirstOf8NeedlesBase                                     10.91ns   91.64M
FindFirstOf8NeedlesNoSSE                          48.00%    22.73ns   43.99M
FindFirstOf8NeedlesStd                            54.54%    20.01ns   49.99M
FindFirstOf8NeedlesMemchr                         16.27%    67.06ns   14.91M
FindFirstOf8NeedlesByteSet                        39.99%    27.28ns   36.65M
----------------------------------------------------------------------------
FindFirstOf16NeedlesBase                                    10.91ns   91.64M
FindFirstOf16NeedlesNoSSE                         33.33%    32.74ns   30.54M
FindFirstOf16NeedlesStd                           36.36%    30.01ns   33.32M
FindFirstOf16NeedlesMemchr                        10.25%   106.42ns    9.40M
FindFirstOf16NeedlesByteSet                       24.00%    45.46ns   22.00M
----------------------------------------------------------------------------
FindFirstOf32NeedlesBase                                    18.91ns   52.89M
FindFirstOf32NeedlesNoSSE                         21.00%    90.02ns   11.11M
FindFirstOf32NeedlesStd                           39.99%    47.28ns   21.15M
FindFirstOf32NeedlesMemchr                         8.48%   223.04ns    4.48M
FindFirstOf32NeedlesByteSet                       22.35%    84.60ns   11.82M
----------------------------------------------------------------------------
FindFirstOf64NeedlesBase                                    25.92ns   38.58M
FindFirstOf64NeedlesNoSSE                         17.45%   148.51ns    6.73M
FindFirstOf64NeedlesStd                           33.93%    76.39ns   13.09M
FindFirstOf64NeedlesMemchr                         6.07%   426.94ns    2.34M
FindFirstOf64NeedlesByteSet                       18.10%   143.22ns    6.98M
----------------------------------------------------------------------------
FindFirstOfRandomBase                                       23.28ns   42.95M
FindFirstOfRandomNoSSE                            88.96%    26.17ns   38.21M
FindFirstOfRandomStd                             112.78%    20.64ns   48.44M
FindFirstOfRandomMemchr                           35.68%    65.24ns   15.33M
FindFirstOfRandomByteSet                          62.62%    37.18ns   26.90M
----------------------------------------------------------------------------
FindFirstOfOffsetRange                                      12.73ns   78.54M
----------------------------------------------------------------------------
============================================================================
string length 8:
============================================================================
folly/test/RangeFindBenchmark.cpp               relative  time/iter  iters/s
============================================================================
FindSingleCharMemchr                                         7.05ns  141.75M
FindSingleCharRange                               50.05%    14.10ns   70.95M
----------------------------------------------------------------------------
FindFirstOf2NeedlesBase                                     11.37ns   87.98M
FindFirstOf2NeedlesNoSSE                          53.04%    21.43ns   46.67M
FindFirstOf2NeedlesStd                            37.87%    30.01ns   33.32M
FindFirstOf2NeedlesMemchr                         54.81%    20.74ns   48.22M
FindFirstOf2NeedlesByteSet                        33.78%    33.65ns   29.72M
----------------------------------------------------------------------------
FindFirstOf4NeedlesBase                                     10.91ns   91.64M
FindFirstOf4NeedlesNoSSE                          25.53%    42.74ns   23.40M
FindFirstOf4NeedlesStd                            24.49%    44.56ns   22.44M
FindFirstOf4NeedlesMemchr                         33.66%    32.42ns   30.85M
FindFirstOf4NeedlesByteSet                        28.57%    38.19ns   26.18M
----------------------------------------------------------------------------
FindFirstOf8NeedlesBase                                     10.91ns   91.64M
FindFirstOf8NeedlesNoSSE                          21.05%    51.84ns   19.29M
FindFirstOf8NeedlesStd                            13.56%    80.48ns   12.43M
FindFirstOf8NeedlesMemchr                         17.32%    62.99ns   15.88M
FindFirstOf8NeedlesByteSet                        23.08%    47.28ns   21.15M
----------------------------------------------------------------------------
FindFirstOf16NeedlesBase                                    10.91ns   91.64M
FindFirstOf16NeedlesNoSSE                         15.58%    70.02ns   14.28M
FindFirstOf16NeedlesStd                            7.23%   150.84ns    6.63M
FindFirstOf16NeedlesMemchr                         9.52%   114.63ns    8.72M
FindFirstOf16NeedlesByteSet                       16.67%    65.47ns   15.27M
----------------------------------------------------------------------------
FindFirstOf32NeedlesBase                                    18.91ns   52.89M
FindFirstOf32NeedlesNoSSE                         18.42%   102.62ns    9.74M
FindFirstOf32NeedlesStd                            7.08%   266.97ns    3.75M
FindFirstOf32NeedlesMemchr                         8.43%   224.41ns    4.46M
FindFirstOf32NeedlesByteSet                       19.29%    98.00ns   10.20M
----------------------------------------------------------------------------
FindFirstOf64NeedlesBase                                    25.92ns   38.58M
FindFirstOf64NeedlesNoSSE                         16.13%   160.73ns    6.22M
FindFirstOf64NeedlesStd                            4.58%   565.53ns    1.77M
FindFirstOf64NeedlesMemchr                         6.05%   428.22ns    2.34M
FindFirstOf64NeedlesByteSet                       16.58%   156.33ns    6.40M
----------------------------------------------------------------------------
FindFirstOfRandomBase                                       23.28ns   42.96M
FindFirstOfRandomNoSSE                            44.00%    52.91ns   18.90M
FindFirstOfRandomStd                              24.62%    94.56ns   10.58M
FindFirstOfRandomMemchr                           30.88%    75.38ns   13.27M
FindFirstOfRandomByteSet                          43.33%    53.72ns   18.62M
----------------------------------------------------------------------------
FindFirstOfOffsetRange                                      12.73ns   78.54M
----------------------------------------------------------------------------
============================================================================
string length 10:
============================================================================
folly/test/RangeFindBenchmark.cpp               relative  time/iter  iters/s
============================================================================
FindSingleCharMemchr                                         7.06ns  141.61M
FindSingleCharRange                               41.98%    16.82ns   59.44M
----------------------------------------------------------------------------
FindFirstOf2NeedlesBase                                     11.37ns   87.98M
FindFirstOf2NeedlesNoSSE                          52.05%    21.84ns   45.79M
FindFirstOf2NeedlesStd                            31.25%    36.37ns   27.49M
FindFirstOf2NeedlesMemchr                         52.48%    21.66ns   46.17M
FindFirstOf2NeedlesByteSet                        29.07%    39.10ns   25.57M
----------------------------------------------------------------------------
FindFirstOf4NeedlesBase                                     10.91ns   91.64M
FindFirstOf4NeedlesNoSSE                          28.93%    37.71ns   26.52M
FindFirstOf4NeedlesStd                            20.00%    54.57ns   18.33M
FindFirstOf4NeedlesMemchr                         30.39%    35.91ns   27.85M
FindFirstOf4NeedlesByteSet                        25.00%    43.65ns   22.91M
----------------------------------------------------------------------------
FindFirstOf8NeedlesBase                                     10.91ns   91.64M
FindFirstOf8NeedlesNoSSE                          17.02%    64.12ns   15.60M
FindFirstOf8NeedlesStd                            11.16%    97.77ns   10.23M
FindFirstOf8NeedlesMemchr                         17.52%    62.30ns   16.05M
FindFirstOf8NeedlesByteSet                        25.00%    43.65ns   22.91M
----------------------------------------------------------------------------
FindFirstOf16NeedlesBase                                    10.91ns   91.64M
FindFirstOf16NeedlesNoSSE                         16.28%    67.02ns   14.92M
FindFirstOf16NeedlesStd                            5.98%   182.32ns    5.48M
FindFirstOf16NeedlesMemchr                         9.09%   120.06ns    8.33M
FindFirstOf16NeedlesByteSet                       17.65%    61.84ns   16.17M
----------------------------------------------------------------------------
FindFirstOf32NeedlesBase                                    19.10ns   52.36M
FindFirstOf32NeedlesNoSSE                         17.91%   106.63ns    9.38M
FindFirstOf32NeedlesStd                            5.79%   329.70ns    3.03M
FindFirstOf32NeedlesMemchr                         7.89%   241.91ns    4.13M
FindFirstOf32NeedlesByteSet                       18.92%   100.95ns    9.91M
----------------------------------------------------------------------------
FindFirstOf64NeedlesBase                                    26.15ns   38.24M
FindFirstOf64NeedlesNoSSE                         15.84%   165.05ns    6.06M
FindFirstOf64NeedlesStd                            3.71%   704.28ns    1.42M
FindFirstOf64NeedlesMemchr                         5.49%   476.63ns    2.10M
FindFirstOf64NeedlesByteSet                       16.48%   158.68ns    6.30M
----------------------------------------------------------------------------
FindFirstOfRandomBase                                       22.83ns   43.81M
FindFirstOfRandomNoSSE                            43.25%    52.78ns   18.95M
FindFirstOfRandomStd                              22.33%   102.23ns    9.78M
FindFirstOfRandomMemchr                           31.61%    72.23ns   13.85M
FindFirstOfRandomByteSet                          41.64%    54.82ns   18.24M
----------------------------------------------------------------------------
FindFirstOfOffsetRange                                      12.73ns   78.54M
----------------------------------------------------------------------------
============================================================================
string length 16:
============================================================================
folly/test/RangeFindBenchmark.cpp               relative  time/iter  iters/s
============================================================================
FindSingleCharMemchr                                         7.06ns  141.72M
FindSingleCharRange                               28.21%    25.01ns   39.98M
----------------------------------------------------------------------------
FindFirstOf2NeedlesBase                                     15.91ns   62.84M
FindFirstOf2NeedlesNoSSE                          72.89%    21.84ns   45.80M
FindFirstOf2NeedlesStd                            28.68%    55.48ns   18.02M
FindFirstOf2NeedlesMemchr                         74.47%    21.37ns   46.79M
FindFirstOf2NeedlesByteSet                        23.34%    68.19ns   14.66M
----------------------------------------------------------------------------
FindFirstOf4NeedlesBase                                     15.46ns   64.68M
FindFirstOf4NeedlesNoSSE                          40.77%    37.92ns   26.37M
FindFirstOf4NeedlesStd                            18.28%    84.59ns   11.82M
FindFirstOf4NeedlesMemchr                         42.97%    35.97ns   27.80M
FindFirstOf4NeedlesByteSet                        25.76%    60.02ns   16.66M
----------------------------------------------------------------------------
FindFirstOf8NeedlesBase                                     15.46ns   64.68M
FindFirstOf8NeedlesNoSSE                          24.03%    64.34ns   15.54M
FindFirstOf8NeedlesStd                             9.74%   158.74ns    6.30M
FindFirstOf8NeedlesMemchr                         24.55%    62.98ns   15.88M
FindFirstOf8NeedlesByteSet                        28.33%    54.57ns   18.33M
----------------------------------------------------------------------------
FindFirstOf16NeedlesBase                                    15.46ns   64.68M
FindFirstOf16NeedlesNoSSE                         19.83%    77.98ns   12.82M
FindFirstOf16NeedlesStd                            5.56%   277.82ns    3.60M
FindFirstOf16NeedlesMemchr                        12.95%   119.35ns    8.38M
FindFirstOf16NeedlesByteSet                       21.25%    72.75ns   13.75M
----------------------------------------------------------------------------
FindFirstOf32NeedlesBase                                    32.80ns   30.49M
FindFirstOf32NeedlesNoSSE                         27.86%   117.69ns    8.50M
FindFirstOf32NeedlesStd                            6.33%   517.97ns    1.93M
FindFirstOf32NeedlesMemchr                        13.72%   239.09ns    4.18M
FindFirstOf32NeedlesByteSet                       29.06%   112.85ns    8.86M
----------------------------------------------------------------------------
FindFirstOf64NeedlesBase                                    46.83ns   21.35M
FindFirstOf64NeedlesNoSSE                         26.68%   175.50ns    5.70M
FindFirstOf64NeedlesStd                            4.20%     1.11us  897.48K
FindFirstOf64NeedlesMemchr                        10.04%   466.39ns    2.14M
FindFirstOf64NeedlesByteSet                       27.47%   170.50ns    5.87M
----------------------------------------------------------------------------
FindFirstOfRandomBase                                       23.41ns   42.72M
FindFirstOfRandomNoSSE                            38.00%    61.61ns   16.23M
FindFirstOfRandomStd                              13.91%   168.34ns    5.94M
FindFirstOfRandomMemchr                           29.03%    80.64ns   12.40M
FindFirstOfRandomByteSet                          33.31%    70.28ns   14.23M
----------------------------------------------------------------------------
FindFirstOfOffsetRange                                      15.12ns   66.15M
----------------------------------------------------------------------------
============================================================================
string length 32:
============================================================================
folly/test/RangeFindBenchmark.cpp               relative  time/iter  iters/s
============================================================================
FindSingleCharMemchr                                         8.23ns  121.52M
FindSingleCharRange                               17.57%    46.83ns   21.35M
----------------------------------------------------------------------------
FindFirstOf2NeedlesBase                                     20.46ns   48.88M
FindFirstOf2NeedlesNoSSE                          82.29%    24.86ns   40.22M
FindFirstOf2NeedlesStd                            17.69%   115.65ns    8.65M
FindFirstOf2NeedlesMemchr                         85.17%    24.02ns   41.63M
FindFirstOf2NeedlesByteSet                        28.19%    72.58ns   13.78M
----------------------------------------------------------------------------
FindFirstOf4NeedlesBase                                     20.01ns   49.99M
FindFirstOf4NeedlesNoSSE                          48.57%    41.19ns   24.28M
FindFirstOf4NeedlesStd                            11.52%   173.72ns    5.76M
FindFirstOf4NeedlesMemchr                         50.55%    39.58ns   25.27M
FindFirstOf4NeedlesByteSet                        26.33%    75.99ns   13.16M
----------------------------------------------------------------------------
FindFirstOf8NeedlesBase                                     20.01ns   49.99M
FindFirstOf8NeedlesNoSSE                          26.94%    74.27ns   13.46M
FindFirstOf8NeedlesStd                             6.73%   297.31ns    3.36M
FindFirstOf8NeedlesMemchr                         27.44%    72.90ns   13.72M
FindFirstOf8NeedlesByteSet                        23.91%    83.66ns   11.95M
----------------------------------------------------------------------------
FindFirstOf16NeedlesBase                                    20.01ns   49.99M
FindFirstOf16NeedlesNoSSE                         18.37%   108.92ns    9.18M
FindFirstOf16NeedlesStd                            3.75%   532.80ns    1.88M
FindFirstOf16NeedlesMemchr                        14.53%   137.71ns    7.26M
FindFirstOf16NeedlesByteSet                       19.55%   102.32ns    9.77M
----------------------------------------------------------------------------
FindFirstOf32NeedlesBase                                    45.92ns   21.78M
FindFirstOf32NeedlesNoSSE                         31.17%   147.32ns    6.79M
FindFirstOf32NeedlesStd                            4.50%     1.02us  980.43K
FindFirstOf32NeedlesMemchr                        16.13%   284.64ns    3.51M
FindFirstOf32NeedlesByteSet                       32.63%   140.73ns    7.11M
----------------------------------------------------------------------------
FindFirstOf64NeedlesBase                                    68.20ns   14.66M
FindFirstOf64NeedlesNoSSE                         29.97%   227.55ns    4.39M
FindFirstOf64NeedlesStd                            3.08%     2.21us  452.08K
FindFirstOf64NeedlesMemchr                        12.51%   545.17ns    1.83M
FindFirstOf64NeedlesByteSet                       30.74%   221.86ns    4.51M
----------------------------------------------------------------------------
FindFirstOfRandomBase                                       29.99ns   33.35M
FindFirstOfRandomNoSSE                            45.10%    66.49ns   15.04M
FindFirstOfRandomStd                              10.28%   291.67ns    3.43M
FindFirstOfRandomMemchr                           34.56%    86.76ns   11.53M
FindFirstOfRandomByteSet                          28.64%   104.72ns    9.55M
----------------------------------------------------------------------------
FindFirstOfOffsetRange                                      19.55ns   51.15M
----------------------------------------------------------------------------
============================================================================
string length 64:
============================================================================
folly/test/RangeFindBenchmark.cpp               relative  time/iter  iters/s
============================================================================
FindSingleCharMemchr                                        10.91ns   91.65M
FindSingleCharRange                               13.26%    82.29ns   12.15M
----------------------------------------------------------------------------
FindFirstOf2NeedlesBase                                     29.56ns   33.83M
FindFirstOf2NeedlesNoSSE                         100.77%    29.33ns   34.09M
FindFirstOf2NeedlesStd                            13.59%   217.44ns    4.60M
FindFirstOf2NeedlesMemchr                        104.83%    28.19ns   35.47M
FindFirstOf2NeedlesByteSet                        22.01%   134.28ns    7.45M
----------------------------------------------------------------------------
FindFirstOf4NeedlesBase                                     29.10ns   34.36M
FindFirstOf4NeedlesNoSSE                          56.14%    51.84ns   19.29M
FindFirstOf4NeedlesStd                             8.72%   333.84ns    3.00M
FindFirstOf4NeedlesMemchr                         58.18%    50.02ns   19.99M
FindFirstOf4NeedlesByteSet                        19.73%   147.48ns    6.78M
----------------------------------------------------------------------------
FindFirstOf8NeedlesBase                                     29.10ns   34.36M
FindFirstOf8NeedlesNoSSE                          30.48%    95.48ns   10.47M
FindFirstOf8NeedlesStd                             5.07%   573.76ns    1.74M
FindFirstOf8NeedlesMemchr                         30.92%    94.11ns   10.63M
FindFirstOf8NeedlesByteSet                        19.26%   151.13ns    6.62M
----------------------------------------------------------------------------
FindFirstOf16NeedlesBase                                    29.10ns   34.36M
FindFirstOf16NeedlesNoSSE                         15.84%   183.68ns    5.44M
FindFirstOf16NeedlesStd                            2.79%     1.04us  959.63K
FindFirstOf16NeedlesMemchr                        16.04%   181.41ns    5.51M
FindFirstOf16NeedlesByteSet                       16.54%   175.95ns    5.68M
----------------------------------------------------------------------------
FindFirstOf32NeedlesBase                                    73.21ns   13.66M
FindFirstOf32NeedlesNoSSE                         32.76%   223.49ns    4.47M
FindFirstOf32NeedlesStd                            3.62%     2.02us  494.08K
FindFirstOf32NeedlesMemchr                        19.49%   375.70ns    2.66M
FindFirstOf32NeedlesByteSet                       33.45%   218.87ns    4.57M
----------------------------------------------------------------------------
FindFirstOf64NeedlesBase                                   109.95ns    9.09M
FindFirstOf64NeedlesNoSSE                         38.99%   282.01ns    3.55M
FindFirstOf64NeedlesStd                            2.49%     4.41us  226.78K
FindFirstOf64NeedlesMemchr                        15.21%   723.03ns    1.38M
FindFirstOf64NeedlesByteSet                       39.68%   277.13ns    3.61M
----------------------------------------------------------------------------
FindFirstOfRandomBase                                       40.57ns   24.65M
FindFirstOfRandomNoSSE                            47.65%    85.15ns   11.74M
FindFirstOfRandomStd                               7.62%   532.10ns    1.88M
FindFirstOfRandomMemchr                           39.23%   103.43ns    9.67M
FindFirstOfRandomByteSet                          22.95%   176.82ns    5.66M
----------------------------------------------------------------------------
FindFirstOfOffsetRange                                      28.65ns   34.91M
----------------------------------------------------------------------------
============================================================================
string length 128:
============================================================================
folly/test/RangeFindBenchmark.cpp               relative  time/iter  iters/s
============================================================================
FindSingleCharMemchr                                        16.37ns   61.09M
FindSingleCharRange                               11.62%   140.85ns    7.10M
----------------------------------------------------------------------------
FindFirstOf2NeedlesBase                                     47.74ns   20.95M
FindFirstOf2NeedlesNoSSE                         118.64%    40.24ns   24.85M
FindFirstOf2NeedlesStd                            11.33%   421.18ns    2.37M
FindFirstOf2NeedlesMemchr                        120.68%    39.56ns   25.28M
FindFirstOf2NeedlesByteSet                        21.47%   222.36ns    4.50M
----------------------------------------------------------------------------
FindFirstOf4NeedlesBase                                     47.28ns   21.15M
FindFirstOf4NeedlesNoSSE                          63.80%    74.11ns   13.49M
FindFirstOf4NeedlesStd                             7.23%   653.94ns    1.53M
FindFirstOf4NeedlesMemchr                         65.40%    72.30ns   13.83M
FindFirstOf4NeedlesByteSet                        19.96%   236.85ns    4.22M
----------------------------------------------------------------------------
FindFirstOf8NeedlesBase                                     47.28ns   21.15M
FindFirstOf8NeedlesNoSSE                          33.87%   139.59ns    7.16M
FindFirstOf8NeedlesStd                             4.20%     1.13us  887.82K
FindFirstOf8NeedlesMemchr                         34.43%   137.32ns    7.28M
FindFirstOf8NeedlesByteSet                        18.98%   249.17ns    4.01M
----------------------------------------------------------------------------
FindFirstOf16NeedlesBase                                    47.28ns   21.15M
FindFirstOf16NeedlesNoSSE                         16.83%   281.00ns    3.56M
FindFirstOf16NeedlesStd                            2.30%     2.06us  485.36K
FindFirstOf16NeedlesMemchr                        16.98%   278.50ns    3.59M
FindFirstOf16NeedlesByteSet                       15.75%   300.13ns    3.33M
----------------------------------------------------------------------------
FindFirstOf32NeedlesBase                                   128.45ns    7.79M
FindFirstOf32NeedlesNoSSE                         37.09%   346.28ns    2.89M
FindFirstOf32NeedlesStd                            3.19%     4.03us  248.02K
FindFirstOf32NeedlesMemchr                        23.13%   555.26ns    1.80M
FindFirstOf32NeedlesByteSet                       37.74%   340.32ns    2.94M
----------------------------------------------------------------------------
FindFirstOf64NeedlesBase                                   193.23ns    5.18M
FindFirstOf64NeedlesNoSSE                         47.76%   404.60ns    2.47M
FindFirstOf64NeedlesStd                            2.20%     8.80us  113.61K
FindFirstOf64NeedlesMemchr                        17.91%     1.08us  926.70K
FindFirstOf64NeedlesByteSet                       48.35%   399.64ns    2.50M
----------------------------------------------------------------------------
FindFirstOfRandomBase                                       59.66ns   16.76M
FindFirstOfRandomNoSSE                            53.67%   111.17ns    9.00M
FindFirstOfRandomStd                               6.41%   930.67ns    1.07M
FindFirstOfRandomMemchr                           46.01%   129.68ns    7.71M
FindFirstOfRandomByteSet                          19.80%   301.38ns    3.32M
----------------------------------------------------------------------------
FindFirstOfOffsetRange                                      46.83ns   21.35M
----------------------------------------------------------------------------
============================================================================
string length 256:
============================================================================
folly/test/RangeFindBenchmark.cpp               relative  time/iter  iters/s
============================================================================
FindSingleCharMemchr                                        27.28ns   36.65M
FindSingleCharRange                               10.62%   256.90ns    3.89M
----------------------------------------------------------------------------
FindFirstOf2NeedlesBase                                     61.39ns   16.29M
FindFirstOf2NeedlesNoSSE                          99.28%    61.84ns   16.17M
FindFirstOf2NeedlesStd                             7.41%   828.62ns    1.21M
FindFirstOf2NeedlesMemchr                        100.01%    61.39ns   16.29M
FindFirstOf2NeedlesByteSet                        15.36%   399.65ns    2.50M
----------------------------------------------------------------------------
FindFirstOf4NeedlesBase                                     83.65ns   11.95M
FindFirstOf4NeedlesNoSSE                          71.03%   117.77ns    8.49M
FindFirstOf4NeedlesStd                             6.46%     1.29us  772.77K
FindFirstOf4NeedlesMemchr                         72.14%   115.95ns    8.62M
FindFirstOf4NeedlesByteSet                        20.66%   404.81ns    2.47M
----------------------------------------------------------------------------
FindFirstOf8NeedlesBase                                     83.66ns   11.95M
FindFirstOf8NeedlesNoSSE                          35.38%   236.46ns    4.23M
FindFirstOf8NeedlesStd                             3.75%     2.23us  447.99K
FindFirstOf8NeedlesMemchr                         35.71%   234.26ns    4.27M
FindFirstOf8NeedlesByteSet                        20.13%   415.56ns    2.41M
----------------------------------------------------------------------------
FindFirstOf16NeedlesBase                                    83.66ns   11.95M
FindFirstOf16NeedlesNoSSE                         18.04%   463.82ns    2.16M
FindFirstOf16NeedlesStd                            2.04%     4.10us  244.06K
FindFirstOf16NeedlesMemchr                        18.14%   461.09ns    2.17M
FindFirstOf16NeedlesByteSet                       14.81%   564.87ns    1.77M
----------------------------------------------------------------------------
FindFirstOf32NeedlesBase                                   237.14ns    4.22M
FindFirstOf32NeedlesNoSSE                         38.92%   609.24ns    1.64M
FindFirstOf32NeedlesStd                            2.95%     8.05us  124.26K
FindFirstOf32NeedlesMemchr                        25.90%   915.44ns    1.09M
FindFirstOf32NeedlesByteSet                       39.21%   604.86ns    1.65M
----------------------------------------------------------------------------
FindFirstOf64NeedlesBase                                   360.78ns    2.77M
FindFirstOf64NeedlesNoSSE                         54.03%   667.71ns    1.50M
FindFirstOf64NeedlesStd                            2.05%    17.59us   56.86K
FindFirstOf64NeedlesMemchr                        20.04%     1.80us  555.45K
FindFirstOf64NeedlesByteSet                       54.61%   660.63ns    1.51M
----------------------------------------------------------------------------
FindFirstOfRandomBase                                       98.24ns   10.18M
FindFirstOfRandomNoSSE                            47.37%   207.40ns    4.82M
FindFirstOfRandomStd                               5.24%     1.88us  533.28K
FindFirstOfRandomMemchr                           39.75%   247.14ns    4.05M
FindFirstOfRandomByteSet                          17.69%   555.45ns    1.80M
----------------------------------------------------------------------------
FindFirstOfOffsetRange                                      62.75ns   15.94M
----------------------------------------------------------------------------
============================================================================
string length 10240:
============================================================================
folly/test/RangeFindBenchmark.cpp               relative  time/iter  iters/s
============================================================================
FindSingleCharMemchr                                       613.80ns    1.63M
FindSingleCharRange                                6.57%     9.34us  107.12K
----------------------------------------------------------------------------
FindFirstOf2NeedlesBase                                      1.23us  813.01K
FindFirstOf2NeedlesNoSSE                         100.01%     1.23us  813.07K
FindFirstOf2NeedlesStd                             3.77%    32.61us   30.67K
FindFirstOf2NeedlesMemchr                        100.08%     1.23us  813.67K
FindFirstOf2NeedlesByteSet                         8.65%    14.21us   70.37K
----------------------------------------------------------------------------
FindFirstOf4NeedlesBase                                      2.94us  340.63K
FindFirstOf4NeedlesNoSSE                         119.61%     2.45us  407.44K
FindFirstOf4NeedlesStd                             5.73%    51.23us   19.52K
FindFirstOf4NeedlesMemchr                        119.77%     2.45us  407.97K
FindFirstOf4NeedlesByteSet                        20.66%    14.21us   70.38K
----------------------------------------------------------------------------
FindFirstOf8NeedlesBase                                      2.94us  340.63K
FindFirstOf8NeedlesNoSSE                          59.95%     4.90us  204.21K
FindFirstOf8NeedlesStd                             3.32%    88.48us   11.30K
FindFirstOf8NeedlesMemchr                         59.96%     4.90us  204.25K
FindFirstOf8NeedlesByteSet                        20.68%    14.20us   70.43K
----------------------------------------------------------------------------
FindFirstOf16NeedlesBase                                     2.94us  340.63K
FindFirstOf16NeedlesNoSSE                         29.98%     9.79us  102.13K
FindFirstOf16NeedlesStd                            1.80%   162.97us    6.14K
FindFirstOf16NeedlesMemchr                        29.98%     9.79us  102.11K
FindFirstOf16NeedlesByteSet                       20.65%    14.22us   70.33K
----------------------------------------------------------------------------
FindFirstOf32NeedlesBase                                     8.77us  114.07K
FindFirstOf32NeedlesNoSSE                         44.71%    19.61us   51.00K
FindFirstOf32NeedlesStd                            2.73%   321.22us    3.11K
FindFirstOf32NeedlesMemchr                        43.44%    20.18us   49.55K
FindFirstOf32NeedlesByteSet                       44.67%    19.63us   50.95K
----------------------------------------------------------------------------
FindFirstOf64NeedlesBase                                    13.43us   74.44K
FindFirstOf64NeedlesNoSSE                         68.26%    19.68us   50.81K
FindFirstOf64NeedlesStd                            1.91%   702.62us    1.42K
FindFirstOf64NeedlesMemchr                        33.81%    39.74us   25.17K
FindFirstOf64NeedlesByteSet                       68.25%    19.68us   50.81K
----------------------------------------------------------------------------
FindFirstOfRandomBase                                        3.01us  331.81K
FindFirstOfRandomNoSSE                            75.38%     4.00us  250.10K
FindFirstOfRandomStd                               6.81%    44.25us   22.60K
FindFirstOfRandomMemchr                           76.46%     3.94us  253.71K
FindFirstOfRandomByteSet                          15.01%    20.08us   49.81K
----------------------------------------------------------------------------
FindFirstOfOffsetRange                                       1.23us  811.29K
----------------------------------------------------------------------------
============================================================================
string length 1048576:
============================================================================
folly/test/RangeFindBenchmark.cpp               relative  time/iter  iters/s
============================================================================
FindSingleCharMemchr                                        85.07us   11.76K
FindSingleCharRange                                8.92%   953.48us    1.05K
----------------------------------------------------------------------------
FindFirstOf2NeedlesBase                                    170.23us    5.87K
FindFirstOf2NeedlesNoSSE                         100.01%   170.21us    5.87K
FindFirstOf2NeedlesStd                             5.09%     3.34ms   299.18
FindFirstOf2NeedlesMemchr                        100.02%   170.20us    5.88K
FindFirstOf2NeedlesByteSet                        11.64%     1.46ms   683.69
----------------------------------------------------------------------------
FindFirstOf4NeedlesBase                                    298.04us    3.36K
FindFirstOf4NeedlesNoSSE                          87.48%   340.68us    2.94K
FindFirstOf4NeedlesStd                             5.68%     5.25ms   190.41
FindFirstOf4NeedlesMemchr                         87.53%   340.51us    2.94K
FindFirstOf4NeedlesByteSet                        20.37%     1.46ms   683.55
----------------------------------------------------------------------------
FindFirstOf8NeedlesBase                                    298.04us    3.36K
FindFirstOf8NeedlesNoSSE                          43.75%   681.27us    1.47K
FindFirstOf8NeedlesStd                             3.29%     9.07ms   110.24
FindFirstOf8NeedlesMemchr                         43.74%   681.36us    1.47K
FindFirstOf8NeedlesByteSet                        20.37%     1.46ms   683.55
----------------------------------------------------------------------------
FindFirstOf16NeedlesBase                                   298.03us    3.36K
FindFirstOf16NeedlesNoSSE                         21.83%     1.37ms   732.40
FindFirstOf16NeedlesStd                            1.78%    16.72ms    59.81
FindFirstOf16NeedlesMemchr                        21.83%     1.37ms   732.49
FindFirstOf16NeedlesByteSet                       20.37%     1.46ms   683.60
----------------------------------------------------------------------------
FindFirstOf32NeedlesBase                                   896.95us    1.11K
FindFirstOf32NeedlesNoSSE                         44.21%     2.03ms   492.89
FindFirstOf32NeedlesStd                            2.67%    33.53ms    29.82
FindFirstOf32NeedlesMemchr                        31.84%     2.82ms   354.97
FindFirstOf32NeedlesByteSet                       44.25%     2.03ms   493.31
----------------------------------------------------------------------------
FindFirstOf64NeedlesBase                                     1.38ms   725.72
FindFirstOf64NeedlesNoSSE                         67.96%     2.03ms   493.18
FindFirstOf64NeedlesStd                            1.90%    72.34ms    13.82
FindFirstOf64NeedlesMemchr                        24.82%     5.55ms   180.11
FindFirstOf64NeedlesByteSet                       67.97%     2.03ms   493.30
----------------------------------------------------------------------------
FindFirstOfRandomBase                                      657.10us    1.52K
FindFirstOfRandomNoSSE                            31.60%     2.08ms   480.94
FindFirstOfRandomStd                               2.05%    32.07ms    31.18
FindFirstOfRandomMemchr                           24.06%     2.73ms   366.13
FindFirstOfRandomByteSet                          31.56%     2.08ms   480.22
----------------------------------------------------------------------------
FindFirstOfOffsetRange                                     170.28us    5.87K
----------------------------------------------------------------------------
============================================================================

Reviewed By: philipp@fb.com

FB internal diff: D638500

folly/Range.cpp
folly/Range.h
folly/test/RangeFindBenchmark.cpp
folly/test/RangeTest.cpp

index 45ffd39d445d64ca96c815ad8fa00c4b9f1c24b7..4aee27de6bdbb7e2dc9c78c0de8fda0068f1cca8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2012 Facebook, Inc.
+ * Copyright 2013 Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,9 @@
 
 #include "folly/Range.h"
 
+#include "folly/CpuId.h"
+#include "folly/Likely.h"
+
 namespace folly {
 
 /**
@@ -34,4 +37,156 @@ std::ostream& operator<<(std::ostream& os, const StringPiece& piece) {
   return os;
 }
 
-} // namespace folly
+namespace detail {
+// Returns the smallest index in `haystack` holding any byte of `needles`, or
+// StringPiece::npos when no such byte exists (including when either range is
+// empty). Runs one memchr() per needle, shrinking the search window to the
+// best hit found so far.
+size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
+                                  const StringPiece& needles) {
+  if (haystack.empty()) {
+    // Nothing to search. Returning here also avoids handing memchr() the
+    // data pointer of an empty range, which may be null.
+    return StringPiece::npos;
+  }
+  size_t best = haystack.size();
+  for (char needle: needles) {
+    const void* ptr = memchr(haystack.data(), needle, best);
+    if (ptr) {
+      // memchr() only inspected the first `best` bytes, so this hit is
+      // strictly earlier than every previous one.
+      best = static_cast<const char*>(ptr) - haystack.data();
+      if (best == 0) {
+        break;  // a match on the first byte cannot be improved upon
+      }
+    }
+  }
+  if (best == haystack.size()) {
+    return StringPiece::npos;
+  }
+  return best;
+}
+}  // namespace detail
+
+namespace {
+// build sse4.2-optimized version even if -msse4.2 is not passed to GCC
+size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
+                                     const StringPiece& needles)
+  __attribute__ ((__target__("sse4.2")));
+
+// helper method for case where needles.size() <= 16
+//
+// Returns the index of the first byte of `haystack` that equals any byte of
+// `needles`, or StringPiece::npos if there is none. The haystack is compared
+// 16 bytes at a time with PCMPESTRI.
+size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
+                                     const StringPiece& needles) {
+  DCHECK_LE(needles.size(), 16);
+  if (needles.size() <= 2 && haystack.size() >= 256) {
+    // benchmarking shows that memchr beats out SSE for small needle-sets
+    // with large haystacks.
+    // TODO(mcurtiss): could this be because of unaligned SSE loads?
+    return detail::qfind_first_byte_of_memchr(haystack, needles);
+  }
+
+  // A 16-byte vector load reads all 16 bytes regardless of the explicit
+  // length later given to PCMPESTRI, so loading straight from a shorter
+  // buffer reads past its end. Stage short inputs in zero-padded local
+  // buffers so every load stays inside memory we own.
+  char needleBuf[16] = {0};
+  if (!needles.empty()) {
+    memcpy(needleBuf, needles.data(), needles.size());
+  }
+  auto arr2 = __builtin_ia32_loaddqu(needleBuf);
+
+  const size_t hsize = haystack.size();
+  size_t i = 0;
+  // Complete 16-byte blocks can be loaded in place.
+  for (; i + 16 <= hsize; i += 16) {
+    auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
+    auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
+                                             arr1, 16, 0);
+    if (index < 16) {
+      return i + index;
+    }
+  }
+  // Trailing partial block (fewer than 16 bytes left): compare a copy.
+  if (i < hsize) {
+    char tail[16] = {0};
+    memcpy(tail, haystack.data() + i, hsize - i);
+    auto arr1 = __builtin_ia32_loaddqu(tail);
+    auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
+                                             arr1, hsize - i, 0);
+    if (index < 16) {
+      return i + index;
+    }
+  }
+  return StringPiece::npos;
+}
+
+size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
+                                 const StringPiece& needles)
+  __attribute__ ((__target__("sse4.2")));
+
+// SSE4.2 implementation of find-first-of for byte ranges.
+//
+// Returns the index of the first byte of `haystack` that equals any byte of
+// `needles`, or StringPiece::npos if there is none or either range is empty.
+// Each 16-byte haystack block is compared against every 16-byte chunk of
+// needles with PCMPESTRI and the lowest matching offset wins.
+size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
+                                 const StringPiece& needles) {
+  if (UNLIKELY(needles.empty() || haystack.empty())) {
+    return StringPiece::npos;
+  } else if (needles.size() <= 16) {
+    // we can save some unnecessary load instructions by optimizing for
+    // the common case of needles.size() <= 16
+    return qfind_first_byte_of_needles16(haystack, needles);
+  }
+
+  const size_t hsize = haystack.size();
+  const size_t nsize = needles.size();
+
+  // A 16-byte vector load reads all 16 bytes no matter what explicit length
+  // PCMPESTRI is given, so the final partial chunk of either input must not
+  // be loaded in place. Needle bytes that sit in complete chunks are loaded
+  // directly; the remainder is staged once in a zero-padded buffer.
+  const size_t needlesInFullChunks = nsize - (nsize % 16);
+  char needleTail[16] = {0};
+  memcpy(needleTail, needles.data() + needlesInFullChunks,
+         nsize - needlesInFullChunks);
+
+  char haystackTail[16] = {0};
+  for (size_t i = 0; i < hsize; i += 16) {
+    const size_t hlen = std::min<size_t>(hsize - i, 16);
+    const char* block = haystack.data() + i;
+    if (hlen < 16) {
+      // Only the last block can be short; compare a padded copy instead.
+      memcpy(haystackTail, block, hlen);
+      block = haystackTail;
+    }
+    auto arr1 = __builtin_ia32_loaddqu(block);
+
+    size_t b = 16;  // lowest match offset within this block; 16 == no match
+    for (size_t j = 0; j < nsize; j += 16) {
+      const char* chunk =
+        (j < needlesInFullChunks) ? needles.data() + j : needleTail;
+      const size_t nlen = std::min<size_t>(nsize - j, 16);
+      auto arr2 = __builtin_ia32_loaddqu(chunk);
+      auto hit = __builtin_ia32_pcmpestri128(arr2, nlen, arr1, hlen, 0);
+      b = std::min<size_t>(hit, b);
+    }
+    if (b < 16) {
+      return i + b;
+    }
+  }
+  return StringPiece::npos;
+}
+
+typedef decltype(qfind_first_byte_of_sse42) Type_qfind_first_byte_of;
+
+// Set of byte values built on the "sparse set" trick that Aho, Hopcroft, and
+// Ullman mention in "The Design and Analysis of Computer Algorithms" (1974);
+// the most readable write-up is http://research.swtch.com/sparse
+//
+// dense_[0 .. size_) lists the members in insertion order and sparse_[v]
+// records where v was stored in dense_. A value is a member only when the
+// two arrays agree, so neither array has to be initialized up front.
+class FastByteSet {
+ public:
+  FastByteSet() : size_(0) { }  // arrays are deliberately left uninitialized
+
+  // True iff `i` was previously passed to add().
+  inline bool contains(uint8_t i) const {
+    DCHECK_LE(size_, 256);
+    const uint8_t slot = sparse_[i];
+    // `slot` may be garbage for a value never added; it only counts when it
+    // points inside the populated prefix of dense_ and dense_ points back.
+    return slot < size_ && dense_[slot] == i;
+  }
+
+  // Inserts `i`; adding a value that is already present is a no-op.
+  inline void add(uint8_t i) {
+    if (contains(i)) {
+      return;
+    }
+    dense_[size_] = i;
+    sparse_[i] = size_;
+    ++size_;
+  }
+
+ private:
+  uint16_t size_;  // uint8_t would wrap to 0 once all 256 values are present
+  uint8_t sparse_[256];
+  uint8_t dense_[256];
+};
+}  // namespace
+
+namespace detail {
+// Collects every needle byte into a FastByteSet, then walks the haystack and
+// returns the index of the first byte that is a member of the set, or
+// StringPiece::npos if no haystack byte is.
+size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
+                                   const StringPiece& needles) {
+  FastByteSet needleSet;
+  for (auto c: needles) {
+    needleSet.add(c);
+  }
+  size_t pos = 0;
+  for (auto c: haystack) {
+    if (needleSet.contains(c)) {
+      return pos;
+    }
+    ++pos;
+  }
+  return StringPiece::npos;
+}
+
+// find-first-of for byte ranges without SSE4.2: picks between the generic
+// qfind_first_of(), the byte-set scan and the memchr scan based on the sizes
+// of the two ranges. Returns StringPiece::npos if either range is empty.
+size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
+                                 const StringPiece& needles) {
+  const size_t hlen = haystack.size();
+  const size_t nlen = needles.size();
+  if (UNLIKELY(nlen == 0 || hlen == 0)) {
+    return StringPiece::npos;
+  }
+
+  // The thresholds below were empirically determined by benchmarking.
+  // This is not an exact science since it depends on the CPU, the size of
+  // needles, and the size of haystack.
+  const bool tinyHaystack = hlen == 1 || (hlen < 4 && nlen <= 16);
+  if (tinyHaystack) {
+    return qfind_first_of(haystack, needles, asciiCaseSensitive);
+  }
+
+  const bool preferByteSet = (nlen >= 4 && hlen <= 10) ||
+                             (nlen >= 16 && hlen <= 64) ||
+                             nlen >= 32;
+  if (preferByteSet) {
+    return qfind_first_byte_of_byteset(haystack, needles);
+  }
+
+  return qfind_first_byte_of_memchr(haystack, needles);
+}
+
+// ifunc resolver for folly::detail::qfind_first_byte_of (called on startup):
+// returns the SSE4.2 implementation when CpuId reports SSE4.2 support and
+// the non-SSE implementation otherwise.
+extern "C" Type_qfind_first_byte_of* qfind_first_byte_of_ifunc() {
+  if (folly::CpuId().sse42()) {
+    return qfind_first_byte_of_sse42;
+  }
+  return qfind_first_byte_of_nosse;
+}
+
+size_t qfind_first_byte_of(const StringPiece& haystack,
+                           const StringPiece& needles)
+  __attribute__((ifunc("qfind_first_byte_of_ifunc")));
+
+}  // namespace detail
+}  // namespace folly
index e214bee91c86f57197c240bd8bc19c1b52bb0c66..196e8d3dc7d29285f4b7bf31a54eeb7e326d17c5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2012 Facebook, Inc.
+ * Copyright 2013 Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -592,12 +592,17 @@ size_t qfind(const Range<T>& haystack,
   return std::string::npos;
 }
 
+namespace detail {
+size_t qfind_first_byte_of(const StringPiece& haystack,
+                           const StringPiece& needles);
+} // namespace detail
+
 template <class T, class Comp>
 size_t qfind_first_of(const Range<T> & haystack,
-                      const Range<T> & needle,
+                      const Range<T> & needles,
                       Comp eq) {
   auto ret = std::find_first_of(haystack.begin(), haystack.end(),
-                                needle.begin(), needle.end(),
+                                needles.begin(), needles.end(),
                                 eq);
   return ret == haystack.end() ? std::string::npos : ret - haystack.begin();
 }
@@ -649,10 +654,24 @@ inline size_t qfind(const Range<const unsigned char*>& haystack,
 
 template <class T>
 size_t qfind_first_of(const Range<T>& haystack,
-                      const Range<T>& needle) {
-  return qfind_first_of(haystack, needle, asciiCaseSensitive);
+                      const Range<T>& needles) {
+  return qfind_first_of(haystack, needles, asciiCaseSensitive);
+}
+
+// specialization for StringPiece
+template <>
+inline size_t qfind_first_of(const Range<const char*>& haystack,
+                             const Range<const char*>& needles) {
+  return detail::qfind_first_byte_of(haystack, needles);
 }
 
+// specialization for ByteRange: view both ranges as StringPiece and reuse
+// the byte-oriented search.
+template <>
+inline size_t qfind_first_of(const Range<const unsigned char*>& haystack,
+                             const Range<const unsigned char*>& needles) {
+  const StringPiece haystackPiece(haystack);
+  const StringPiece needlesPiece(needles);
+  return detail::qfind_first_byte_of(haystackPiece, needlesPiece);
+}
 }  // !namespace folly
 
 FOLLY_ASSUME_FBVECTOR_COMPATIBLE_1(folly::Range);
index 654e00b2c649f97eb705cd81770e6cb7c0e0748a..031c67b8e4bbcc41a496ca5605162b5fe2e9b5f2 100644 (file)
 #include "folly/Foreach.h"
 #include <algorithm>
 #include <iostream>
+#include <random>
 #include <string>
 
+namespace folly { namespace detail {
+// declaration of functions in Range.cpp
+size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
+                                  const StringPiece& needles);
+
+size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
+                                   const StringPiece& needles);
+
+size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
+                                 const StringPiece& needles);
+}}
+
 using namespace folly;
 using namespace std;
 
@@ -36,6 +49,43 @@ void initStr(int len) {
   str.append(1, 'b');
 }
 
+std::mt19937 rnd;
+string ffoTestString;
+const size_t ffoDelimSize = 128;
+vector<string> ffoDelim;
+
+// Returns a string of `len` pseudo-random bytes in [1, 255] drawn from the
+// file-level `rnd` engine. A non-positive `len` yields an empty string.
+string generateString(int len) {
+  std::uniform_int_distribution<uint32_t> validChar(1, 255);  // no null-char
+  string ret;
+  if (len <= 0) {
+    // the old `while (len--)` would count down through INT_MIN here
+    return ret;
+  }
+  ret.reserve(len);
+  for (int i = 0; i < len; ++i) {
+    ret.push_back(static_cast<char>(validChar(rnd)));
+  }
+  return ret;
+}
+
+// Rebuilds the fixtures for the random find_first_of benchmarks:
+// `ffoTestString` becomes (len - 1) NUL bytes followed by 'a', and `ffoDelim`
+// is refilled with `ffoDelimSize` random delimiter sets.
+void initDelims(int len) {
+  ffoDelim.clear();
+  ffoDelim.reserve(ffoDelimSize);
+
+  // find_first_of won't finish until last char. Clamp the padding so that
+  // len <= 0 cannot wrap around to an enormous size_t.
+  ffoTestString.assign(len > 0 ? len - 1 : 0, '\0');
+  ffoTestString.push_back('a');
+
+  for (size_t i = 0; i < ffoDelimSize; ++i) {
+    // most delimiter sets are pretty small, but occasionally there could
+    // be a big one.
+    auto n = rnd() % 8 + 1;
+    if (n == 8) {
+      n = 32;
+    }
+    auto delims = generateString(n);
+    if (rnd() % 2) {
+      // ~half of tests will find a hit
+      // yes, this could mean 'a' is a duplicate
+      delims[rnd() % delims.size()] = 'a';
+    }
+    ffoDelim.push_back(delims);
+  }
+}
+
 }  // anonymous namespace
 
 BENCHMARK(FindSingleCharMemchr, n) {
@@ -60,17 +110,198 @@ BENCHMARK_RELATIVE(FindSingleCharRange, n) {
 
 BENCHMARK_DRAW_LINE();
 
-BENCHMARK(FindFirstOfRange, n) {
+// it's useful to compare our custom implementations vs. the standard library
+inline size_t qfind_first_byte_of_std(const StringPiece& haystack,
+                                      const StringPiece& needles) {
+  return qfind_first_of(haystack, needles, asciiCaseSensitive);
+}
+
+template <class Func>
+void findFirstOfRange(StringPiece needles, Func func, size_t n) {
   StringPiece haystack(str);
-  folly::StringPiece needles("bc");
-  DCHECK_EQ(haystack.size() - 1, haystack.find_first_of(needles)); // it works!
   FOR_EACH_RANGE (i, 0, n) {
-    doNotOptimizeAway(haystack.find_first_of(needles));
+    doNotOptimizeAway(func(haystack, needles));
     char x = haystack[0];
     doNotOptimizeAway(&x);
   }
 }
 
+const string delims2 = "bc";
+
+BENCHMARK(FindFirstOf2NeedlesBase, n) {
+  findFirstOfRange(delims2, detail::qfind_first_byte_of, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf2NeedlesNoSSE, n) {
+  findFirstOfRange(delims2, detail::qfind_first_byte_of_nosse, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf2NeedlesStd, n) {
+  findFirstOfRange(delims2, qfind_first_byte_of_std, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf2NeedlesMemchr, n) {
+  findFirstOfRange(delims2, detail::qfind_first_byte_of_memchr, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf2NeedlesByteSet, n) {
+  findFirstOfRange(delims2, detail::qfind_first_byte_of_byteset, n);
+}
+
+BENCHMARK_DRAW_LINE();
+
+const string delims4 = "bcde";
+
+BENCHMARK(FindFirstOf4NeedlesBase, n) {
+  findFirstOfRange(delims4, detail::qfind_first_byte_of, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf4NeedlesNoSSE, n) {
+  findFirstOfRange(delims4, detail::qfind_first_byte_of_nosse, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf4NeedlesStd, n) {
+  findFirstOfRange(delims4, qfind_first_byte_of_std, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf4NeedlesMemchr, n) {
+  findFirstOfRange(delims4, detail::qfind_first_byte_of_memchr, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf4NeedlesByteSet, n) {
+  findFirstOfRange(delims4, detail::qfind_first_byte_of_byteset, n);
+}
+
+BENCHMARK_DRAW_LINE();
+
+const string delims8 = "0123456b";
+
+BENCHMARK(FindFirstOf8NeedlesBase, n) {
+  findFirstOfRange(delims8, detail::qfind_first_byte_of, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf8NeedlesNoSSE, n) {
+  findFirstOfRange(delims8, detail::qfind_first_byte_of_nosse, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf8NeedlesStd, n) {
+  findFirstOfRange(delims8, qfind_first_byte_of_std, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf8NeedlesMemchr, n) {
+  findFirstOfRange(delims8, detail::qfind_first_byte_of_memchr, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf8NeedlesByteSet, n) {
+  findFirstOfRange(delims8, detail::qfind_first_byte_of_byteset, n);
+}
+
+BENCHMARK_DRAW_LINE();
+
+const string delims16 = "0123456789bcdefg";
+
+BENCHMARK(FindFirstOf16NeedlesBase, n) {
+  findFirstOfRange(delims16, detail::qfind_first_byte_of, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf16NeedlesNoSSE, n) {
+  findFirstOfRange(delims16, detail::qfind_first_byte_of_nosse, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf16NeedlesStd, n) {
+  findFirstOfRange(delims16, qfind_first_byte_of_std, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf16NeedlesMemchr, n) {
+  findFirstOfRange(delims16, detail::qfind_first_byte_of_memchr, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf16NeedlesByteSet, n) {
+  findFirstOfRange(delims16, detail::qfind_first_byte_of_byteset, n);
+}
+
+BENCHMARK_DRAW_LINE();
+
+const string delims32 = "!bcdefghijklmnopqrstuvwxyz_012345";
+
+BENCHMARK(FindFirstOf32NeedlesBase, n) {
+  findFirstOfRange(delims32, detail::qfind_first_byte_of, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf32NeedlesNoSSE, n) {
+  findFirstOfRange(delims32, detail::qfind_first_byte_of_nosse, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf32NeedlesStd, n) {
+  findFirstOfRange(delims32, qfind_first_byte_of_std, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf32NeedlesMemchr, n) {
+  findFirstOfRange(delims32, detail::qfind_first_byte_of_memchr, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf32NeedlesByteSet, n) {
+  findFirstOfRange(delims32, detail::qfind_first_byte_of_byteset, n);
+}
+
+BENCHMARK_DRAW_LINE();
+
+const string delims64 = "!bcdefghijklmnopqrstuvwxyz_"
+                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ-0123456789$";
+
+BENCHMARK(FindFirstOf64NeedlesBase, n) {
+  findFirstOfRange(delims64, detail::qfind_first_byte_of, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf64NeedlesNoSSE, n) {
+  findFirstOfRange(delims64, detail::qfind_first_byte_of_nosse, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf64NeedlesStd, n) {
+  findFirstOfRange(delims64, qfind_first_byte_of_std, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf64NeedlesMemchr, n) {
+  findFirstOfRange(delims64, detail::qfind_first_byte_of_memchr, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOf64NeedlesByteSet, n) {
+  findFirstOfRange(delims64, detail::qfind_first_byte_of_byteset, n);
+}
+
+BENCHMARK_DRAW_LINE();
+
+// Calls `func(ffoTestString, delims)` `iters` times, cycling through the
+// delimiter sets in `ffoDelim`, and keeps each result alive so the call
+// cannot be optimized away.
+template <class Func>
+void findFirstOfRandom(Func func, size_t iters) {
+  // size_t counter: an int would overflow once iters exceeds INT_MAX
+  for (size_t i = 0; i < iters; ++i) {
+    auto test = i % ffoDelim.size();
+    auto p = func(ffoTestString, ffoDelim[test]);
+    doNotOptimizeAway(p);
+  }
+}
+
+BENCHMARK(FindFirstOfRandomBase, n) {
+  findFirstOfRandom(detail::qfind_first_byte_of, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOfRandomNoSSE, n) {
+  findFirstOfRandom(detail::qfind_first_byte_of_nosse, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOfRandomStd, n) {
+  findFirstOfRandom(qfind_first_byte_of_std, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOfRandomMemchr, n) {
+  findFirstOfRandom(detail::qfind_first_byte_of_memchr, n);
+}
+
+BENCHMARK_RELATIVE(FindFirstOfRandomByteSet, n) {
+  findFirstOfRandom(detail::qfind_first_byte_of_byteset, n);
+}
+
+BENCHMARK_DRAW_LINE();
+
 BENCHMARK(FindFirstOfOffsetRange, n) {
   StringPiece haystack(str);
   folly::StringPiece needles("bc");
@@ -83,11 +314,14 @@ BENCHMARK(FindFirstOfOffsetRange, n) {
   }
 }
 
+BENCHMARK_DRAW_LINE();
+
 int main(int argc, char** argv) {
   google::ParseCommandLineFlags(&argc, &argv, true);
 
-  for (int len : {1, 10, 256, 10*1024, 10*1024*1024}) {
+  for (int len : {1, 8, 10, 16, 32, 64, 128, 256, 10*1024, 1024*1024}) {
     initStr(len);
+    initDelims(len);
     runBenchmarks();
   }
   return 0;
index e1b055ef1bb7c4c6d6bb7e73fd1e2664e1618ae3..cb8aec8a1cf46d3671109318132156b5929d6fce 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2012 Facebook, Inc.
+ * Copyright 2013 Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
 // @author Kristina Holst (kholst@fb.com)
 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
 
+#include <limits>
+#include <string>
 #include <boost/range/concepts.hpp>
 #include <gtest/gtest.h>
 #include "folly/Range.h"
 
+namespace folly { namespace detail {
+// declaration of functions in Range.cpp
+size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
+                                  const StringPiece& needles);
+
+size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
+                                   const StringPiece& needles);
+
+size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
+                                 const StringPiece& needles);
+}}
+
 using namespace folly;
 using namespace std;
 
@@ -199,4 +213,128 @@ TEST(StringPiece, ToByteRange) {
             static_cast<const void*>(b.begin()));
   EXPECT_EQ(static_cast<const void*>(a.end()),
             static_cast<const void*>(b.end()));
+
+  // and convert back again
+  StringPiece c(b);
+  EXPECT_EQ(a.begin(), c.begin());
+  EXPECT_EQ(a.end(), c.end());
+}
+
+template <typename NeedleFinder>
+class NeedleFinderTest : public ::testing::Test {
+ public:
+  static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) {
+    return NeedleFinder::find_first_byte_of(haystack, needles);
+  }
+};
+
+struct SseNeedleFinder {
+  static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) {
+    // This will only use the SSE version if it is supported on this CPU
+    // (selected using ifunc).
+    return detail::qfind_first_byte_of(haystack, needles);
+  }
+};
+
+struct NoSseNeedleFinder {
+  static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) {
+    return detail::qfind_first_byte_of_nosse(haystack, needles);
+  }
+};
+
+struct MemchrNeedleFinder {
+  static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) {
+    return detail::qfind_first_byte_of_memchr(haystack, needles);
+  }
+};
+
+struct ByteSetNeedleFinder {
+  static size_t find_first_byte_of(StringPiece haystack, StringPiece needles) {
+    return detail::qfind_first_byte_of_byteset(haystack, needles);
+  }
+};
+
+typedef ::testing::Types<SseNeedleFinder, NoSseNeedleFinder, MemchrNeedleFinder,
+                         ByteSetNeedleFinder> NeedleFinders;
+TYPED_TEST_CASE(NeedleFinderTest, NeedleFinders);
+
+TYPED_TEST(NeedleFinderTest, Null) {
+  { // null characters in the string
+    string s(10, char(0));
+    s[5] = 'b';
+    string delims("abc");
+    EXPECT_EQ(5, this->find_first_byte_of(s, delims));
+  }
+  { // null characters in delim
+    string s("abc");
+    string delims(10, char(0));
+    delims[3] = 'c';
+    delims[7] = 'b';
+    EXPECT_EQ(1, this->find_first_byte_of(s, delims));
+  }
+  { // range not terminated by null character
+    string buf = "abcdefghijklmnopqrstuvwxyz";
+    StringPiece s(buf.data() + 5, 3);
+    StringPiece delims("z");
+    EXPECT_EQ(string::npos, this->find_first_byte_of(s, delims));
+  }
+}
+
+TYPED_TEST(NeedleFinderTest, DelimDuplicates) {
+  string delims(1000, 'b');
+  EXPECT_EQ(1, this->find_first_byte_of("abc", delims));
+  EXPECT_EQ(string::npos, this->find_first_byte_of("ac", delims));
+}
+
+TYPED_TEST(NeedleFinderTest, Empty) {
+  string a = "abc";
+  string b = "";
+  EXPECT_EQ(string::npos, this->find_first_byte_of(a, b));
+  EXPECT_EQ(string::npos, this->find_first_byte_of(b, a));
+  EXPECT_EQ(string::npos, this->find_first_byte_of(b, b));
+}
+
+TYPED_TEST(NeedleFinderTest, Unaligned) {
+  // works correctly even if input buffers are not 16-byte aligned
+  string s = "0123456789ABCDEFGH";
+  for (size_t i = 0; i < s.size(); ++i) {
+    StringPiece a(s.c_str() + i);
+    for (size_t j = 0; j < s.size(); ++j) {
+      StringPiece b(s.c_str() + j);
+      // Every character of s is distinct. The haystack is s[i..] and the
+      // needles are s[j..], so the first hit is s[max(i, j)], which sits at
+      // offset 0 when i >= j and at offset j - i otherwise.
+      const size_t expected = (i > j) ? 0 : j - i;
+      EXPECT_EQ(expected, this->find_first_byte_of(a, b));
+    }
+  }
+}
+
+// for some algorithms (specifically those that create a set of needles),
+// we check for the edge-case of _all_ possible needles being sought.
+TYPED_TEST(NeedleFinderTest, Needles256) {
+  string needles;
+  const auto minValue = std::numeric_limits<StringPiece::value_type>::min();
+  const auto maxValue = std::numeric_limits<StringPiece::value_type>::max();
+  // make the size ~big to avoid any edge-case branches for tiny haystacks
+  const int haystackSize = 50;
+  for (int i = minValue; i <= maxValue; i++) {  // <=
+    needles.push_back(i);
+  }
+  EXPECT_EQ(StringPiece::npos, this->find_first_byte_of("", needles));
+  for (int i = minValue; i <= maxValue; i++) {
+    EXPECT_EQ(0, this->find_first_byte_of(string(haystackSize, i), needles));
+  }
+
+  needles.append("these are redundant characters");
+  EXPECT_EQ(StringPiece::npos, this->find_first_byte_of("", needles));
+  for (int i = minValue; i <= maxValue; i++) {
+    EXPECT_EQ(0, this->find_first_byte_of(string(haystackSize, i), needles));
+  }
+}
+
+TYPED_TEST(NeedleFinderTest, Base) {
+  for (int i = 0; i < 32; ++i) {
+    for (int j = 0; j < 32; ++j) {
+      string s = string(i, 'X') + "abca" + string(i, 'X');
+      string delims = string(j, 'Y') + "a" + string(j, 'Y');
+      EXPECT_EQ(i, this->find_first_byte_of(s, delims));
+    }
+  }
 }