From: Lucian Grijincu Date: Thu, 8 May 2014 18:50:50 +0000 (-0700) Subject: folly: fbstring: conditionally write terminator in c_str() X-Git-Tag: v0.22.0~550 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=9529709d994bdd827e81a269ed5417a6aabaae3f;p=folly.git folly: fbstring: conditionally write terminator in c_str() Summary: c_str/data writes a terminator '\0' on each call. When multiple threads call c_str/data on the same string (e.g. small global constant) the string's cache line will bounce from cpu to cpu. Cpus report that most instructions are stalled (stalled-cycles-frontend/stalled-cycles-backend near 100%). Fix: check before setting '\0'. Test Plan: ** BEFORE ** - nothing defined ** gcc-4.8.1-glibc-2.17-fb $ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000 4.10s experimental/lucian/bench/StringTerminatorBenchmark.cpp Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark... 2.40s _build/opt/experimental/lucian/bench/string_terminator_benchmark I0507 17:27:39.373522 28745 StringTerminatorBenchmark.cpp:98] --bm_min_iters=100000000 ============================================================================ experimental/lucian/bench/StringTerminatorBenchmark.cpprelative time/iter iters/s ============================================================================ static_std_1t 934.10ps 1.07G privat_std_1t 100.32% 931.08ps 1.07G static_fbs_1t 100.30% 931.33ps 1.07G privat_fbs_1t 100.42% 930.22ps 1.08G static_sp__1t 86986.52% 1.07ps 931.24G privat_sp__1t 81459.50% 1.15ps 872.07G static_std_32t 392.41ns 2.55M privat_std_32t 15072.69% 2.60ns 384.10M static_fbs_32t 102.41% 383.17ns 2.61M privat_fbs_32t 13643.61% 2.88ns 347.68M static_sp__32t 61032.25% 642.96ps 1.56G privat_sp__32t 74172.72% 529.06ps 1.89G ============================================================================ Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000': 2417478.451740 task-clock # 27.005 CPUs utilized 65,374 context-switches # 0.027 K/sec 4,270 CPU-migrations # 0.002 K/sec 2,594 page-faults # 0.001 K/sec 5,261,195,623,299 cycles # 2.176 GHz [25.09%] 5,072,634,235,091 stalled-cycles-frontend # 96.42% frontend cycles idle [30.01%] 3,848,759,938,052 stalled-cycles-backend # 73.15% backend cycles idle [30.01%] 614,983,033,652 instructions # 0.12 insns per cycle # 8.25 stalled cycles per insn [25.09%] 152,995,596,390 branches # 63.287 M/sec [30.01%] 24,501,018 branch-misses # 0.02% of all branches [25.94%] 66,298,370,215 L1-dcache-loads # 27.425 M/sec [30.03%] 1,642,188,850 L1-dcache-load-misses # 2.48% of all L1-dcache hits [30.02%] 968,142,977 LLC-loads # 0.400 M/sec [30.02%] 480,786,261 LLC-load-misses # 49.66% of all LL-cache hits [30.02%] 89.520960614 seconds time elapsed - #define FBSTRING_PERVERSE ** gcc-4.8.1-glibc-2.17-fb $ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000 4.25s experimental/lucian/bench/StringTerminatorBenchmark.cpp Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark... 2.45s _build/opt/experimental/lucian/bench/string_terminator_benchmark I0507 17:24:26.487848 23720 StringTerminatorBenchmark.cpp:98] --bm_min_iters=100000000 ============================================================================ experimental/lucian/bench/StringTerminatorBenchmark.cpprelative time/iter iters/s ============================================================================ static_std_1t 932.63ps 1.07G privat_std_1t 100.95% 923.85ps 1.08G static_fbs_1t 100.04% 932.23ps 1.07G privat_fbs_1t 101.14% 922.07ps 1.08G static_sp__1t 30418.15% 3.07ps 326.16G privat_sp__1t 56775.39% 1.64ps 608.77G static_std_32t 388.57ns 2.57M privat_std_32t 16465.70% 2.36ns 423.75M static_fbs_32t 96.19% 403.98ns 2.48M privat_fbs_32t 16248.69% 2.39ns 418.16M static_sp__32t 81185.06% 478.63ps 2.09G privat_sp__32t 81470.69% 476.95ps 2.10G ============================================================================ Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000': 2486009.171842 task-clock # 27.241 CPUs utilized 64,538 context-switches # 0.026 K/sec 4,501 CPU-migrations # 0.002 K/sec 2,863 page-faults # 0.001 K/sec 5,413,507,764,726 cycles # 2.178 GHz [25.62%] 5,207,550,928,974 stalled-cycles-frontend # 96.20% frontend cycles idle [30.01%] 3,919,627,007,209 stalled-cycles-backend # 72.40% backend cycles idle [30.01%] 685,365,706,158 instructions # 0.13 insns per cycle # 7.60 stalled cycles per insn [25.62%] 148,798,151,350 branches # 59.854 M/sec [30.01%] 24,386,492 branch-misses # 0.02% of all branches [26.08%] 72,683,962,699 L1-dcache-loads # 29.237 M/sec [30.02%] 1,687,684,775 L1-dcache-load-misses # 2.32% of all L1-dcache hits [30.01%] 989,352,938 LLC-loads # 0.398 M/sec [30.02%] 484,825,665 LLC-load-misses # 49.00% of all LL-cache hits [30.02%] 91.259265010 seconds time elapsed - #define FBSTRING_CONSERVATIVE ** gcc-4.8.1-glibc-2.17-fb $ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000 0.18s experimental/lucian/bench/StringTerminatorBenchmark.cpp Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark... 2.43s _build/opt/experimental/lucian/bench/string_terminator_benchmark I0507 17:30:58.246834 1571 StringTerminatorBenchmark.cpp:98] --bm_min_iters=100000000 ============================================================================ experimental/lucian/bench/StringTerminatorBenchmark.cpprelative time/iter iters/s ============================================================================ static_std_1t 1.44ps 692.61G privat_std_1t 193.01% 748.06fs 1.34T static_fbs_1t 399.47% 361.43fs 2.77T privat_fbs_1t 129.29% 1.12ps 895.50G static_sp__1t 78.99% 1.83ps 547.10G privat_sp__1t 56.22% 2.57ps 389.36G static_std_32t 478.67ps 2.09G privat_std_32t 100.30% 477.26ps 2.10G static_fbs_32t 100.52% 476.21ps 2.10G privat_fbs_32t 100.52% 476.22ps 2.10G static_sp__32t 99.57% 480.76ps 2.08G privat_sp__32t 100.06% 478.40ps 2.09G ============================================================================ Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000': 198000.887222 task-clock # 14.738 CPUs utilized 5,016 context-switches # 0.025 K/sec 2,536 CPU-migrations # 0.013 K/sec 4,717 page-faults # 0.024 K/sec 421,850,242,358 cycles # 2.131 GHz [25.31%] 209,633,195,159 stalled-cycles-frontend # 49.69% frontend cycles idle [30.32%] 6,252,451,713 stalled-cycles-backend # 1.48% backend cycles idle [30.21%] 874,096,942,441 instructions # 2.07 insns per cycle # 0.24 stalled cycles per insn [25.37%] 218,114,341,006 branches # 1101.583 M/sec [30.34%] 3,080,267 branch-misses # 0.00% of all branches [26.23%] 240,532,859 L1-dcache-loads # 1.215 M/sec [30.73%] 23,151,832 L1-dcache-load-misses # 9.63% of all L1-dcache hits [30.61%] 5,283,803 LLC-loads # 0.027 M/sec [30.60%] 1,079,973 LLC-load-misses # 20.44% of all LL-cache hits [30.60%] 13.434898734 seconds time elapsed - #define FBSTRING_PERVERSE ** gcc-4.8.1-glibc-2.17 $ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000 I0507 17:37:01.095785 28744 StringTerminatorBenchmark.cpp:98] --bm_min_iters=100000000 ============================================================================ experimental/lucian/bench/StringTerminatorBenchmark.cpprelative time/iter iters/s ============================================================================ static_std_1t 2.80ps 357.44G privat_std_1t 118.65% 2.36ps 424.09G static_fbs_1t 0.30% 937.44ps 1.07G privat_fbs_1t 0.30% 924.33ps 1.08G static_sp__1t 229.14% 1.22ps 819.03G privat_sp__1t 212.22% 1.32ps 758.55G static_std_32t 662.00ps 1.51G privat_std_32t 134.15% 493.47ps 2.03G static_fbs_32t 1.27% 52.24ns 19.14M privat_fbs_32t 28.02% 2.36ns 423.31M static_sp__32t 138.51% 477.94ps 2.09G privat_sp__32t 138.39% 478.38ps 2.09G ============================================================================ Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000': 316918.699522 task-clock # 17.561 CPUs utilized 8,761 context-switches # 0.028 K/sec 2,190 CPU-migrations # 0.007 K/sec 5,361 page-faults # 0.017 K/sec 683,482,498,044 cycles # 2.157 GHz [25.13%] 488,046,572,692 stalled-cycles-frontend # 71.41% frontend cycles idle [30.18%] 269,765,079,972 stalled-cycles-backend # 39.47% backend cycles idle [30.17%] 756,213,442,723 instructions # 1.11 insns per cycle # 0.65 stalled cycles per insn [25.17%] 175,643,190,326 branches # 554.222 M/sec [30.14%] 3,737,621 branch-misses # 0.00% of all branches [25.63%] 35,169,313,417 L1-dcache-loads # 110.973 M/sec [30.31%] 215,454,174 L1-dcache-load-misses # 0.61% of all L1-dcache hits [30.25%] 148,883,234 LLC-loads # 0.470 M/sec [30.25%] 48,768,585 LLC-load-misses # 32.76% of all LL-cache hits [30.26%] 18.046902132 seconds time elapsed ** AFTER ** - define nothing ** gcc-4.8.1-glibc-2.17-fb $ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000 I0507 20:34:15.943284 16852 StringTerminatorBenchmark.cpp:98] --bm_min_iters=100000000 ============================================================================ experimental/lucian/bench/StringTerminatorBenchmark.cpprelative time/iter iters/s ============================================================================ static_std_1t 936.05ps 1.07G privat_std_1t 100.67% 929.82ps 1.08G static_fbs_1t 98.63% 949.07ps 1.05G privat_fbs_1t 101.56% 921.69ps 1.08G static_sp__1t 40342.12% 2.32ps 430.98G privat_sp__1t 16370.35% 5.72ps 174.89G static_std_32t 409.26ns 2.44M privat_std_32t 16777.41% 2.44ns 409.95M static_fbs_32t 17398.58% 2.35ns 425.13M privat_fbs_32t 17475.47% 2.34ns 427.01M static_sp__32t 85426.96% 479.07ps 2.09G privat_sp__32t 85033.14% 481.29ps 2.08G ============================================================================ Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000': 1387268.222731 task-clock # 25.553 CPUs utilized 34,698 context-switches # 0.025 K/sec 2,770 CPU-migrations # 0.002 K/sec 3,039 page-faults # 0.002 K/sec 3,019,637,816,074 cycles # 2.177 GHz [25.21%] 2,801,192,631,479 stalled-cycles-frontend # 92.77% frontend cycles idle [30.02%] 2,020,857,118,698 stalled-cycles-backend # 66.92% backend cycles idle [30.00%] 801,090,224,478 instructions # 0.27 insns per cycle # 3.50 stalled cycles per insn [25.20%] 206,099,842,649 branches # 148.565 M/sec [30.03%] 12,954,936 branch-misses # 0.01% of all branches [25.64%] 104,129,435,254 L1-dcache-loads # 75.061 M/sec [30.07%] 883,390,641 L1-dcache-load-misses # 0.85% of all L1-dcache hits [30.06%] 516,975,218 LLC-loads # 0.373 M/sec [30.04%] 255,887,523 LLC-load-misses # 49.50% of all LL-cache hits [30.04%] 54.289185806 seconds time elapsed - #define FBSTRING_PERVERSE ** gcc-4.8.1-glibc-2.17-fb $ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000 4.20s experimental/lucian/bench/StringTerminatorBenchmark.cpp Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark... 2.47s _build/opt/experimental/lucian/bench/string_terminator_benchmark I0507 20:19:07.272021 18610 StringTerminatorBenchmark.cpp:98] --bm_min_iters=100000000 ============================================================================ experimental/lucian/bench/StringTerminatorBenchmark.cpprelative time/iter iters/s ============================================================================ static_std_1t 936.15ps 1.07G privat_std_1t 100.84% 928.34ps 1.08G static_fbs_1t 97.92% 956.06ps 1.05G privat_fbs_1t 101.41% 923.14ps 1.08G static_sp__1t inf% 0.00fs infinity privat_sp__1t inf% 0.00fs infinity static_std_32t 413.49ns 2.42M privat_std_32t 17375.60% 2.38ns 420.22M static_fbs_32t 17443.07% 2.37ns 421.85M privat_fbs_32t 15354.32% 2.69ns 371.33M static_sp__32t 82034.91% 504.05ps 1.98G privat_sp__32t 63367.22% 652.53ps 1.53G ============================================================================ Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000': 1390538.848103 task-clock # 25.373 CPUs utilized 37,417 context-switches # 0.027 K/sec 3,147 CPU-migrations # 0.002 K/sec 2,876 page-faults # 0.002 K/sec 3,024,513,016,946 cycles # 2.175 GHz [25.08%] 2,813,021,108,191 stalled-cycles-frontend # 93.01% frontend cycles idle [30.02%] 2,043,124,392,473 stalled-cycles-backend # 67.55% backend cycles idle [30.01%] 774,663,686,661 instructions # 0.26 insns per cycle # 3.63 stalled cycles per insn [25.09%] 197,666,485,664 branches # 142.151 M/sec [30.03%] 15,077,576 branch-misses # 0.01% of all branches [25.73%] 104,720,369,589 L1-dcache-loads # 75.309 M/sec [30.05%] 886,090,434 L1-dcache-load-misses # 0.85% of all L1-dcache hits [30.04%] 520,015,584 LLC-loads # 0.374 M/sec [30.03%] 256,990,100 LLC-load-misses # 49.42% of all LL-cache hits [30.04%] 54.804099454 seconds time elapsed - #define FBSTRING_CONSERVATIVE ** gcc-4.8.1-glibc-2.17-fb $ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000 4.02s experimental/lucian/bench/StringTerminatorBenchmark.cpp Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark... 2.45s _build/opt/experimental/lucian/bench/string_terminator_benchmark I0507 20:31:31.120209 10543 StringTerminatorBenchmark.cpp:98] --bm_min_iters=100000000 ============================================================================ experimental/lucian/bench/StringTerminatorBenchmark.cpprelative time/iter iters/s ============================================================================ static_std_1t 929.96ps 1.08G privat_std_1t 100.37% 926.53ps 1.08G static_fbs_1t inf% 0.00fs infinity privat_fbs_1t inf% 0.00fs infinity static_sp__1t inf% 0.00fs infinity privat_sp__1t inf% 0.00fs infinity static_std_32t 381.41ns 2.62M privat_std_32t 16080.74% 2.37ns 421.62M static_fbs_32t 80498.05% 473.81ps 2.11G privat_fbs_32t 80368.84% 474.57ps 2.11G static_sp__32t 80287.07% 475.05ps 2.11G privat_sp__32t 80410.51% 474.33ps 2.11G ============================================================================ Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000': 1293727.223839 task-clock # 25.403 CPUs utilized 33,881 context-switches # 0.026 K/sec 3,718 CPU-migrations # 0.003 K/sec 3,829 page-faults # 0.003 K/sec 2,813,078,184,066 cycles # 2.174 GHz [25.11%] 2,597,561,032,630 stalled-cycles-frontend # 92.34% frontend cycles idle [30.04%] 1,883,652,860,583 stalled-cycles-backend # 66.96% backend cycles idle [30.02%] 800,150,465,648 instructions # 0.28 insns per cycle # 3.25 stalled cycles per insn [25.10%] 197,970,559,157 branches # 153.023 M/sec [30.02%] 15,385,773 branch-misses # 0.01% of all branches [25.95%] 36,541,946,374 L1-dcache-loads # 28.245 M/sec [30.08%] 813,994,917 L1-dcache-load-misses # 2.23% of all L1-dcache hits [30.07%] 467,241,616 LLC-loads # 0.361 M/sec [30.07%] 247,980,582 LLC-load-misses # 53.07% of all LL-cache hits [30.08%] 50.928965545 seconds time elapsed Reviewed By: njormrod@fb.com FB internal diff: D1318048 --- diff --git a/folly/FBString.h b/folly/FBString.h index dd5b0642..09407bda 100644 --- a/folly/FBString.h +++ b/folly/FBString.h @@ -528,12 +528,16 @@ public: if (c == isSmall) { assert(small_[smallSize()] == TERMINATOR || smallSize() == maxSmallSize || small_[smallSize()] == '\0'); - small_[smallSize()] = '\0'; + if (small_[smallSize()] != '\0') { + small_[smallSize()] = '\0'; + } return small_; } assert(c == isMedium || c == isLarge); assert(ml_.data_[ml_.size_] == TERMINATOR || ml_.data_[ml_.size_] == '\0'); - ml_.data_[ml_.size_] = '\0'; + if (ml_.data_[ml_.size_] != '\0') { + ml_.data_[ml_.size_] = '\0'; + } #elif defined(FBSTRING_CONSERVATIVE) if (c == isSmall) { assert(small_[smallSize()] == '\0'); @@ -543,11 +547,15 @@ public: assert(ml_.data_[ml_.size_] == '\0'); #else if (c == isSmall) { - small_[smallSize()] = '\0'; + if (small_[smallSize()] != '\0') { + small_[smallSize()] = '\0'; + } return small_; } assert(c == isMedium || c == isLarge); - ml_.data_[ml_.size_] = '\0'; + if (ml_.data_[ml_.size_] != '\0') { + ml_.data_[ml_.size_] = '\0'; + } #endif return ml_.data_; } diff --git a/folly/test/FBStringTerminatorBenchmark.cpp b/folly/test/FBStringTerminatorBenchmark.cpp new file mode 100644 index 00000000..cbd0d59d --- /dev/null +++ b/folly/test/FBStringTerminatorBenchmark.cpp @@ -0,0 +1,134 @@ +/* + * Copyright 2014 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +/** + * fbstring::c_str/data used to always write '\0' if + * FBSTRING_CONSERVATIVE was not defined. + * + * Multiple threads calling c_str/data on the same fbstring leads to + * cache thrashing. + * + * fbstring::c_str was changed to conditionally write '\0'. + * + */ + +// #define FBSTRING_PERVERSE +// #define FBSTRING_CONSERVATIVE +#include "folly/FBString.h" +#include "folly/Benchmark.h" +#include "folly/Range.h" + + +template +class Bench { + public: + void benchStatic(size_t n) const { + static S what = "small string"; + for (size_t i = 0; i < n; i++) { + folly::doNotOptimizeAway(what.data()); + } + } + + void benchPrivate(size_t n) const { + S what = "small string"; + for (size_t i = 0; i < n; i++) { + folly::doNotOptimizeAway(what.data()); + } + } +}; + +void threadify(std::function fn, int nrThreads) { + std::vector threads; + for (int i = 0; i < nrThreads; i++) { + threads.emplace_back(fn); + } + for (auto& t : threads) { t.join(); } +} + +/* + * Private/static benchmark pairs should give the same numbers if run + * in single threaded mode (regardless of '\0' being conditionally + * written or not). + */ +BENCHMARK(static_std_1t, n) { + threadify([n] { Bench().benchStatic(n); }, 1); +} + +BENCHMARK_RELATIVE(privat_std_1t, n) { + threadify([n] { Bench().benchPrivate(n); }, 1); +} + +BENCHMARK_RELATIVE(static_fbs_1t, n) { + threadify([n] { Bench().benchStatic(n); }, 1); +} + +BENCHMARK_RELATIVE(privat_fbs_1t, n) { + threadify([n] { Bench().benchPrivate(n); }, 1); +} + +BENCHMARK_RELATIVE(static_sp__1t, n) { + threadify([n] { Bench().benchStatic(n); }, 1); +} + +BENCHMARK_RELATIVE(privat_sp__1t, n) { + threadify([n] { Bench().benchPrivate(n); }, 1); +} + +/* + * Private/static benchmark pairs when run on multiple threads: + * + * - should be similar if '\0' is conditionally written or when + * FBSTRING_CONSERVATIVE is defined (in this case '\0' is not written at all) + * + * - static used to be significanlty slower than private for fbstring + * with unconditional '\0' written at the end. + */ +BENCHMARK(static_std_32t, n) { + threadify([n] { Bench().benchStatic(n); }, 32); +} + +BENCHMARK_RELATIVE(privat_std_32t, n) { + threadify([n] { Bench().benchPrivate(n); }, 32); +} + +BENCHMARK_RELATIVE(static_fbs_32t, n) { + threadify([n] { Bench().benchStatic(n); }, 32); +} + +BENCHMARK_RELATIVE(privat_fbs_32t, n) { + threadify([n] { Bench().benchPrivate(n); }, 32); +} + +BENCHMARK_RELATIVE(static_sp__32t, n) { + threadify([n] { Bench().benchStatic(n); }, 32); +} + +BENCHMARK_RELATIVE(privat_sp__32t, n) { + threadify([n] { Bench().benchPrivate(n); }, 32); +} + +int main(int argc, char *argv[]) { + google::ParseCommandLineFlags(&argc, &argv, true); + folly::runBenchmarks(); + return 0; +}