2 * Copyright 2016 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/Math.h>
21 #include <folly/Benchmark.h>
25 T brokenButWidespreadDivCeil(T num, T denom) {
26 return (num + denom - 1) / denom;
30 T viaFloatDivCeil(T num, T denom) {
31 return static_cast<T>(ceilf(static_cast<float>(num) / denom));
35 T viaDoubleDivCeil(T num, T denom) {
36 return static_cast<T>(ceil(static_cast<double>(num) / denom));
40 T viaLongDoubleDivCeil(T num, T denom) {
41 return static_cast<T>(ceill(static_cast<long double>(num) / denom));
45 std::vector<T> divValues() {
47 for (T i = 1; i < std::numeric_limits<T>::max() && i <= 1000; ++i) {
50 rv.push_back(std::numeric_limits<T>::max() / i);
51 auto x = std::numeric_limits<T>::min() / i;
59 template <typename T, typename F>
60 void runDivTests(const F& func, size_t iters) {
61 std::vector<T> denoms;
62 std::vector<T> numers;
64 denoms = divValues<T>();
67 std::mt19937 rnd(1234);
68 std::shuffle(denoms.begin(), denoms.end(), rnd);
69 std::shuffle(numers.begin(), numers.end(), rnd);
76 if (std::is_signed<T>::value && n == std::numeric_limits<T>::min() &&
78 // min / -1 overflows in two's complement
84 folly::doNotOptimizeAway(dep);
93 BENCHMARK_DRAW_LINE();
94 BENCHMARK(divTruncInt8, iters) {
95 runDivTests<int8_t>(&folly::divTrunc<int8_t, int8_t>, iters);
97 BENCHMARK(divFloorInt8, iters) {
98 runDivTests<int8_t>(&folly::divFloor<int8_t, int8_t>, iters);
100 BENCHMARK(divCeilInt8, iters) {
101 runDivTests<int8_t>(&folly::divCeil<int8_t, int8_t>, iters);
103 BENCHMARK_RELATIVE(branchlessDivCeilInt8, iters) {
104 runDivTests<int8_t>(&folly::detail::divCeilBranchless<int8_t>, iters);
106 BENCHMARK_RELATIVE(branchfulDivCeilInt8, iters) {
107 runDivTests<int8_t>(&folly::detail::divCeilBranchful<int8_t>, iters);
109 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt8, iters) {
110 runDivTests<int8_t>(&brokenButWidespreadDivCeil<int8_t>, iters);
112 BENCHMARK_RELATIVE(viaFloatDivCeilInt8, iters) {
113 runDivTests<int8_t>(&viaFloatDivCeil<int8_t>, iters);
115 BENCHMARK_RELATIVE(viaDoubleDivCeilInt8, iters) {
116 runDivTests<int8_t>(&viaDoubleDivCeil<int8_t>, iters);
118 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt8, iters) {
119 runDivTests<int8_t>(&viaLongDoubleDivCeil<int8_t>, iters);
121 BENCHMARK(divRoundAwayInt8, iters) {
122 runDivTests<int8_t>(&folly::divRoundAway<int8_t, int8_t>, iters);
125 BENCHMARK_DRAW_LINE();
126 BENCHMARK(divTruncInt16, iters) {
127 runDivTests<int16_t>(&folly::divTrunc<int16_t, int16_t>, iters);
129 BENCHMARK(divFloorInt16, iters) {
130 runDivTests<int16_t>(&folly::divFloor<int16_t, int16_t>, iters);
132 BENCHMARK(divCeilInt16, iters) {
133 runDivTests<int16_t>(&folly::divCeil<int16_t, int16_t>, iters);
135 BENCHMARK_RELATIVE(branchlessDivCeilInt16, iters) {
136 runDivTests<int16_t>(&folly::detail::divCeilBranchless<int16_t>, iters);
138 BENCHMARK_RELATIVE(branchfulDivCeilInt16, iters) {
139 runDivTests<int16_t>(&folly::detail::divCeilBranchful<int16_t>, iters);
141 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt16, iters) {
142 runDivTests<int16_t>(&brokenButWidespreadDivCeil<int16_t>, iters);
144 BENCHMARK_RELATIVE(viaFloatDivCeilInt16, iters) {
145 runDivTests<int16_t>(&viaFloatDivCeil<int16_t>, iters);
147 BENCHMARK_RELATIVE(viaDoubleDivCeilInt16, iters) {
148 runDivTests<int16_t>(&viaDoubleDivCeil<int16_t>, iters);
150 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt16, iters) {
151 runDivTests<int16_t>(&viaLongDoubleDivCeil<int16_t>, iters);
153 BENCHMARK(divRoundAwayInt16, iters) {
154 runDivTests<int16_t>(&folly::divRoundAway<int16_t, int16_t>, iters);
157 BENCHMARK_DRAW_LINE();
158 BENCHMARK(divTruncInt32, iters) {
159 runDivTests<int32_t>(&folly::divTrunc<int32_t, int32_t>, iters);
161 BENCHMARK(divFloorInt32, iters) {
162 runDivTests<int32_t>(&folly::divFloor<int32_t, int32_t>, iters);
164 BENCHMARK(divCeilInt32, iters) {
165 runDivTests<int32_t>(&folly::divCeil<int32_t, int32_t>, iters);
167 BENCHMARK_RELATIVE(branchlessDivCeilInt32, iters) {
168 runDivTests<int32_t>(&folly::detail::divCeilBranchless<int32_t>, iters);
170 BENCHMARK_RELATIVE(branchfulDivCeilInt32, iters) {
171 runDivTests<int32_t>(&folly::detail::divCeilBranchful<int32_t>, iters);
173 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt32, iters) {
174 runDivTests<int32_t>(&brokenButWidespreadDivCeil<int32_t>, iters);
176 BENCHMARK_RELATIVE(approxViaFloatDivCeilInt32, iters) {
177 runDivTests<int32_t>(&viaFloatDivCeil<int32_t>, iters);
179 BENCHMARK_RELATIVE(viaDoubleDivCeilInt32, iters) {
180 runDivTests<int32_t>(&viaDoubleDivCeil<int32_t>, iters);
182 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt32, iters) {
183 runDivTests<int32_t>(&viaLongDoubleDivCeil<int32_t>, iters);
185 BENCHMARK(divRoundAwayInt32, iters) {
186 runDivTests<int32_t>(&folly::divRoundAway<int32_t, int32_t>, iters);
189 BENCHMARK_DRAW_LINE();
190 BENCHMARK(divTruncInt64, iters) {
191 runDivTests<int64_t>(&folly::divTrunc<int64_t, int64_t>, iters);
193 BENCHMARK(divFloorInt64, iters) {
194 runDivTests<int64_t>(&folly::divFloor<int64_t, int64_t>, iters);
196 BENCHMARK(divCeilInt64, iters) {
197 runDivTests<int64_t>(&folly::divCeil<int64_t, int64_t>, iters);
199 BENCHMARK_RELATIVE(branchlessDivCeilInt64, iters) {
200 runDivTests<int64_t>(&folly::detail::divCeilBranchless<int64_t>, iters);
202 BENCHMARK_RELATIVE(branchfulDivCeilInt64, iters) {
203 runDivTests<int64_t>(&folly::detail::divCeilBranchful<int64_t>, iters);
205 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt64, iters) {
206 runDivTests<int64_t>(&brokenButWidespreadDivCeil<int64_t>, iters);
208 BENCHMARK_RELATIVE(approxViaFloatDivCeilInt64, iters) {
209 runDivTests<int64_t>(&viaFloatDivCeil<int64_t>, iters);
211 BENCHMARK_RELATIVE(approxViaDoubleDivCeilInt64, iters) {
212 runDivTests<int64_t>(&viaDoubleDivCeil<int64_t>, iters);
214 BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt64, iters) {
215 runDivTests<int64_t>(&viaLongDoubleDivCeil<int64_t>, iters);
217 BENCHMARK(divRoundAwayInt64, iters) {
218 runDivTests<int64_t>(&folly::divRoundAway<int64_t, int64_t>, iters);
221 BENCHMARK_DRAW_LINE();
222 BENCHMARK(divTruncUint8, iters) {
223 runDivTests<uint8_t>(&folly::divTrunc<uint8_t, uint8_t>, iters);
225 BENCHMARK(divFloorUint8, iters) {
226 runDivTests<uint8_t>(&folly::divFloor<uint8_t, uint8_t>, iters);
228 BENCHMARK(divCeilUint8, iters) {
229 runDivTests<uint8_t>(&folly::divCeil<uint8_t, uint8_t>, iters);
231 BENCHMARK_RELATIVE(branchlessDivCeilUint8, iters) {
232 runDivTests<uint8_t>(&folly::detail::divCeilBranchless<uint8_t>, iters);
234 BENCHMARK_RELATIVE(branchfulDivCeilUint8, iters) {
235 runDivTests<uint8_t>(&folly::detail::divCeilBranchful<uint8_t>, iters);
237 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint8, iters) {
238 runDivTests<uint8_t>(&brokenButWidespreadDivCeil<uint8_t>, iters);
240 BENCHMARK_RELATIVE(viaFloatDivCeilUint8, iters) {
241 runDivTests<uint8_t>(&viaFloatDivCeil<uint8_t>, iters);
243 BENCHMARK_RELATIVE(viaDoubleDivCeilUint8, iters) {
244 runDivTests<uint8_t>(&viaDoubleDivCeil<uint8_t>, iters);
246 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint8, iters) {
247 runDivTests<uint8_t>(&viaLongDoubleDivCeil<uint8_t>, iters);
249 BENCHMARK(divRoundAwayUint8, iters) {
250 runDivTests<uint8_t>(&folly::divRoundAway<uint8_t, uint8_t>, iters);
253 BENCHMARK_DRAW_LINE();
254 BENCHMARK(divTruncUint16, iters) {
255 runDivTests<uint16_t>(&folly::divTrunc<uint16_t, uint16_t>, iters);
257 BENCHMARK(divFloorUint16, iters) {
258 runDivTests<uint16_t>(&folly::divFloor<uint16_t, uint16_t>, iters);
260 BENCHMARK(divCeilUint16, iters) {
261 runDivTests<uint16_t>(&folly::divCeil<uint16_t, uint16_t>, iters);
263 BENCHMARK_RELATIVE(branchlessDivCeilUint16, iters) {
264 runDivTests<uint16_t>(&folly::detail::divCeilBranchless<uint16_t>, iters);
266 BENCHMARK_RELATIVE(branchfulDivCeilUint16, iters) {
267 runDivTests<uint16_t>(&folly::detail::divCeilBranchful<uint16_t>, iters);
269 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint16, iters) {
270 runDivTests<uint16_t>(&brokenButWidespreadDivCeil<uint16_t>, iters);
272 BENCHMARK_RELATIVE(viaFloatDivCeilUint16, iters) {
273 runDivTests<uint16_t>(&viaFloatDivCeil<uint16_t>, iters);
275 BENCHMARK_RELATIVE(viaDoubleDivCeilUint16, iters) {
276 runDivTests<uint16_t>(&viaDoubleDivCeil<uint16_t>, iters);
278 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint16, iters) {
279 runDivTests<uint16_t>(&viaLongDoubleDivCeil<uint16_t>, iters);
281 BENCHMARK(divRoundAwayUint16, iters) {
282 runDivTests<uint16_t>(&folly::divRoundAway<uint16_t, uint16_t>, iters);
285 BENCHMARK_DRAW_LINE();
286 BENCHMARK(divTruncUint32, iters) {
287 runDivTests<uint32_t>(&folly::divTrunc<uint32_t, uint32_t>, iters);
289 BENCHMARK(divFloorUint32, iters) {
290 runDivTests<uint32_t>(&folly::divFloor<uint32_t, uint32_t>, iters);
292 BENCHMARK(divCeilUint32, iters) {
293 runDivTests<uint32_t>(&folly::divCeil<uint32_t, uint32_t>, iters);
295 BENCHMARK_RELATIVE(branchlessDivCeilUint32, iters) {
296 runDivTests<uint32_t>(&folly::detail::divCeilBranchless<uint32_t>, iters);
298 BENCHMARK_RELATIVE(branchfulDivCeilUint32, iters) {
299 runDivTests<uint32_t>(&folly::detail::divCeilBranchful<uint32_t>, iters);
301 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint32, iters) {
302 runDivTests<uint32_t>(&brokenButWidespreadDivCeil<uint32_t>, iters);
304 BENCHMARK_RELATIVE(approxViaFloatDivCeilUint32, iters) {
305 runDivTests<uint32_t>(&viaFloatDivCeil<uint32_t>, iters);
307 BENCHMARK_RELATIVE(viaDoubleDivCeilUint32, iters) {
308 runDivTests<uint32_t>(&viaDoubleDivCeil<uint32_t>, iters);
310 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint32, iters) {
311 runDivTests<uint32_t>(&viaLongDoubleDivCeil<uint32_t>, iters);
313 BENCHMARK(divRoundAwayUint32, iters) {
314 runDivTests<uint32_t>(&folly::divRoundAway<uint32_t, uint32_t>, iters);
317 BENCHMARK_DRAW_LINE();
318 BENCHMARK(divTruncUint64, iters) {
319 runDivTests<uint64_t>(&folly::divTrunc<uint64_t, uint64_t>, iters);
321 BENCHMARK(divFloorUint64, iters) {
322 runDivTests<uint64_t>(&folly::divFloor<uint64_t, uint64_t>, iters);
324 BENCHMARK(divCeilUint64, iters) {
325 runDivTests<uint64_t>(&folly::divCeil<uint64_t, uint64_t>, iters);
327 BENCHMARK_RELATIVE(branchlessDivCeilUint64, iters) {
328 runDivTests<uint64_t>(&folly::detail::divCeilBranchless<uint64_t>, iters);
330 BENCHMARK_RELATIVE(branchfulDivCeilUint64, iters) {
331 runDivTests<uint64_t>(&folly::detail::divCeilBranchful<uint64_t>, iters);
333 BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint64, iters) {
334 runDivTests<uint64_t>(&brokenButWidespreadDivCeil<uint64_t>, iters);
336 BENCHMARK_RELATIVE(approxViaFloatDivCeilUint64, iters) {
337 runDivTests<uint64_t>(&viaFloatDivCeil<uint64_t>, iters);
339 BENCHMARK_RELATIVE(approxViaDoubleDivCeilUint64, iters) {
340 runDivTests<uint64_t>(&viaDoubleDivCeil<uint64_t>, iters);
342 BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint64, iters) {
343 runDivTests<uint64_t>(&viaLongDoubleDivCeil<uint64_t>, iters);
345 BENCHMARK(divRoundAwayUint64, iters) {
346 runDivTests<uint64_t>(&folly::divRoundAway<uint64_t, uint64_t>, iters);
349 int main(int argc, char** argv) {
350 gflags::ParseCommandLineFlags(&argc, &argv, true);
351 folly::runBenchmarks();
356 Benchmarks run single-threaded on a dual Xeon E5-2660 @ 2.2 Ghz with
357 hyperthreading (16 physical cores, 20 MB cache per socket, 256 GB RAM)
359 Benchmarks used --bm_min_iters=10000000.
361 divTrunc is just a native integral division. viaDoubleViaCeil doesn't
362 have full accuracy for Int64 or Uint64. There is a loop-carried
363 dependency for all of the div* tests, but there is a bit of extra slack
364 (a predictable call, a load that should be from the L1, and a predictable
365 not-taken branch in addition to the loop's branch) in the driving loop,
366 so the benchmark driver's attempt to subtract the overhead of the loop
367 might mean that the latency numbers here are slightly too low or too high.
369 The branchful implementation's branch is very predictable in this
370 microbenchmark for unsigned types, since it only needs to predict a
371 zero numerator. That's likely to be true in real life as well, so we
372 make this the default.
374 I was surprised at the speed of float and double division, but
375 the only case where it actually wins by much and is correct is for
376 int16_t. (float + ceil is faster for the 32-bit case, but is only
377 an approximation.) I ran a similar benchmark setup for ARM and ARM64.
378 On ARM the conditional versions win by quite a bit. 32-bit ARM doesn't
379 have a native integer divide, so getting the remainder after a division
380 (to see if truncation occurred) is more work than preconditioning the
381 numerator to make truncation go in the correct direction. 64-bit ARM
382 had the same winners and losers as x86_64, at least on the two physical
385 ============================================================================
386 folly/test/MathBenchmark.cpp relative time/iter iters/s
387 ============================================================================
388 ----------------------------------------------------------------------------
389 divTruncInt8 8.89ns 112.44M
390 divFloorInt8 10.99ns 91.00M
391 divCeilInt8 10.95ns 91.33M
392 branchlessDivCeilInt8 100.40% 10.91ns 91.69M
393 branchfulDivCeilInt8 88.87% 12.32ns 81.16M
394 brokenButWidespreadDivCeilInt8 109.20% 10.03ns 99.73M
395 viaFloatDivCeilInt8 109.68% 9.98ns 100.17M
396 viaDoubleDivCeilInt8 95.47% 11.47ns 87.19M
397 viaLongDoubleDivCeilInt8 31.65% 34.59ns 28.91M
398 divRoundAwayInt8 10.42ns 95.97M
399 ----------------------------------------------------------------------------
400 divTruncInt16 8.68ns 115.17M
401 divFloorInt16 10.94ns 91.38M
402 divCeilInt16 10.91ns 91.70M
403 branchlessDivCeilInt16 99.44% 10.97ns 91.18M
404 branchfulDivCeilInt16 81.68% 13.35ns 74.90M
405 brokenButWidespreadDivCeilInt16 109.50% 9.96ns 100.40M
406 viaFloatDivCeilInt16 108.04% 10.09ns 99.07M
407 viaDoubleDivCeilInt16 85.38% 12.77ns 78.29M
408 viaLongDoubleDivCeilInt16 29.99% 36.36ns 27.50M
409 divRoundAwayInt16 10.59ns 94.46M
410 ----------------------------------------------------------------------------
411 divTruncInt32 8.38ns 119.29M
412 divFloorInt32 11.01ns 90.84M
413 divCeilInt32 11.12ns 89.91M
414 branchlessDivCeilInt32 101.94% 10.91ns 91.66M
415 branchfulDivCeilInt32 84.67% 13.14ns 76.12M
416 brokenButWidespreadDivCeilInt32 117.61% 9.46ns 105.75M
417 approxViaFloatDivCeilInt32 115.98% 9.59ns 104.28M
418 viaDoubleDivCeilInt32 89.86% 12.38ns 80.79M
419 viaLongDoubleDivCeilInt32 30.84% 36.06ns 27.73M
420 divRoundAwayInt32 11.30ns 88.50M
421 ----------------------------------------------------------------------------
422 divTruncInt64 16.07ns 62.21M
423 divFloorInt64 18.37ns 54.45M
424 divCeilInt64 18.61ns 53.74M
425 branchlessDivCeilInt64 100.43% 18.53ns 53.97M
426 branchfulDivCeilInt64 84.65% 21.98ns 45.49M
427 brokenButWidespreadDivCeilInt64 108.47% 17.16ns 58.29M
428 approxViaFloatDivCeilInt64 190.99% 9.74ns 102.64M
429 approxViaDoubleDivCeilInt64 148.64% 12.52ns 79.88M
430 viaLongDoubleDivCeilInt64 52.01% 35.77ns 27.95M
431 divRoundAwayInt64 18.79ns 53.21M
432 ----------------------------------------------------------------------------
433 divTruncUint8 7.76ns 128.89M
434 divFloorUint8 8.29ns 120.61M
435 divCeilUint8 9.61ns 104.09M
436 branchlessDivCeilUint8 112.00% 8.58ns 116.58M
437 branchfulDivCeilUint8 114.01% 8.43ns 118.67M
438 brokenButWidespreadDivCeilUint8 100.48% 9.56ns 104.58M
439 viaFloatDivCeilUint8 103.53% 9.28ns 107.76M
440 viaDoubleDivCeilUint8 85.75% 11.20ns 89.26M
441 viaLongDoubleDivCeilUint8 27.72% 34.65ns 28.86M
442 divRoundAwayUint8 9.60ns 104.11M
443 ----------------------------------------------------------------------------
444 divTruncUint16 8.39ns 119.19M
445 divFloorUint16 8.28ns 120.82M
446 divCeilUint16 9.90ns 100.96M
447 branchlessDivCeilUint16 100.23% 9.88ns 101.19M
448 branchfulDivCeilUint16 107.83% 9.19ns 108.87M
449 brokenButWidespreadDivCeilUint16 99.89% 9.92ns 100.85M
450 viaFloatDivCeilUint16 100.54% 9.85ns 101.50M
451 viaDoubleDivCeilUint16 77.38% 12.80ns 78.13M
452 viaLongDoubleDivCeilUint16 27.30% 36.28ns 27.56M
453 divRoundAwayUint16 9.82ns 101.85M
454 ----------------------------------------------------------------------------
455 divTruncUint32 8.12ns 123.20M
456 divFloorUint32 8.09ns 123.58M
457 divCeilUint32 8.44ns 118.55M
458 branchlessDivCeilUint32 88.27% 9.56ns 104.64M
459 branchfulDivCeilUint32 98.91% 8.53ns 117.25M
460 brokenButWidespreadDivCeilUint32 93.48% 9.02ns 110.82M
461 approxViaFloatDivCeilUint32 86.29% 9.78ns 102.30M
462 viaDoubleDivCeilUint32 66.76% 12.63ns 79.15M
463 viaLongDoubleDivCeilUint32 23.35% 36.13ns 27.68M
464 divRoundAwayUint32 8.47ns 118.03M
465 ----------------------------------------------------------------------------
466 divTruncUint64 12.38ns 80.79M
467 divFloorUint64 12.27ns 81.47M
468 divCeilUint64 12.66ns 78.99M
469 branchlessDivCeilUint64 93.46% 13.55ns 73.83M
470 branchfulDivCeilUint64 100.30% 12.62ns 79.23M
471 brokenButWidespreadDivCeilUint64 99.41% 12.73ns 78.53M
472 approxViaFloatDivCeilUint64 106.59% 11.88ns 84.19M
473 approxViaDoubleDivCeilUint64 92.14% 13.74ns 72.78M
474 viaLongDoubleDivCeilUint64 33.51% 37.78ns 26.47M
475 divRoundAwayUint64 12.34ns 81.02M
476 ============================================================================