From 1d9d5cbe1dde81dd5fae5aada11a7091dbe4aa6b Mon Sep 17 00:00:00 2001 From: Nathan Bronson Date: Wed, 7 Sep 2016 15:50:38 -0700 Subject: [PATCH] integer division with controlled rounding in Math.h Summary: C++'s integer division performs truncation, but it is fairly common that people actually want to round up. There are actually four rounding modes that make some sense: toward negative infinity (floor), toward positive infinity (ceil), toward zero (truncation), and away from zero (?). It is pretty common that code that wants ceil actually writes (a + b - 1) / b, which doesn't work at all for negative values (and has a potential overflow issue). This diff adds 4 templated functions for performing integer division: divFloor, divCeil, divTrunc, and divRoundAway. They are not subject to unnecessary internal underflow or overflow, and they work correctly across their entire input domain. I did a bit of benchmarking across x86_64, arm64, and 32-bit ARM. Only 32-bit ARM was different. That's not surprising since it doesn't have an integer division instruction, and the function that implements integer division doesn't produce the remainder for free. On 32-bit ARM a branchful version that doesn't need the modulus is used. Reviewed By: yfeldblum Differential Revision: D3806743 fbshipit-source-id: c14c56717e96f135321920e64acbfe9dcb1fe039 --- folly/Makefile.am | 1 + folly/Math.h | 201 +++++++++++++++ folly/test/Makefile.am | 4 + folly/test/MathBenchmark.cpp | 477 +++++++++++++++++++++++++++++++++++ folly/test/MathTest.cpp | 246 ++++++++++++++++++ 5 files changed, 929 insertions(+) create mode 100644 folly/Math.h create mode 100644 folly/test/MathBenchmark.cpp create mode 100644 folly/test/MathTest.cpp diff --git a/folly/Makefile.am b/folly/Makefile.am index baffeb45..b22aae8c 100644 --- a/folly/Makefile.am +++ b/folly/Makefile.am @@ -254,6 +254,7 @@ nobase_follyinclude_HEADERS = \ MallctlHelper.h \ Malloc.h \ MapUtil.h \ + Math.h \ Memory.h \ MemoryMapping.h \ MicroSpinLock.h \ diff --git a/folly/Math.h b/folly/Math.h new file mode 100644 index 00000000..d37833b9 --- /dev/null +++ b/folly/Math.h @@ -0,0 +1,201 @@ +/* + * Copyright 2016 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Some arithmetic functions that seem to pop up or get hand-rolled a lot. + * So far they are all focused on integer division. + */ + +#pragma once + +#include + +#include +#include + +namespace folly { + +namespace detail { + +template +inline constexpr T divFloorBranchless(T num, T denom) { + // floor != trunc when the answer isn't exact and truncation went the + // wrong way (truncation went toward positive infinity). That happens + // when the true answer is negative, which happens when num and denom + // have different signs. The following code compiles branch-free on + // many platforms. + return (num / denom) + + ((num % denom) != 0 ? 1 : 0) * + (std::is_signed::value && (num ^ denom) < 0 ? -1 : 0); +} + +template +inline constexpr T divFloorBranchful(T num, T denom) { + // First case handles negative result by preconditioning numerator. + // Preconditioning decreases the magnitude of the numerator, which is + // itself sign-dependent. Second case handles zero or positive rational + // result, where trunc and floor are the same. + return std::is_signed::value && (num ^ denom) < 0 && num != 0 + ? (num + (num > 0 ? -1 : 1)) / denom - 1 + : num / denom; +} + +template +inline constexpr T divCeilBranchless(T num, T denom) { + // ceil != trunc when the answer isn't exact (truncation occurred) + // and truncation went away from positive infinity. That happens when + // the true answer is positive, which happens when num and denom have + // the same sign. + return (num / denom) + + ((num % denom) != 0 ? 1 : 0) * + (std::is_signed::value && (num ^ denom) < 0 ? 0 : 1); +} + +template +inline constexpr T divCeilBranchful(T num, T denom) { + // First case handles negative or zero rational result, where trunc and ceil + // are the same. + // Second case handles positive result by preconditioning numerator. + // Preconditioning decreases the magnitude of the numerator, which is + // itself sign-dependent. + return (std::is_signed::value && (num ^ denom) < 0) || num == 0 + ? num / denom + : (num + (num > 0 ? -1 : 1)) / denom + 1; +} + +template +inline constexpr T divRoundAwayBranchless(T num, T denom) { + // away != trunc whenever truncation actually occurred, which is when + // there is a non-zero remainder. If the unrounded result is negative + // then fixup moves it toward negative infinity. If the unrounded + // result is positive then adjustment makes it larger. + return (num / denom) + + ((num % denom) != 0 ? 1 : 0) * + (std::is_signed::value && (num ^ denom) < 0 ? -1 : 1); +} + +template +inline constexpr T divRoundAwayBranchful(T num, T denom) { + // First case of second ternary operator handles negative rational + // result, which is the same as divFloor. Second case of second ternary + // operator handles positive result, which is the same as divCeil. + // Zero case is separated for simplicity. + return num == 0 ? 0 + : (num + (num > 0 ? -1 : 1)) / denom + + (std::is_signed::value && (num ^ denom) < 0 ? -1 : 1); +} + +template +using IdivResultType = typename std::enable_if< + std::is_integral::value && std::is_integral::value && + !std::is_same::value && + !std::is_same::value, + decltype(N{1} / D{1})>::type; +} + +#if defined(__arm__) && !FOLLY_A64 +constexpr auto kIntegerDivisionGivesRemainder = false; +#else +constexpr auto kIntegerDivisionGivesRemainder = true; +#endif + +/** + * Returns num/denom, rounded toward negative infinity. Put another way, + * returns the largest integral value that is less than or equal to the + * exact (not rounded) fraction num/denom. + * + * The matching remainder (num - divFloor(num, denom) * denom) can be + * negative only if denom is negative, unlike in truncating division. + * Note that for unsigned types this is the same as the normal integer + * division operator. divFloor is equivalent to python's integral division + * operator //. + * + * This function undergoes the same integer promotion rules as a + * built-in operator, except that we don't allow bool -> int promotion. + * This function is undefined if denom == 0. It is also undefined if the + * result type T is a signed type, num is std::numeric_limits::min(), + * and denom is equal to -1 after conversion to the result type. + */ +template +inline constexpr detail::IdivResultType divFloor(N num, D denom) { + using R = decltype(num / denom); + return kIntegerDivisionGivesRemainder && std::is_signed::value + ? detail::divFloorBranchless(num, denom) + : detail::divFloorBranchful(num, denom); +} + +/** + * Returns num/denom, rounded toward positive infinity. Put another way, + * returns the smallest integral value that is greater than or equal to + * the exact (not rounded) fraction num/denom. + * + * This function undergoes the same integer promotion rules as a + * built-in operator, except that we don't allow bool -> int promotion. + * This function is undefined if denom == 0. It is also undefined if the + * result type T is a signed type, num is std::numeric_limits::min(), + * and denom is equal to -1 after conversion to the result type. + */ +template +inline constexpr detail::IdivResultType divCeil(N num, D denom) { + using R = decltype(num / denom); + return kIntegerDivisionGivesRemainder && std::is_signed::value + ? detail::divCeilBranchless(num, denom) + : detail::divCeilBranchful(num, denom); +} + +/** + * Returns num/denom, rounded toward zero. If num and denom are non-zero + * and have different signs (so the unrounded fraction num/denom is + * negative), returns divCeil, otherwise returns divFloor. If T is an + * unsigned type then this is always equal to divFloor. + * + * Note that this is the same as the normal integer division operator, + * at least since C99 (before then the rounding for negative results was + * implementation defined). This function is here for completeness and + * as a place to hang this comment. + * + * This function undergoes the same integer promotion rules as a + * built-in operator, except that we don't allow bool -> int promotion. + * This function is undefined if denom == 0. It is also undefined if the + * result type T is a signed type, num is std::numeric_limits::min(), + * and denom is equal to -1 after conversion to the result type. + */ +template +inline constexpr detail::IdivResultType divTrunc(N num, D denom) { + return num / denom; +} + +/** + * Returns num/denom, rounded away from zero. If num and denom are + * non-zero and have different signs (so the unrounded fraction num/denom + * is negative), returns divFloor, otherwise returns divCeil. If T is + * an unsigned type then this is always equal to divCeil. + * + * This function undergoes the same integer promotion rules as a + * built-in operator, except that we don't allow bool -> int promotion. + * This function is undefined if denom == 0. It is also undefined if the + * result type T is a signed type, num is std::numeric_limits::min(), + * and denom is equal to -1 after conversion to the result type. + */ +template +inline constexpr detail::IdivResultType divRoundAway(N num, D denom) { + using R = decltype(num / denom); + return kIntegerDivisionGivesRemainder && std::is_signed::value + ? detail::divRoundAwayBranchless(num, denom) + : detail::divRoundAwayBranchful(num, denom); +} + +} // namespace folly diff --git a/folly/test/Makefile.am b/folly/test/Makefile.am index d8c5acae..0ee3e1ff 100644 --- a/folly/test/Makefile.am +++ b/folly/test/Makefile.am @@ -12,6 +12,7 @@ TESTS= \ conv_test \ expected_test \ range_test \ + math_test \ bits_test \ bit_iterator_test @@ -135,6 +136,9 @@ expected_test_LDADD = libfollytestmain.la $(top_builddir)/libfollybenchmark.la range_test_SOURCES = RangeTest.cpp range_test_LDADD = libfollytestmain.la +math_test_SOURCES = MathTest.cpp +math_test_LDADD = libfollytestmain.la $(top_builddir)/libfollybenchmark.la + bits_test_SOURCES = BitsTest.cpp bits_test_LDADD = libfollytestmain.la $(top_builddir)/libfollybenchmark.la diff --git a/folly/test/MathBenchmark.cpp b/folly/test/MathBenchmark.cpp new file mode 100644 index 00000000..30d7e5e6 --- /dev/null +++ b/folly/test/MathBenchmark.cpp @@ -0,0 +1,477 @@ +/* + * Copyright 2016 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include + +namespace { +template +T brokenButWidespreadDivCeil(T num, T denom) { + return (num + denom - 1) / denom; +} + +template +T viaFloatDivCeil(T num, T denom) { + return static_cast(ceilf(static_cast(num) / denom)); +} + +template +T viaDoubleDivCeil(T num, T denom) { + return static_cast(ceil(static_cast(num) / denom)); +} + +template +T viaLongDoubleDivCeil(T num, T denom) { + return static_cast(ceill(static_cast(num) / denom)); +} + +template +std::vector divValues() { + std::vector rv; + for (T i = 1; i < std::numeric_limits::max() && i <= 1000; ++i) { + rv.push_back(i); + rv.push_back(-i); + rv.push_back(std::numeric_limits::max() / i); + auto x = std::numeric_limits::min() / i; + if (x != 0) { + rv.push_back(x); + } + } + return rv; +} + +template +void runDivTests(const F& func, size_t iters) { + std::vector denoms; + std::vector numers; + BENCHMARK_SUSPEND { + denoms = divValues(); + numers = denoms; + numers.push_back(0); + std::mt19937 rnd(1234); + std::shuffle(denoms.begin(), denoms.end(), rnd); + std::shuffle(numers.begin(), numers.end(), rnd); + } + T dep = 0; + while (true) { + for (T d : denoms) { + for (T n : numers) { + n ^= dep; + if (std::is_signed::value && n == std::numeric_limits::min() && + d == -1) { + // min / -1 overflows in two's complement + d = -2; + } + dep = func(n, d); + + if (--iters == 0) { + folly::doNotOptimizeAway(dep); + return; + } + } + } + } +} +} + +BENCHMARK_DRAW_LINE(); +BENCHMARK(divTruncInt8, iters) { + runDivTests(&folly::divTrunc, iters); +} +BENCHMARK(divFloorInt8, iters) { + runDivTests(&folly::divFloor, iters); +} +BENCHMARK(divCeilInt8, iters) { + runDivTests(&folly::divCeil, iters); +} +BENCHMARK_RELATIVE(branchlessDivCeilInt8, iters) { + runDivTests(&folly::detail::divCeilBranchless, iters); +} +BENCHMARK_RELATIVE(branchfulDivCeilInt8, iters) { + runDivTests(&folly::detail::divCeilBranchful, iters); +} +BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt8, iters) { + runDivTests(&brokenButWidespreadDivCeil, iters); +} +BENCHMARK_RELATIVE(viaFloatDivCeilInt8, iters) { + runDivTests(&viaFloatDivCeil, iters); +} +BENCHMARK_RELATIVE(viaDoubleDivCeilInt8, iters) { + runDivTests(&viaDoubleDivCeil, iters); +} +BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt8, iters) { + runDivTests(&viaLongDoubleDivCeil, iters); +} +BENCHMARK(divRoundAwayInt8, iters) { + runDivTests(&folly::divRoundAway, iters); +} + +BENCHMARK_DRAW_LINE(); +BENCHMARK(divTruncInt16, iters) { + runDivTests(&folly::divTrunc, iters); +} +BENCHMARK(divFloorInt16, iters) { + runDivTests(&folly::divFloor, iters); +} +BENCHMARK(divCeilInt16, iters) { + runDivTests(&folly::divCeil, iters); +} +BENCHMARK_RELATIVE(branchlessDivCeilInt16, iters) { + runDivTests(&folly::detail::divCeilBranchless, iters); +} +BENCHMARK_RELATIVE(branchfulDivCeilInt16, iters) { + runDivTests(&folly::detail::divCeilBranchful, iters); +} +BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt16, iters) { + runDivTests(&brokenButWidespreadDivCeil, iters); +} +BENCHMARK_RELATIVE(viaFloatDivCeilInt16, iters) { + runDivTests(&viaFloatDivCeil, iters); +} +BENCHMARK_RELATIVE(viaDoubleDivCeilInt16, iters) { + runDivTests(&viaDoubleDivCeil, iters); +} +BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt16, iters) { + runDivTests(&viaLongDoubleDivCeil, iters); +} +BENCHMARK(divRoundAwayInt16, iters) { + runDivTests(&folly::divRoundAway, iters); +} + +BENCHMARK_DRAW_LINE(); +BENCHMARK(divTruncInt32, iters) { + runDivTests(&folly::divTrunc, iters); +} +BENCHMARK(divFloorInt32, iters) { + runDivTests(&folly::divFloor, iters); +} +BENCHMARK(divCeilInt32, iters) { + runDivTests(&folly::divCeil, iters); +} +BENCHMARK_RELATIVE(branchlessDivCeilInt32, iters) { + runDivTests(&folly::detail::divCeilBranchless, iters); +} +BENCHMARK_RELATIVE(branchfulDivCeilInt32, iters) { + runDivTests(&folly::detail::divCeilBranchful, iters); +} +BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt32, iters) { + runDivTests(&brokenButWidespreadDivCeil, iters); +} +BENCHMARK_RELATIVE(approxViaFloatDivCeilInt32, iters) { + runDivTests(&viaFloatDivCeil, iters); +} +BENCHMARK_RELATIVE(viaDoubleDivCeilInt32, iters) { + runDivTests(&viaDoubleDivCeil, iters); +} +BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt32, iters) { + runDivTests(&viaLongDoubleDivCeil, iters); +} +BENCHMARK(divRoundAwayInt32, iters) { + runDivTests(&folly::divRoundAway, iters); +} + +BENCHMARK_DRAW_LINE(); +BENCHMARK(divTruncInt64, iters) { + runDivTests(&folly::divTrunc, iters); +} +BENCHMARK(divFloorInt64, iters) { + runDivTests(&folly::divFloor, iters); +} +BENCHMARK(divCeilInt64, iters) { + runDivTests(&folly::divCeil, iters); +} +BENCHMARK_RELATIVE(branchlessDivCeilInt64, iters) { + runDivTests(&folly::detail::divCeilBranchless, iters); +} +BENCHMARK_RELATIVE(branchfulDivCeilInt64, iters) { + runDivTests(&folly::detail::divCeilBranchful, iters); +} +BENCHMARK_RELATIVE(brokenButWidespreadDivCeilInt64, iters) { + runDivTests(&brokenButWidespreadDivCeil, iters); +} +BENCHMARK_RELATIVE(approxViaFloatDivCeilInt64, iters) { + runDivTests(&viaFloatDivCeil, iters); +} +BENCHMARK_RELATIVE(approxViaDoubleDivCeilInt64, iters) { + runDivTests(&viaDoubleDivCeil, iters); +} +BENCHMARK_RELATIVE(viaLongDoubleDivCeilInt64, iters) { + runDivTests(&viaLongDoubleDivCeil, iters); +} +BENCHMARK(divRoundAwayInt64, iters) { + runDivTests(&folly::divRoundAway, iters); +} + +BENCHMARK_DRAW_LINE(); +BENCHMARK(divTruncUint8, iters) { + runDivTests(&folly::divTrunc, iters); +} +BENCHMARK(divFloorUint8, iters) { + runDivTests(&folly::divFloor, iters); +} +BENCHMARK(divCeilUint8, iters) { + runDivTests(&folly::divCeil, iters); +} +BENCHMARK_RELATIVE(branchlessDivCeilUint8, iters) { + runDivTests(&folly::detail::divCeilBranchless, iters); +} +BENCHMARK_RELATIVE(branchfulDivCeilUint8, iters) { + runDivTests(&folly::detail::divCeilBranchful, iters); +} +BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint8, iters) { + runDivTests(&brokenButWidespreadDivCeil, iters); +} +BENCHMARK_RELATIVE(viaFloatDivCeilUint8, iters) { + runDivTests(&viaFloatDivCeil, iters); +} +BENCHMARK_RELATIVE(viaDoubleDivCeilUint8, iters) { + runDivTests(&viaDoubleDivCeil, iters); +} +BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint8, iters) { + runDivTests(&viaLongDoubleDivCeil, iters); +} +BENCHMARK(divRoundAwayUint8, iters) { + runDivTests(&folly::divRoundAway, iters); +} + +BENCHMARK_DRAW_LINE(); +BENCHMARK(divTruncUint16, iters) { + runDivTests(&folly::divTrunc, iters); +} +BENCHMARK(divFloorUint16, iters) { + runDivTests(&folly::divFloor, iters); +} +BENCHMARK(divCeilUint16, iters) { + runDivTests(&folly::divCeil, iters); +} +BENCHMARK_RELATIVE(branchlessDivCeilUint16, iters) { + runDivTests(&folly::detail::divCeilBranchless, iters); +} +BENCHMARK_RELATIVE(branchfulDivCeilUint16, iters) { + runDivTests(&folly::detail::divCeilBranchful, iters); +} +BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint16, iters) { + runDivTests(&brokenButWidespreadDivCeil, iters); +} +BENCHMARK_RELATIVE(viaFloatDivCeilUint16, iters) { + runDivTests(&viaFloatDivCeil, iters); +} +BENCHMARK_RELATIVE(viaDoubleDivCeilUint16, iters) { + runDivTests(&viaDoubleDivCeil, iters); +} +BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint16, iters) { + runDivTests(&viaLongDoubleDivCeil, iters); +} +BENCHMARK(divRoundAwayUint16, iters) { + runDivTests(&folly::divRoundAway, iters); +} + +BENCHMARK_DRAW_LINE(); +BENCHMARK(divTruncUint32, iters) { + runDivTests(&folly::divTrunc, iters); +} +BENCHMARK(divFloorUint32, iters) { + runDivTests(&folly::divFloor, iters); +} +BENCHMARK(divCeilUint32, iters) { + runDivTests(&folly::divCeil, iters); +} +BENCHMARK_RELATIVE(branchlessDivCeilUint32, iters) { + runDivTests(&folly::detail::divCeilBranchless, iters); +} +BENCHMARK_RELATIVE(branchfulDivCeilUint32, iters) { + runDivTests(&folly::detail::divCeilBranchful, iters); +} +BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint32, iters) { + runDivTests(&brokenButWidespreadDivCeil, iters); +} +BENCHMARK_RELATIVE(approxViaFloatDivCeilUint32, iters) { + runDivTests(&viaFloatDivCeil, iters); +} +BENCHMARK_RELATIVE(viaDoubleDivCeilUint32, iters) { + runDivTests(&viaDoubleDivCeil, iters); +} +BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint32, iters) { + runDivTests(&viaLongDoubleDivCeil, iters); +} +BENCHMARK(divRoundAwayUint32, iters) { + runDivTests(&folly::divRoundAway, iters); +} + +BENCHMARK_DRAW_LINE(); +BENCHMARK(divTruncUint64, iters) { + runDivTests(&folly::divTrunc, iters); +} +BENCHMARK(divFloorUint64, iters) { + runDivTests(&folly::divFloor, iters); +} +BENCHMARK(divCeilUint64, iters) { + runDivTests(&folly::divCeil, iters); +} +BENCHMARK_RELATIVE(branchlessDivCeilUint64, iters) { + runDivTests(&folly::detail::divCeilBranchless, iters); +} +BENCHMARK_RELATIVE(branchfulDivCeilUint64, iters) { + runDivTests(&folly::detail::divCeilBranchful, iters); +} +BENCHMARK_RELATIVE(brokenButWidespreadDivCeilUint64, iters) { + runDivTests(&brokenButWidespreadDivCeil, iters); +} +BENCHMARK_RELATIVE(approxViaFloatDivCeilUint64, iters) { + runDivTests(&viaFloatDivCeil, iters); +} +BENCHMARK_RELATIVE(approxViaDoubleDivCeilUint64, iters) { + runDivTests(&viaDoubleDivCeil, iters); +} +BENCHMARK_RELATIVE(viaLongDoubleDivCeilUint64, iters) { + runDivTests(&viaLongDoubleDivCeil, iters); +} +BENCHMARK(divRoundAwayUint64, iters) { + runDivTests(&folly::divRoundAway, iters); +} + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + folly::runBenchmarks(); + return 0; +} + +/* +Benchmarks run single-threaded on a dual Xeon E5-2660 @ 2.2 Ghz with +hyperthreading (16 physical cores, 20 MB cache per socket, 256 GB RAM) + +Benchmarks used --bm_min_iters=10000000. + +divTrunc is just a native integral division. viaDoubleViaCeil doesn't +have full accuracy for Int64 or Uint64. There is a loop-carried +dependency for all of the div* tests, but there is a bit of extra slack +(a predictable call, a load that should be from the L1, and a predictable +not-taken branch in addition to the loop's branch) in the driving loop, +so the benchmark driver's attempt to subtract the overhead of the loop +might mean that the latency numbers here are slightly too low or too high. + +The branchful implementation's branch is very predictable in this +microbenchmark for unsigned types, since it only needs to predict a +zero numerator. That's likely to be true in real life as well, so we +make this the default. + +I was surprised at the speed of float and double division, but +the only case where it actually wins by much and is correct is for +int16_t. (float + ceil is faster for the 32-bit case, but is only +an approximation.) I ran a similar benchmark setup for ARM and ARM64. +On ARM the conditional versions win by quite a bit. 32-bit ARM doesn't +have a native integer divide, so getting the remainder after a division +(to see if truncation occurred) is more work than preconditioning the +numerator to make truncation go in the correct direction. 64-bit ARM +had the same winners and losers as x86_64, at least on the two physical +instances I tested. + +============================================================================ +folly/test/MathBenchmark.cpp relative time/iter iters/s +============================================================================ +---------------------------------------------------------------------------- +divTruncInt8 8.89ns 112.44M +divFloorInt8 10.99ns 91.00M +divCeilInt8 10.95ns 91.33M +branchlessDivCeilInt8 100.40% 10.91ns 91.69M +branchfulDivCeilInt8 88.87% 12.32ns 81.16M +brokenButWidespreadDivCeilInt8 109.20% 10.03ns 99.73M +viaFloatDivCeilInt8 109.68% 9.98ns 100.17M +viaDoubleDivCeilInt8 95.47% 11.47ns 87.19M +viaLongDoubleDivCeilInt8 31.65% 34.59ns 28.91M +divRoundAwayInt8 10.42ns 95.97M +---------------------------------------------------------------------------- +divTruncInt16 8.68ns 115.17M +divFloorInt16 10.94ns 91.38M +divCeilInt16 10.91ns 91.70M +branchlessDivCeilInt16 99.44% 10.97ns 91.18M +branchfulDivCeilInt16 81.68% 13.35ns 74.90M +brokenButWidespreadDivCeilInt16 109.50% 9.96ns 100.40M +viaFloatDivCeilInt16 108.04% 10.09ns 99.07M +viaDoubleDivCeilInt16 85.38% 12.77ns 78.29M +viaLongDoubleDivCeilInt16 29.99% 36.36ns 27.50M +divRoundAwayInt16 10.59ns 94.46M +---------------------------------------------------------------------------- +divTruncInt32 8.38ns 119.29M +divFloorInt32 11.01ns 90.84M +divCeilInt32 11.12ns 89.91M +branchlessDivCeilInt32 101.94% 10.91ns 91.66M +branchfulDivCeilInt32 84.67% 13.14ns 76.12M +brokenButWidespreadDivCeilInt32 117.61% 9.46ns 105.75M +approxViaFloatDivCeilInt32 115.98% 9.59ns 104.28M +viaDoubleDivCeilInt32 89.86% 12.38ns 80.79M +viaLongDoubleDivCeilInt32 30.84% 36.06ns 27.73M +divRoundAwayInt32 11.30ns 88.50M +---------------------------------------------------------------------------- +divTruncInt64 16.07ns 62.21M +divFloorInt64 18.37ns 54.45M +divCeilInt64 18.61ns 53.74M +branchlessDivCeilInt64 100.43% 18.53ns 53.97M +branchfulDivCeilInt64 84.65% 21.98ns 45.49M +brokenButWidespreadDivCeilInt64 108.47% 17.16ns 58.29M +approxViaFloatDivCeilInt64 190.99% 9.74ns 102.64M +approxViaDoubleDivCeilInt64 148.64% 12.52ns 79.88M +viaLongDoubleDivCeilInt64 52.01% 35.77ns 27.95M +divRoundAwayInt64 18.79ns 53.21M +---------------------------------------------------------------------------- +divTruncUint8 7.76ns 128.89M +divFloorUint8 8.29ns 120.61M +divCeilUint8 9.61ns 104.09M +branchlessDivCeilUint8 112.00% 8.58ns 116.58M +branchfulDivCeilUint8 114.01% 8.43ns 118.67M +brokenButWidespreadDivCeilUint8 100.48% 9.56ns 104.58M +viaFloatDivCeilUint8 103.53% 9.28ns 107.76M +viaDoubleDivCeilUint8 85.75% 11.20ns 89.26M +viaLongDoubleDivCeilUint8 27.72% 34.65ns 28.86M +divRoundAwayUint8 9.60ns 104.11M +---------------------------------------------------------------------------- +divTruncUint16 8.39ns 119.19M +divFloorUint16 8.28ns 120.82M +divCeilUint16 9.90ns 100.96M +branchlessDivCeilUint16 100.23% 9.88ns 101.19M +branchfulDivCeilUint16 107.83% 9.19ns 108.87M +brokenButWidespreadDivCeilUint16 99.89% 9.92ns 100.85M +viaFloatDivCeilUint16 100.54% 9.85ns 101.50M +viaDoubleDivCeilUint16 77.38% 12.80ns 78.13M +viaLongDoubleDivCeilUint16 27.30% 36.28ns 27.56M +divRoundAwayUint16 9.82ns 101.85M +---------------------------------------------------------------------------- +divTruncUint32 8.12ns 123.20M +divFloorUint32 8.09ns 123.58M +divCeilUint32 8.44ns 118.55M +branchlessDivCeilUint32 88.27% 9.56ns 104.64M +branchfulDivCeilUint32 98.91% 8.53ns 117.25M +brokenButWidespreadDivCeilUint32 93.48% 9.02ns 110.82M +approxViaFloatDivCeilUint32 86.29% 9.78ns 102.30M +viaDoubleDivCeilUint32 66.76% 12.63ns 79.15M +viaLongDoubleDivCeilUint32 23.35% 36.13ns 27.68M +divRoundAwayUint32 8.47ns 118.03M +---------------------------------------------------------------------------- +divTruncUint64 12.38ns 80.79M +divFloorUint64 12.27ns 81.47M +divCeilUint64 12.66ns 78.99M +branchlessDivCeilUint64 93.46% 13.55ns 73.83M +branchfulDivCeilUint64 100.30% 12.62ns 79.23M +brokenButWidespreadDivCeilUint64 99.41% 12.73ns 78.53M +approxViaFloatDivCeilUint64 106.59% 11.88ns 84.19M +approxViaDoubleDivCeilUint64 92.14% 13.74ns 72.78M +viaLongDoubleDivCeilUint64 33.51% 37.78ns 26.47M +divRoundAwayUint64 12.34ns 81.02M +============================================================================ +*/ diff --git a/folly/test/MathTest.cpp b/folly/test/MathTest.cpp new file mode 100644 index 00000000..a3023edc --- /dev/null +++ b/folly/test/MathTest.cpp @@ -0,0 +1,246 @@ +/* + * Copyright 2016 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +using namespace folly; +using namespace folly::detail; + +namespace { + +// Workaround for https://llvm.org/bugs/show_bug.cgi?id=16404, +// issues with __int128 multiplication and UBSAN +template +T mul(T lhs, T rhs) { + if (rhs < 0) { + rhs = -rhs; + lhs = -lhs; + } + T accum = 0; + while (rhs != 0) { + if ((rhs & 1) != 0) { + accum += lhs; + } + lhs += lhs; + rhs >>= 1; + } + return accum; +} + +template +T referenceDivFloor(T numer, T denom) { + // rv = largest integral value <= numer / denom + B n = numer; + B d = denom; + if (d < 0) { + d = -d; + n = -n; + } + B r = n / d; + while (mul(r, d) > n) { + --r; + } + while (mul(r + 1, d) <= n) { + ++r; + } + T rv = static_cast(r); + assert(static_cast(rv) == r); + return rv; +} + +template +T referenceDivCeil(T numer, T denom) { + // rv = smallest integral value >= numer / denom + B n = numer; + B d = denom; + if (d < 0) { + d = -d; + n = -n; + } + B r = n / d; + while (mul(r, d) < n) { + ++r; + } + while (mul(r - 1, d) >= n) { + --r; + } + T rv = static_cast(r); + assert(static_cast(rv) == r); + return rv; +} + +template +T referenceDivRoundAway(T numer, T denom) { + if ((numer < 0) != (denom < 0)) { + return referenceDivFloor(numer, denom); + } else { + return referenceDivCeil(numer, denom); + } +} + +template +std::vector cornerValues() { + std::vector rv; + for (T i = 1; i < 24; ++i) { + rv.push_back(i); + rv.push_back(std::numeric_limits::max() / i); + rv.push_back(std::numeric_limits::max() - i); + rv.push_back(std::numeric_limits::max() / 2 - i); + if (std::is_signed::value) { + rv.push_back(-i); + rv.push_back(std::numeric_limits::min() / i); + rv.push_back(std::numeric_limits::min() + i); + rv.push_back(std::numeric_limits::min() / 2 + i); + } + } + return rv; +} + +template +void runDivTests() { + using T = decltype(static_cast(1) / static_cast(1)); + auto numers = cornerValues(); + numers.push_back(0); + auto denoms = cornerValues(); + for (A n : numers) { + for (B d : denoms) { + if (std::is_signed::value && n == std::numeric_limits::min() && + d == static_cast(-1)) { + // n / d overflows in two's complement + continue; + } + EXPECT_EQ(divCeil(n, d), (referenceDivCeil(n, d))) << n << "/" << d; + EXPECT_EQ(divFloor(n, d), (referenceDivFloor(n, d))) << n << "/" + << d; + EXPECT_EQ(divTrunc(n, d), n / d) << n << "/" << d; + EXPECT_EQ(divRoundAway(n, d), (referenceDivRoundAway(n, d))) + << n << "/" << d; + T nn = n; + T dd = d; + EXPECT_EQ(divCeilBranchless(nn, dd), divCeilBranchful(nn, dd)); + EXPECT_EQ(divFloorBranchless(nn, dd), divFloorBranchful(nn, dd)); + EXPECT_EQ(divRoundAwayBranchless(nn, dd), divRoundAwayBranchful(nn, dd)); + } + } +} +} + +TEST(Bits, divTestInt8) { + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); +#ifdef FOLLY_HAVE_INT128_T + runDivTests(); + runDivTests(); +#endif +} +TEST(Bits, divTestInt16) { + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); +#ifdef FOLLY_HAVE_INT128_T + runDivTests(); + runDivTests(); +#endif +} +TEST(Bits, divTestInt32) { + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); +#ifdef FOLLY_HAVE_INT128_T + runDivTests(); + runDivTests(); +#endif +} +#ifdef FOLLY_HAVE_INT128_T +TEST(Bits, divTestInt64) { + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); +} +#endif +TEST(Bits, divTestUint8) { + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); +#ifdef FOLLY_HAVE_INT128_T + runDivTests(); + runDivTests(); +#endif +} +TEST(Bits, divTestUint16) { + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); +#ifdef FOLLY_HAVE_INT128_T + runDivTests(); + runDivTests(); +#endif +} +TEST(Bits, divTestUint32) { + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); +#ifdef FOLLY_HAVE_INT128_T + runDivTests(); + runDivTests(); +#endif +} +#ifdef FOLLY_HAVE_INT128_T +TEST(Bits, divTestUint64) { + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); + runDivTests(); +} +#endif -- 2.34.1