From: Steve O'Brien Date: Sun, 7 Jun 2015 22:00:02 +0000 (-0700) Subject: decimal conversion: digits10 using bit-counting instructions on x86-64 X-Git-Tag: v0.45.0~14 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=7efcdea30699df51cce989c860d2e88ddfe1bdc2;p=folly.git decimal conversion: digits10 using bit-counting instructions on x86-64 Summary: To estimate a number's base-10 length in digits, use the instruction `bsrq` (Bit Scan Reverse) to count significant bits. From that, approximate the base-10 length. Tries to avoid branchiness, expensive math, and loops. Test Plan: 1) Tested correctness by comparing results with `snprintf` and ensuring same string lengths. Tested at each boundary condition (2^k)-1, 2^k, (2^k+1); and similar for base 10. 2) Benchmarked with gcc 4.9 and clang 3.5. Before/after values are millions of operations / sec (columns for each compiler: digits, before, after, after/before ratio) GCC 4.9 Clang 3.5 1 111.09 111.7 1.005 1 115.36 393.81 3.414 2 115.36 111.7 0.968 2 115.36 393.89 3.414 3 114.91 111.34 0.969 3 111.09 393.56 3.543 4 114.91 111.34 0.969 4 111.09 393.86 3.545 5 115.36 111.36 0.965 5 111.09 392.18 3.530 6 99.99 104.32 1.043 6 103.43 393.74 3.807 7 83.31 84.71 1.017 7 81.06 268.39 3.311 8 76.9 78.23 1.017 8 76.91 268.26 3.488 9 62.48 68.26 1.093 9 65.56 190 2.898 10 59.98 63.65 1.061 10 61.17 190.54 3.115 11 50.6 55.87 1.104 11 54.54 148.03 2.714 12 47.19 51.7 1.096 12 50.84 148.57 2.922 13 40.53 46.99 1.159 13 43.33 115.91 2.675 14 40.48 43.42 1.073 14 41.5 115.97 2.794 15 34.92 40.21 1.151 15 37.27 94.89 2.546 16 33.49 37.51 1.120 16 35.77 94.88 2.653 17 29.89 35.02 1.172 17 31.7 80.67 2.545 18 29.11 32.98 1.133 18 30.76 80.63 2.621 19 26.58 31.05 1.168 19 28.22 70.15 2.486 20 25.64 29.38 1.146 20 27.96 70.16 2.509 Reviewed By: ldbrandy@fb.com Subscribers: dancol, trunkagent, marcelo, chalfant, maoy, folly-diffs@, yzhan, yfeldblum FB internal diff: D1934777 Signature: t1:1934777:1433523486:3acbe7ed9c9560c44194f854754529041eaa289d --- diff --git a/folly/Conv.h b/folly/Conv.h index 
ad2d32c1..cf7d8662 100644 --- a/folly/Conv.h +++ b/folly/Conv.h @@ -228,6 +228,43 @@ unsafeTelescope128(char * buffer, size_t room, unsigned __int128 x) { */ inline uint32_t digits10(uint64_t v) { +#ifdef __x86_64__ + + // For this arch we can get a little help from specialized CPU instructions + // which can count leading zeroes; 64 minus that is appx. log (base 2). + // Use that to approximate base-10 digits (log_10) and then adjust if needed. + + // 10^i, defined for i 0 through 19. + // This is 20 * 8 == 160 bytes, which occupies 3 cache lines when 64-byte aligned + // (assuming a cache line size of 64). + static const uint64_t powersOf10[20] __attribute__((__aligned__(64))) = { + 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, + 10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000, + 1000000000000000, 10000000000000000, 100000000000000000, + 1000000000000000000, 10000000000000000000UL + }; + + // "count leading zeroes" operation is not valid for 0; special-case this. + if UNLIKELY (! v) { + return 1; + } + + // bits is in the ballpark of log_2(v). + const uint8_t leadingZeroes = __builtin_clzll(v); + const auto bits = 63 - leadingZeroes; + + // approximate log_10(v) == log_10(2) * bits. + // Integer magic below: 77/256 is appx. 0.3010 (log_10(2)). + // The +1 is to make this the ceiling of the log_10 estimate. + const auto minLength = 1 + ((bits * 77) >> 8); + + // return that log_10 lower bound, plus adjust if input >= 10^(that bound) + // in case there's a small error and we misjudged length. 
+ return minLength + + (UNLIKELY (v >= powersOf10[minLength])); + +#else + uint32_t result = 1; for (;;) { if (LIKELY(v < 10)) return result; @@ -238,6 +275,8 @@ inline uint32_t digits10(uint64_t v) { v /= 10000U; result += 4; } + +#endif } /** diff --git a/folly/test/ConvTest.cpp b/folly/test/ConvTest.cpp index 04fac62a..bf4c68dd 100644 --- a/folly/test/ConvTest.cpp +++ b/folly/test/ConvTest.cpp @@ -34,6 +34,60 @@ static uint32_t u32; static int64_t s64; static uint64_t u64; +TEST(Conv, digits10Minimal) { + // Not much of a test (and it's included in the test below anyway). + // I just want to inspect the generated assembly for this function. + folly::doNotOptimizeAway(digits10(random() * random())); +} + +TEST(Conv, digits10) { + char buffer[100]; + uint64_t power; + + // first, some basic sniff tests + EXPECT_EQ( 1, digits10(0)); + EXPECT_EQ( 1, digits10(1)); + EXPECT_EQ( 1, digits10(9)); + EXPECT_EQ( 2, digits10(10)); + EXPECT_EQ( 2, digits10(99)); + EXPECT_EQ( 3, digits10(100)); + EXPECT_EQ( 3, digits10(999)); + EXPECT_EQ( 4, digits10(1000)); + EXPECT_EQ( 4, digits10(9999)); + EXPECT_EQ(20, digits10(18446744073709551615ULL)); + + // try the first X nonnegatives. 
+ // Covers some more cases of 2^p, 10^p + for (uint64_t i = 0; i < 100000; i++) { + snprintf(buffer, sizeof(buffer), "%lu", i); + EXPECT_EQ(strlen(buffer), digits10(i)); + } + + // try powers of 2 + power = 1; + for (int p = 0; p < 64; p++) { + snprintf(buffer, sizeof(buffer), "%lu", power); + EXPECT_EQ(strlen(buffer), digits10(power)); + snprintf(buffer, sizeof(buffer), "%lu", power - 1); + EXPECT_EQ(strlen(buffer), digits10(power - 1)); + snprintf(buffer, sizeof(buffer), "%lu", power + 1); + EXPECT_EQ(strlen(buffer), digits10(power + 1)); + power *= 2; + } + + // try powers of 10 + power = 1; + for (int p = 0; p < 20; p++) { + snprintf(buffer, sizeof(buffer), "%lu", power); + EXPECT_EQ(strlen(buffer), digits10(power)); + snprintf(buffer, sizeof(buffer), "%lu", power - 1); + EXPECT_EQ(strlen(buffer), digits10(power - 1)); + snprintf(buffer, sizeof(buffer), "%lu", power + 1); + EXPECT_EQ(strlen(buffer), digits10(power + 1)); + power *= 10; + } +} + // Test to(T) TEST(Conv, Type2Type) { int intV = 42;