From: Steve O'Brien <steveo@fb.com>
Date: Sun, 7 Jun 2015 22:00:02 +0000 (-0700)
Subject: decimal conversion: digits10 using bit-counting instructions on x86-64
X-Git-Tag: v0.45.0~14
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=7efcdea30699df51cce989c860d2e88ddfe1bdc2;p=folly.git

decimal conversion: digits10 using bit-counting instructions on x86-64

Summary: To estimate length of a number's base-10 length in digits, use insn `bsrq` (Bit Scan Reverse) to count significant bits.  From that, approximate base-10 length.  Tries to avoid branchiness, expensive math, and loops.

Test Plan:
1) Tested correctness by comparing results with `snprintf` and ensuring same string lengths.  Tested at each boundary condition (2^k)-1, 2^k, (2^k+1); and similar for base 10.
2) Benchmarked with gcc 4.9 and clang 3.5.

Before/after values are millions operations / sec
GCC 4.9                             Clang 3.5
1       111.09  111.7   1.005       1       115.36  393.81  3.414
2       115.36  111.7   0.968       2       115.36  393.89  3.414
3       114.91  111.34  0.969       3       111.09  393.56  3.543
4       114.91  111.34  0.969       4       111.09  393.86  3.545
5       115.36  111.36  0.965       5       111.09  392.18  3.530
6        99.99  104.32  1.043       6       103.43  393.74  3.807
7        83.31   84.71  1.017       7        81.06  268.39  3.311
8        76.9    78.23  1.017       8        76.91  268.26  3.488
9        62.48   68.26  1.093       9        65.56  190     2.898
10        59.98   63.65  1.061      10        61.17  190.54  3.115
11        50.6    55.87  1.104      11        54.54  148.03  2.714
12        47.19   51.7   1.096      12        50.84  148.57  2.922
13        40.53   46.99  1.159      13        43.33  115.91  2.675
14        40.48   43.42  1.073      14        41.5   115.97  2.794
15        34.92   40.21  1.151      15        37.27   94.89  2.546
16        33.49   37.51  1.120      16        35.77   94.88  2.653
17        29.89   35.02  1.172      17        31.7    80.67  2.545
18        29.11   32.98  1.133      18        30.76   80.63  2.621
19        26.58   31.05  1.168      19        28.22   70.15  2.486
20        25.64   29.38  1.146      20        27.96   70.16  2.509

Reviewed By: ldbrandy@fb.com

Subscribers: dancol, trunkagent, marcelo, chalfant, maoy, folly-diffs@, yzhan, yfeldblum

FB internal diff: D1934777

Signature: t1:1934777:1433523486:3acbe7ed9c9560c44194f854754529041eaa289d
---

diff --git a/folly/Conv.h b/folly/Conv.h
index ad2d32c1..cf7d8662 100644
--- a/folly/Conv.h
+++ b/folly/Conv.h
@@ -228,6 +228,43 @@ unsafeTelescope128(char * buffer, size_t room, unsigned __int128 x) {
  */
 
 inline uint32_t digits10(uint64_t v) {
+#ifdef __x86_64__
+
+  // For this arch we can get a little help from specialized CPU instructions
+  // which can count leading zeroes; 64 minus that is appx. log (base 2).
+  // Use that to approximate base-10 digits (log_10) and then adjust if needed.
+
+  // 10^i, defined for i 0 through 19.
+  // This is 20 * 8 == 160 bytes, which fits neatly into 5 cache lines
+  // (assuming a cache line size of 64).
+  static const uint64_t powersOf10[20] __attribute__((__aligned__(64))) = {
+    1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000,
+    10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000,
+    1000000000000000, 10000000000000000, 100000000000000000,
+    1000000000000000000, 10000000000000000000UL
+  };
+
+  // "count leading zeroes" operation not valid; for 0; special case this.
+  if UNLIKELY (! v) {
+    return 1;
+  }
+
+  // bits is in the ballpark of log_2(v).
+  const uint8_t leadingZeroes = __builtin_clzll(v);
+  const auto bits = 63 - leadingZeroes;
+
+  // approximate log_10(v) == log_10(2) * bits.
+  // Integer magic below: 77/256 is appx. 0.3010 (log_10(2)).
+  // The +1 is to make this the ceiling of the log_10 estimate.
+  const auto minLength = 1 + ((bits * 77) >> 8);
+
+  // return that log_10 lower bound, plus adjust if input >= 10^(that bound)
+  // in case there's a small error and we misjudged length.
+  return minLength +
+         (UNLIKELY (v >= powersOf10[minLength]));
+
+#else
+
   uint32_t result = 1;
   for (;;) {
     if (LIKELY(v < 10)) return result;
@@ -238,6 +275,8 @@ inline uint32_t digits10(uint64_t v) {
     v /= 10000U;
     result += 4;
   }
+
+#endif
 }
 
 /**
diff --git a/folly/test/ConvTest.cpp b/folly/test/ConvTest.cpp
index 04fac62a..bf4c68dd 100644
--- a/folly/test/ConvTest.cpp
+++ b/folly/test/ConvTest.cpp
@@ -34,6 +34,60 @@ static uint32_t u32;
 static int64_t s64;
 static uint64_t u64;
 
+TEST(Conv, digits10Minimal) {
+  // Not much of a test (and it's included in the test below anyway).
+  // I just want to inspect the generated assembly for this function.
+  folly::doNotOptimizeAway(digits10(random() * random()));
+}
+
+TEST(Conv, digits10) {
+  char buffer[100];
+  uint64_t power;
+
+  // first, some basic sniff tests
+  EXPECT_EQ( 1, digits10(0));
+  EXPECT_EQ( 1, digits10(1));
+  EXPECT_EQ( 1, digits10(9));
+  EXPECT_EQ( 2, digits10(10));
+  EXPECT_EQ( 2, digits10(99));
+  EXPECT_EQ( 3, digits10(100));
+  EXPECT_EQ( 3, digits10(999));
+  EXPECT_EQ( 4, digits10(1000));
+  EXPECT_EQ( 4, digits10(9999));
+  EXPECT_EQ(20, digits10(18446744073709551615ULL));
+
+  // try the first X nonnegatives.
+  // Covers some more cases of 2^p, 10^p
+  for (uint64_t i = 0; i < 100000; i++) {
+    snprintf(buffer, sizeof(buffer), "%lu", i);
+    EXPECT_EQ(strlen(buffer), digits10(i));
+  }
+
+  // try powers of 2
+  power = 1;
+  for (int p = 0; p < 64; p++) {
+    snprintf(buffer, sizeof(buffer), "%lu", power);
+    EXPECT_EQ(strlen(buffer), digits10(power));
+    snprintf(buffer, sizeof(buffer), "%lu", power - 1);
+    EXPECT_EQ(strlen(buffer), digits10(power - 1));
+    snprintf(buffer, sizeof(buffer), "%lu", power + 1);
+    EXPECT_EQ(strlen(buffer), digits10(power + 1));
+    power *= 2;
+  }
+
+  // try powers of 10
+  power = 1;
+  for (int p = 0; p < 20; p++) {
+    snprintf(buffer, sizeof(buffer), "%lu", power);
+    EXPECT_EQ(strlen(buffer), digits10(power));
+    snprintf(buffer, sizeof(buffer), "%lu", power - 1);
+    EXPECT_EQ(strlen(buffer), digits10(power - 1));
+    snprintf(buffer, sizeof(buffer), "%lu", power + 1);
+    EXPECT_EQ(strlen(buffer), digits10(power + 1));
+    power *= 10;
+  }
+}
+
 // Test to<T>(T)
 TEST(Conv, Type2Type) {
   int intV = 42;