From: Andrei Alexandrescu Date: Tue, 10 Jul 2012 18:42:34 +0000 (-0700) Subject: Better unsigned to string conversion X-Git-Tag: v0.22.0~1180 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=43bdc5d7d0bee944c1f437f3e159f32d4c57f3cd;p=folly.git Better unsigned to string conversion Summary: In https://phabricator.fb.com/D511928 Brian mentioned the current API for string append is insufficient for appending to a buffer. That made me curious about the relative performance of classic and table-based number to ASCII conversions. The results were interesting as on the average (over all digit lengths) the table-based conversion was faster, but performance was lackluster (in the worst case half as fast as the classic implementation) for large numbers, I presume due to the cache misses incurred by the tables. This diff proposes an improved unsigned-to-ASCII primitive that is much faster than both table-based (previous Folly) and classic primitive. The key is a fast digits10() implementation that precomputes the space required by the conversion. After that, the digits are issued in place, no more reverse required. The new routine is up to 14x faster than the classic implementation, depending on the number of digits (benchmarks in comments). Adding a few people who may be interested in the matter. Brian, thanks for bringing this matter up; if this gets in you may want to use the folly routine in proxygen. Test Plan: unittest and benchmarks. Reviewed By: simpkins@fb.com FB internal diff: D515572 --- diff --git a/folly/Benchmark.h b/folly/Benchmark.h index 444ad81d..2b68c4fa 100644 --- a/folly/Benchmark.h +++ b/folly/Benchmark.h @@ -190,11 +190,14 @@ addBenchmark(const char* file, const char* name, Lambda&& lambda) { timespec start, end; // CORE MEASUREMENT STARTS - CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &start)); + auto const r1 = clock_gettime(detail::DEFAULT_CLOCK_ID, &start); lambda(times); - CHECK_EQ(0, clock_gettime(detail::DEFAULT_CLOCK_ID, &end)); + auto const r2 = clock_gettime(detail::DEFAULT_CLOCK_ID, &end); // CORE MEASUREMENT ENDS + CHECK_EQ(0, r1); + CHECK_EQ(0, r2); + return detail::timespecDiff(end, start) - BenchmarkSuspender::nsSpent; }; diff --git a/folly/Conv.h b/folly/Conv.h index 98c66b3d..98ff2c01 100644 --- a/folly/Conv.h +++ b/folly/Conv.h @@ -116,41 +116,63 @@ typename std::tuple_element< return getLastElement(vs...); } +} // namespace detail + /******************************************************************************* * Conversions from integral types to string types. ******************************************************************************/ -// Returns the offset of the formatted string from the start of -// the supplied buffer. The new string will be at range -// [buf+begin,buf+bufLen). Uint will be either uint32_t or uint64_t. -template -size_t uintToBuffer(char*const buffer, size_t bufLen, Uint v) { - extern const char digit1[101], digit2[101]; +/** + * Returns the number of digits in the base 10 representation of an + * uint64_t. Useful for preallocating buffers and such. It's also used + * internally, see below. Measurements suggest that defining a + * separate overload for 32-bit integers is not worthwhile. + */ + +inline uint32_t digits10(uint64_t v) { + uint32_t result = 1; for (;;) { - if (v < 100) { - if (v < 10) { - buffer[--bufLen] = static_cast(v + '0'); - } else { - size_t r = static_cast(v); - bufLen -= 2; - buffer[bufLen] = digit1[r]; - buffer[bufLen + 1] = digit2[r]; - } - break; - } - Uint t = v; - v /= 100; - size_t r = static_cast (t - v * 100); - bufLen -= 2; - buffer[bufLen] = digit1[r]; - buffer[bufLen + 1] = digit2[r]; + if (LIKELY(v < 10)) return result; + if (LIKELY(v < 100)) return result + 1; + if (LIKELY(v < 1000)) return result + 2; + if (LIKELY(v < 10000)) return result + 3; + // Skip ahead by 4 orders of magnitude + v /= 10000U; + result += 4; } - return bufLen; } -const size_t kMaxInt64BufLen = 21;// 19 + 1 for possible '-' sign + 1 for \0 +/** + * Copies the ASCII base 10 representation of v into buffer and + * returns the number of bytes written. Does NOT append a \0. Assumes + * the buffer points to digits10(v) bytes of valid memory. Note that + * uint64 needs at most 20 bytes, uint32_t needs at most 10 bytes, + * uint16_t needs at most 5 bytes, and so on. Measurements suggest + * that defining a separate overload for 32-bit integers is not + * worthwhile. + * + * This primitive is unsafe because it makes the size assumption and + * because it does not add a terminating \0. + */ -} // namespace detail +inline uint32_t uint64ToBufferUnsafe(uint64_t v, char *const buffer) { + auto const result = digits10(v); + // WARNING: using size_t or pointer arithmetic for pos slows down + // the loop below 20x. This is because several 32-bit ops can be + // done in parallel, but only fewer 64-bit ones. + uint32_t pos = result - 1; + while (v >= 10) { + // Keep these together so a peephole optimization "sees" them and + // computes them in one shot. + auto const q = v / 10; + auto const r = static_cast(v % 10); + buffer[pos--] = '0' + r; + v = q; + } + // Last digit is trivial to handle + buffer[pos] = static_cast(v) + '0'; + return result; +} /** * A single char gets appended. @@ -222,18 +244,13 @@ typename std::enable_if< && detail::IsSomeString::value && sizeof(Src) >= 4>::type toAppend(Src value, Tgt * result) { typedef typename std::make_unsigned::type Usrc; - char buffer[detail::kMaxInt64BufLen]; - size_t begin; + char buffer[20]; if (value < 0) { - begin = detail::uintToBuffer(buffer, sizeof(buffer), - static_cast(-value)); - DCHECK_GE(begin, 1); - buffer[--begin] = '-'; + result->push_back('-'); + result->append(buffer, uint64ToBufferUnsafe(-uint64_t(value), buffer)); } else { - begin = detail::uintToBuffer(buffer, sizeof(buffer), - static_cast(value)); + result->append(buffer, uint64ToBufferUnsafe(value, buffer)); } - result->append(buffer + begin, buffer + sizeof(buffer)); } /** @@ -244,9 +261,8 @@ typename std::enable_if< std::is_integral::value && !std::is_signed::value && detail::IsSomeString::value && sizeof(Src) >= 4>::type toAppend(Src value, Tgt * result) { - char buffer[detail::kMaxInt64BufLen]; - const size_t begin = detail::uintToBuffer(buffer, sizeof(buffer), value); - result->append(buffer + begin, buffer + sizeof(buffer)); + char buffer[20]; + result->append(buffer, buffer + uint64ToBufferUnsafe(value, buffer)); } /** diff --git a/folly/Format-inl.h b/folly/Format-inl.h index 59f4550a..3e5cf212 100644 --- a/folly/Format-inl.h +++ b/folly/Format-inl.h @@ -429,9 +429,8 @@ class FormatValue< useSprintf("%'ju"); } else { // Use uintToBuffer, faster than sprintf - valBufEnd = valBuf + valBufSize - 1; - valBufBegin = valBuf + detail::uintToBuffer(valBuf, valBufSize - 1, - uval); + valBufBegin = valBuf + 3; + valBufEnd = valBufBegin + uint64ToBufferUnsafe(uval, valBufBegin); } break; case 'c': diff --git a/folly/test/ConvTest.cpp b/folly/test/ConvTest.cpp index a67eef85..fed93e5a 100644 --- a/folly/test/ConvTest.cpp +++ b/folly/test/ConvTest.cpp @@ -570,6 +570,44 @@ TEST(Conv, StringToBool) { EXPECT_EQ(buf5, sp5.begin()); } +TEST(Conv, NewUint64ToString) { + char buf[21]; + +#define THE_GREAT_EXPECTATIONS(n, len) \ + do { \ + EXPECT_EQ((len), uint64ToBufferUnsafe((n), buf)); \ + buf[(len)] = 0; \ + auto s = string(#n); \ + s = s.substr(0, s.size() - 2); \ + EXPECT_EQ(s, buf); \ + } while (0) + + THE_GREAT_EXPECTATIONS(0UL, 1); + THE_GREAT_EXPECTATIONS(1UL, 1); + THE_GREAT_EXPECTATIONS(12UL, 2); + THE_GREAT_EXPECTATIONS(123UL, 3); + THE_GREAT_EXPECTATIONS(1234UL, 4); + THE_GREAT_EXPECTATIONS(12345UL, 5); + THE_GREAT_EXPECTATIONS(123456UL, 6); + THE_GREAT_EXPECTATIONS(1234567UL, 7); + THE_GREAT_EXPECTATIONS(12345678UL, 8); + THE_GREAT_EXPECTATIONS(123456789UL, 9); + THE_GREAT_EXPECTATIONS(1234567890UL, 10); + THE_GREAT_EXPECTATIONS(12345678901UL, 11); + THE_GREAT_EXPECTATIONS(123456789012UL, 12); + THE_GREAT_EXPECTATIONS(1234567890123UL, 13); + THE_GREAT_EXPECTATIONS(12345678901234UL, 14); + THE_GREAT_EXPECTATIONS(123456789012345UL, 15); + THE_GREAT_EXPECTATIONS(1234567890123456UL, 16); + THE_GREAT_EXPECTATIONS(12345678901234567UL, 17); + THE_GREAT_EXPECTATIONS(123456789012345678UL, 18); + THE_GREAT_EXPECTATIONS(1234567890123456789UL, 19); + THE_GREAT_EXPECTATIONS(18446744073709551614UL, 20); + THE_GREAT_EXPECTATIONS(18446744073709551615UL, 20); + +#undef THE_GREAT_EXPECTATIONS +} + //////////////////////////////////////////////////////////////////////////////// // Benchmarks for ASCII to int conversion //////////////////////////////////////////////////////////////////////////////// @@ -653,11 +691,144 @@ void lexicalCastMeasure(uint n, uint digits) { } } +// Benchmarks for unsigned to string conversion, raw + +unsigned u64ToAsciiTable(uint64_t value, char* dst) { + static const char digits[201] = + "00010203040506070809" + "10111213141516171819" + "20212223242526272829" + "30313233343536373839" + "40414243444546474849" + "50515253545556575859" + "60616263646566676869" + "70717273747576777879" + "80818283848586878889" + "90919293949596979899"; + + uint32_t const length = digits10(value); + uint32_t next = length - 1; + while (value >= 100) { + auto const i = (value % 100) * 2; + value /= 100; + dst[next] = digits[i + 1]; + dst[next - 1] = digits[i]; + next -= 2; + } + // Handle last 1-2 digits + if (value < 10) { + dst[next] = '0' + uint32_t(value); + } else { + auto i = uint32_t(value) * 2; + dst[next] = digits[i + 1]; + dst[next - 1] = digits[i]; + } + return length; +} + +void u64ToAsciiTableBM(uint n, uint64_t value) { + // This is too fast, need to do 10 times per iteration + char buf[20]; + FOR_EACH_RANGE (i, 0, n) { + doNotOptimizeAway(u64ToAsciiTable(value + n, buf)); + } +} + +unsigned u64ToAsciiClassic(uint64_t value, char* dst) { + // Write backwards. + char* next = (char*)dst; + char* start = next; + do { + *next++ = '0' + (value % 10); + value /= 10; + } while (value != 0); + unsigned length = next - start; + + // Reverse in-place. + next--; + while (next > start) { + char swap = *next; + *next = *start; + *start = swap; + next--; + start++; + } + return length; +} + +void u64ToAsciiClassicBM(uint n, uint64_t value) { + // This is too fast, need to do 10 times per iteration + char buf[20]; + FOR_EACH_RANGE (i, 0, n) { + doNotOptimizeAway(u64ToAsciiClassic(value + n, buf)); + } +} + +void u64ToAsciiFollyBM(uint n, uint64_t value) { + // This is too fast, need to do 10 times per iteration + char buf[20]; + FOR_EACH_RANGE (i, 0, n) { + doNotOptimizeAway(uint64ToBufferUnsafe(value + n, buf)); + } +} + +// Benchmark uitoa with string append + +void u2aAppendClassicBM(uint n, uint64_t value) { + string s; + FOR_EACH_RANGE (i, 0, n) { + // auto buf = &s.back() + 1; + char buffer[20]; + s.append(buffer, u64ToAsciiClassic(value, buffer)); + doNotOptimizeAway(s.size()); + } +} + +void u2aAppendFollyBM(uint n, uint64_t value) { + string s; + FOR_EACH_RANGE (i, 0, n) { + // auto buf = &s.back() + 1; + char buffer[20]; + s.append(buffer, uint64ToBufferUnsafe(value, buffer)); + doNotOptimizeAway(s.size()); + } +} + +#define DEFINE_BENCHMARK_GROUP(n) \ + BENCHMARK_PARAM(u64ToAsciiClassicBM, n); \ + BENCHMARK_RELATIVE_PARAM(u64ToAsciiTableBM, n); \ + BENCHMARK_RELATIVE_PARAM(u64ToAsciiFollyBM, n); \ + BENCHMARK_DRAW_LINE(); + +DEFINE_BENCHMARK_GROUP(1); +DEFINE_BENCHMARK_GROUP(12); +DEFINE_BENCHMARK_GROUP(123); +DEFINE_BENCHMARK_GROUP(1234); +DEFINE_BENCHMARK_GROUP(12345); +DEFINE_BENCHMARK_GROUP(123456); +DEFINE_BENCHMARK_GROUP(1234567); +DEFINE_BENCHMARK_GROUP(12345678); +DEFINE_BENCHMARK_GROUP(123456789); +DEFINE_BENCHMARK_GROUP(1234567890); +DEFINE_BENCHMARK_GROUP(12345678901); +DEFINE_BENCHMARK_GROUP(123456789012); +DEFINE_BENCHMARK_GROUP(1234567890123); +DEFINE_BENCHMARK_GROUP(12345678901234); +DEFINE_BENCHMARK_GROUP(123456789012345); +DEFINE_BENCHMARK_GROUP(1234567890123456); +DEFINE_BENCHMARK_GROUP(12345678901234567); +DEFINE_BENCHMARK_GROUP(123456789012345678); +DEFINE_BENCHMARK_GROUP(1234567890123456789); +DEFINE_BENCHMARK_GROUP(12345678901234567890U); + +#undef DEFINE_BENCHMARK_GROUP + #define DEFINE_BENCHMARK_GROUP(n) \ BENCHMARK_PARAM(clibAtoiMeasure, n); \ BENCHMARK_RELATIVE_PARAM(lexicalCastMeasure, n); \ BENCHMARK_RELATIVE_PARAM(handwrittenAtoiMeasure, n); \ - BENCHMARK_RELATIVE_PARAM(follyAtoiMeasure, n); + BENCHMARK_RELATIVE_PARAM(follyAtoiMeasure, n); \ + BENCHMARK_DRAW_LINE(); DEFINE_BENCHMARK_GROUP(1); DEFINE_BENCHMARK_GROUP(2);