From be4c6d6b3e21914df8a0207ef4f716a09a648745 Mon Sep 17 00:00:00 2001 From: Giuseppe Ottaviano Date: Mon, 3 Oct 2016 11:58:07 -0700 Subject: [PATCH] Optimize frequently inlined FBString methods Summary: Almost every method of `fbstring` needs to perform category dispatching. The category constants are `size_t`, which become 8-byte immediate values in the dispatching code, so even a simple `if (category() == Category::isSmall)` is quite large. When inlined hundreds of thousands of time, it adds up. This diff redefines the category type to be 1 byte (without changing the ABI). It also optimizes `size()` and `c_str()` and makes them branch-free, which probably is not going to have any perf impact but it saves a few bytes. Generated code for some small functions: - `reset()` Before: ``` 48 ba 00 00 00 00 00 movabs $0x1700000000000000,%rdx 00 00 17 48 89 f8 mov %rdi,%rax c6 07 00 movb $0x0,(%rdi) 48 89 57 10 mov %rdx,0x10(%rdi) ``` 20 bytes After: ``` 48 89 f8 mov %rdi,%rax c6 47 17 17 movb $0x17,0x17(%rdi) c6 07 00 movb $0x0,(%rdi) ``` 10 bytes - `c_str()` Before: ``` 48 b8 00 00 00 00 00 movabs $0xc000000000000000,%rax 00 00 c0 48 85 47 10 test %rax,0x10(%rdi) 74 08 je 401fd8 48 8b 07 mov (%rdi),%rax c3 retq 0f 1f 40 00 nopl 0x0(%rax) 48 89 f8 mov %rdi,%rax ``` 26 bytes (without the `retq`) After: ``` f6 47 17 c0 testb $0xc0,0x17(%rdi) 48 89 f8 mov %rdi,%rax 48 0f 45 07 cmovne (%rdi),%rax ``` 11 bytes - `size()` Before: ``` 48 b8 00 00 00 00 00 movabs $0xc000000000000000,%rax 00 00 c0 48 85 47 10 test %rax,0x10(%rdi) 74 08 je 401fa8 48 8b 47 08 mov 0x8(%rdi),%rax c3 retq 0f 1f 00 nopl (%rax) 48 0f be 57 17 movsbq 0x17(%rdi),%rdx b8 17 00 00 00 mov $0x17,%eax 48 29 d0 sub %rdx,%rax ``` 36 bytes (without the `retq`) After: ``` 0f b6 57 17 movzbl 0x17(%rdi),%edx b8 17 00 00 00 mov $0x17,%eax 48 29 d0 sub %rdx,%rax 48 0f 48 47 08 cmovs 0x8(%rdi),%rax ``` 17 bytes Reviewed By: philippv Differential Revision: D3957276 fbshipit-source-id: ef40d82bbbb0456b1044421cd02133c268abe39b --- folly/FBString.h | 77 ++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/folly/FBString.h b/folly/FBString.h index c7698103..6730d115 100644 --- a/folly/FBString.h +++ b/folly/FBString.h @@ -426,15 +426,11 @@ public: fbstring_detail::assume_unreachable(); } - const Char * c_str() const { - auto const c = category(); - if (c == Category::isSmall) { - FBSTRING_ASSERT(small_[smallSize()] == '\0'); - return small_; - } - FBSTRING_ASSERT(c == Category::isMedium || c == Category::isLarge); - FBSTRING_ASSERT(ml_.data_[ml_.size_] == '\0'); - return ml_.data_; + const Char* c_str() const { + const Char* ptr = ml_.data_; + // With this syntax, GCC and Clang generate a CMOV instead of a branch. + ptr = (category() == Category::isSmall) ? small_ : ptr; + return ptr; } void shrink(const size_t delta) { @@ -475,7 +471,19 @@ public: } size_t size() const { - return category() == Category::isSmall ? smallSize() : ml_.size_; + size_t ret = ml_.size_; + /* static */ if (kIsLittleEndian) { + // We can save a couple instructions, because the category is + // small iff the last char, as unsigned, is <= maxSmallSize. + typedef typename std::make_unsigned::type UChar; + auto maybeSmallSize = size_t(maxSmallSize) - + size_t(static_cast(small_[maxSmallSize])); + // With this syntax, GCC and Clang generate a CMOV instead of a branch. + ret = (static_cast(maybeSmallSize) >= 0) ? maybeSmallSize : ret; + } else { + ret = (category() == Category::isSmall) ? smallSize() : ret; + } + return ret; } size_t capacity() const { @@ -502,14 +510,8 @@ private: // Disabled fbstring_core & operator=(const fbstring_core & rhs); - // Equivalent to setSmallSize(0) but a few ns faster in - // microbenchmarks. void reset() { - ml_.capacity_ = kIsLittleEndian - ? maxSmallSize << (8 * (sizeof(size_t) - sizeof(Char))) - : maxSmallSize << 2; - small_[0] = '\0'; - FBSTRING_ASSERT(category() == Category::isSmall && size() == 0); + setSmallSize(0); } struct RefCounted { @@ -579,22 +581,17 @@ private: } }; - typedef std::conditional::type - category_type; + typedef uint8_t category_type; enum class Category : category_type { isSmall = 0, - isMedium = kIsLittleEndian - ? sizeof(size_t) == 4 ? 0x80000000 : 0x8000000000000000 - : 0x2, - isLarge = kIsLittleEndian - ? sizeof(size_t) == 4 ? 0x40000000 : 0x4000000000000000 - : 0x1, + isMedium = kIsLittleEndian ? 0x80 : 0x2, + isLarge = kIsLittleEndian ? 0x40 : 0x1, }; Category category() const { // works for both big-endian and little-endian - return static_cast(ml_.capacity_ & categoryExtractMask); + return static_cast(bytes_[lastChar] & categoryExtractMask); } struct MediumLarge { @@ -609,29 +606,27 @@ private: } void setCapacity(size_t cap, Category cat) { - capacity_ = kIsLittleEndian - ? cap | static_cast(cat) - : (cap << 2) | static_cast(cat); + capacity_ = kIsLittleEndian + ? cap | (static_cast(cat) << kCategoryShift) + : (cap << 2) | static_cast(cat); } }; union { + uint8_t bytes_[sizeof(MediumLarge)]; // For accessing the last byte. Char small_[sizeof(MediumLarge) / sizeof(Char)]; MediumLarge ml_; }; - enum : size_t { - lastChar = sizeof(MediumLarge) - 1, - maxSmallSize = lastChar / sizeof(Char), - maxMediumSize = 254 / sizeof(Char), // coincides with the small - // bin size in dlmalloc - categoryExtractMask = kIsLittleEndian - ? sizeof(size_t) == 4 ? 0xC0000000 : size_t(0xC000000000000000) - : 0x3, - capacityExtractMask = kIsLittleEndian - ? ~categoryExtractMask - : 0x0 /*unused*/, - }; + constexpr static size_t lastChar = sizeof(MediumLarge) - 1; + constexpr static size_t maxSmallSize = lastChar / sizeof(Char); + constexpr static size_t maxMediumSize = 254 / sizeof(Char); + constexpr static uint8_t categoryExtractMask = kIsLittleEndian ? 0xC0 : 0x3; + constexpr static size_t kCategoryShift = (sizeof(size_t) - 1) * 8; + constexpr static size_t capacityExtractMask = kIsLittleEndian + ? ~(size_t(categoryExtractMask) << kCategoryShift) + : 0x0 /* unused */; + static_assert(!(sizeof(MediumLarge) % sizeof(Char)), "Corrupt memory layout for fbstring."); -- 2.34.1