From 8063649a1af220a9cbd12ae46cd01ffaa4adb323 Mon Sep 17 00:00:00 2001 From: Nicholas Ormrod Date: Tue, 29 Mar 2016 09:28:51 -0700 Subject: [PATCH] Make FBVector faster Summary:Per https://github.com/facebook/folly/issues/379, github user pmalek determined that fbvector's insert function was a lot slower than std::vector's. While insert is often imagined to be a second-class vector function (if you're inserting in the middle, don't use a vector!), it is the correct function to use when inserting multiple elements at the back of an array (since the standard lacks an ##append(multiple-elements)## function). The code, therefore, required optimization. There are three things that were slowing down fbvector: - Excessive asserts. The fbvector code contains internal asserts that ensure proper calling of private methods. These are mostly unnecessary, given the extensive test suite that batters fbvector, in addition to fbvector's battle tested history. I have removed these. - Internal data checks. When inserting into a vector, the elements being inserted may not reference the contents of the vector in question. If the client supplies internal references, then the standard allows for undefined behavior. An old design decision for fbvector was to be forgiving of this error; we check for internal data before performing internal data moves. This is expensive. Further, it is unnecessary when the insertion point is at the end of the vector (which is the valid case we are optimizing) or if the vector is being reallocated. I've rearchitected the insert macros to only perform the internal-data-check when it is absolutely necessary. While rearchitecting the checks, I also reorganized the n==0 base case checking. - The 'window' function is pretty expensive when called with the end() iterator. It is effectively a no-op, but the function (and some of its called functions) are too big to inline, so the call overhead is non-trivial. I've put window calls behind a conditional to save this overhead. I added a benchmark test to FBVectorTestBenchmark.cpp.h, but the particular pattern in that file caused the results to be completely optimized away, voiding the test. Oh well. Running the github benchmarks show a 4x speedup for inserting a single element on the back, bringing it quite close to that of std::vector. The multi-insert function got a bit faster, though the exact number of iterations seems to be having an effect on the numbers (running the tests 10,000 times, intead of only 1,000 times, yields results that are 90% as fast as std::vector, instead of 75%). Both of these cases are now looking quite a bit better. I resurrected the old StlVectorTest suite, since I was mucking with the complicated insert code. The suite had one latent compilation error (caught with newer compiler warnings - fortunately this particular case was benign). The test suite caught an error in shrink_to_fit, where a clean-slate optimization for empty vectors (see D2696314) that trampled the vector's allocator. I have fixed that small bug in this diff, too. Reviewed By: ot Differential Revision: D3105962 fb-gh-sync-id: ac9fa9d7ca79335cf0ff6bb9010af1ed8bd7916b fbshipit-source-id: ac9fa9d7ca79335cf0ff6bb9010af1ed8bd7916b --- folly/FBVector.h | 61 +++++++++++++++----------- folly/test/stl_tests/StlVectorTest.cpp | 2 +- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/folly/FBVector.h b/folly/FBVector.h index 39668ed4..c44defef 100644 --- a/folly/FBVector.h +++ b/folly/FBVector.h @@ -164,8 +164,7 @@ private: } } - void - set(pointer newB, size_type newSize, size_type newCap) { + void set(pointer newB, size_type newSize, size_type newCap) { z_ = newB + newCap; e_ = newB + newSize; b_ = newB; @@ -967,8 +966,7 @@ public: void shrink_to_fit() noexcept { if (empty()) { - // Just skip reallocation. - *this = fbvector(); + impl_.reset(); return; } @@ -1261,7 +1259,7 @@ private: // we have the private section first because it defines some macros // These three functions, make_window, wrap_frame, and // insert_use_fresh_memory, can be combined into a uniform interface. // Since that interface involves a lot of case-work, it is built into - // some macros: FOLLY_FBVECTOR_INSERT_(START|TRY|END) + // some macros: FOLLY_FBVECTOR_INSERT_(PRE|START|TRY|END) // Macros are used in an attempt to let GCC perform better optimizations, // especially control flow optimization. // @@ -1270,10 +1268,6 @@ private: // we have the private section first because it defines some macros // window void make_window(iterator position, size_type n) { - assert(isValid(position)); - assert(size() + n <= capacity()); - assert(n != 0); - // The result is guaranteed to be non-negative, so use an unsigned type: size_type tail = std::distance(position, impl_.e_); @@ -1327,8 +1321,8 @@ private: // we have the private section first because it defines some macros //--------------------------------------------------------------------------- // use fresh? - bool insert_use_fresh(const_iterator cposition, size_type n) { - if (cposition == cend()) { + bool insert_use_fresh(bool at_end, size_type n) { + if (at_end) { if (size() + n <= capacity()) return false; if (reserve_in_place(size() + n)) return false; return true; @@ -1342,19 +1336,33 @@ private: // we have the private section first because it defines some macros //--------------------------------------------------------------------------- // interface + #define FOLLY_FBVECTOR_INSERT_PRE(cpos, n) \ + if (n == 0) return (iterator)cpos; \ + bool at_end = cpos == cend(); \ + bool fresh = insert_use_fresh(at_end, n); \ + if (!at_end) { \ + if (!fresh) { + + // check for internal data (technically not required by the standard) + #define FOLLY_FBVECTOR_INSERT_START(cpos, n) \ - assert(isValid(cpos)); \ + } \ + assert(isValid(cpos)); \ + } \ T* position = const_cast(cpos); \ size_type idx = std::distance(impl_.b_, position); \ - bool fresh = insert_use_fresh(position, n); \ T* b; \ - size_type newCap = 0; \ + size_type newCap; /* intentionally uninitialized */ \ \ if (fresh) { \ newCap = computeInsertCapacity(n); \ b = M_allocate(newCap); \ } else { \ - make_window(position, n); \ + if (!at_end) { \ + make_window(position, n); \ + } else { \ + impl_.e_ += n; \ + } \ b = impl_.b_; \ } \ \ @@ -1369,7 +1377,11 @@ private: // we have the private section first because it defines some macros if (fresh) { \ M_deallocate(b, newCap); \ } else { \ - undo_window(position, n); \ + if (!at_end) { \ + undo_window(position, n); \ + } else { \ + impl_.e_ -= n; \ + } \ } \ throw; \ } \ @@ -1399,6 +1411,7 @@ public: template iterator emplace(const_iterator cpos, Args&&... args) { + FOLLY_FBVECTOR_INSERT_PRE(cpos, 1) FOLLY_FBVECTOR_INSERT_START(cpos, 1) M_construct(start, std::forward(args)...); FOLLY_FBVECTOR_INSERT_TRY(cpos, 1) @@ -1407,8 +1420,8 @@ public: } iterator insert(const_iterator cpos, const T& value) { - if (dataIsInternal(value)) return insert(cpos, T(value)); - + FOLLY_FBVECTOR_INSERT_PRE(cpos, 1) + if (dataIsInternal(value)) return insert(cpos, T(value)); FOLLY_FBVECTOR_INSERT_START(cpos, 1) M_construct(start, value); FOLLY_FBVECTOR_INSERT_TRY(cpos, 1) @@ -1417,8 +1430,8 @@ public: } iterator insert(const_iterator cpos, T&& value) { - if (dataIsInternal(value)) return insert(cpos, T(std::move(value))); - + FOLLY_FBVECTOR_INSERT_PRE(cpos, 1) + if (dataIsInternal(value)) return insert(cpos, T(std::move(value))); FOLLY_FBVECTOR_INSERT_START(cpos, 1) M_construct(start, std::move(value)); FOLLY_FBVECTOR_INSERT_TRY(cpos, 1) @@ -1427,9 +1440,8 @@ public: } iterator insert(const_iterator cpos, size_type n, VT value) { - if (n == 0) return (iterator)cpos; - if (dataIsInternalAndNotVT(value)) return insert(cpos, n, T(value)); - + FOLLY_FBVECTOR_INSERT_PRE(cpos, n) + if (dataIsInternalAndNotVT(value)) return insert(cpos, n, T(value)); FOLLY_FBVECTOR_INSERT_START(cpos, n) D_uninitialized_fill_n_a(start, n, value); FOLLY_FBVECTOR_INSERT_TRY(cpos, n) @@ -1455,8 +1467,7 @@ private: iterator insert(const_iterator cpos, FIt first, FIt last, std::forward_iterator_tag) { size_type n = std::distance(first, last); - if (n == 0) return (iterator)cpos; - + FOLLY_FBVECTOR_INSERT_PRE(cpos, n) FOLLY_FBVECTOR_INSERT_START(cpos, n) D_uninitialized_copy_a(start, first, last); FOLLY_FBVECTOR_INSERT_TRY(cpos, n) diff --git a/folly/test/stl_tests/StlVectorTest.cpp b/folly/test/stl_tests/StlVectorTest.cpp index 75744094..1b3d6164 100644 --- a/folly/test/stl_tests/StlVectorTest.cpp +++ b/folly/test/stl_tests/StlVectorTest.cpp @@ -590,7 +590,7 @@ struct Alloc : AllocTracker, Ticker { std::allocator a; int id; - explicit Alloc(int i = 8) : a(a), id(i) {} + explicit Alloc(int i = 8) : a(), id(i) {} Alloc(const Alloc& o) : a(o.a), id(o.id) {} Alloc(Alloc&& o) : a(move(o.a)), id(o.id) {} Alloc& operator=(const Alloc&) = default; -- 2.34.1