From: Dave Watson
Date: Tue, 6 Jun 2017 21:06:53 +0000 (-0700)
Subject: Core-local allocator
X-Git-Tag: v2017.06.12.00~31
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=5ba3126fb76f1d81100b34e429c79cd21f8cd142;p=folly.git

Core-local allocator

Summary: Adds a core-local allocator to CacheLocality. Multiple objects that
use cache locality may share the same allocator and allocate objects smaller
than a cache line without incurring additional false-sharing overhead.

Reviewed By: nbronson, ot

Differential Revision: D5139886

fbshipit-source-id: a9804662d6339829a12e0791f418dabd9678f1bf
---

diff --git a/folly/concurrency/CoreCachedSharedPtr.h b/folly/concurrency/CoreCachedSharedPtr.h
index 7f5b9a95..594050b2 100644
--- a/folly/concurrency/CoreCachedSharedPtr.h
+++ b/folly/concurrency/CoreCachedSharedPtr.h
@@ -42,9 +42,13 @@ class CoreCachedSharedPtr {
   }
 
   void reset(const std::shared_ptr<T>& p = nullptr) {
-    for (auto& slot : slots_) {
-      auto holder = std::make_shared<Holder>(p);
-      slot = std::shared_ptr<T>(holder, p.get());
+    // Allocate each Holder in a different CoreAllocator stripe to
+    // prevent false sharing. Their control blocks will be adjacent
+    // thanks to allocate_shared().
+    for (auto slot : folly::enumerate(slots_)) {
+      auto alloc = detail::getCoreAllocatorStl<Holder, kNumSlots>(slot.index);
+      auto holder = std::allocate_shared<Holder>(alloc, p);
+      *slot = std::shared_ptr<T>(holder, p.get());
     }
   }
 
@@ -53,17 +57,11 @@
   }
 
  private:
+  using Holder = std::shared_ptr<T>;
+
   template <class, size_t>
   friend class CoreCachedWeakPtr;
 
-  // Space the Holders by a cache line, so their control blocks (which
-  // are adjacent to the slots thanks to make_shared()) will also be
-  // spaced.
-  struct FOLLY_ALIGN_TO_AVOID_FALSE_SHARING Holder {
-    explicit Holder(std::shared_ptr<T> p) : ptr(std::move(p)) {}
-    std::shared_ptr<T> ptr;
-  };
-
   std::array<std::shared_ptr<T>, kNumSlots> slots_;
 };
 
diff --git a/folly/detail/CacheLocality.cpp b/folly/detail/CacheLocality.cpp
index 09da2871..d646ebe9 100644
--- a/folly/detail/CacheLocality.cpp
+++ b/folly/detail/CacheLocality.cpp
@@ -238,5 +238,38 @@ template struct SequentialThreadId<std::atomic>;
 /////////////// AccessSpreader
 template struct AccessSpreader<std::atomic>;
 
+SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz)
+    : allocSize_{allocSize}, sz_(sz) {}
+
+SimpleAllocator::~SimpleAllocator() {
+  std::lock_guard<std::mutex> g(m_);
+  for (auto& block : blocks_) {
+    aligned_free(block);
+  }
+}
+
+void* SimpleAllocator::allocateHard() {
+  // Allocate a new slab.
+  mem_ = static_cast<uint8_t*>(aligned_malloc(allocSize_, allocSize_));
+  if (!mem_) {
+    std::__throw_bad_alloc();
+  }
+  end_ = mem_ + allocSize_;
+  blocks_.push_back(mem_);
+
+  // Install a pointer to ourselves as the allocator.
+  *reinterpret_cast<SimpleAllocator**>(mem_) = this;
+  static_assert(
+      alignof(std::max_align_t) >= sizeof(SimpleAllocator*),
+      "alignment too small");
+  mem_ += std::min(sz_, alignof(std::max_align_t));
+
+  // New allocation.
+  auto mem = mem_;
+  mem_ += sz_;
+  assert(intptr_t(mem) % 128 != 0);
+  return mem;
+}
+
 } // namespace detail
 } // namespace folly

diff --git a/folly/detail/CacheLocality.h b/folly/detail/CacheLocality.h
index 617182d5..b6dd66e7 100644
--- a/folly/detail/CacheLocality.h
+++ b/folly/detail/CacheLocality.h
@@ -17,18 +17,23 @@
 #pragma once
 
 #include <algorithm>
+#include <array>
 #include <atomic>
 #include <cassert>
 #include <functional>
 #include <limits>
+#include <mutex>
 #include <string>
 #include <type_traits>
+#include <unordered_map>
 #include <vector>
 
 #include <folly/Hash.h>
 #include <folly/Likely.h>
+#include <folly/Memory.h>
 #include <folly/Portability.h>
 #include <folly/ThreadId.h>
+#include <folly/portability/Memory.h>
 
 namespace folly {
 namespace detail {
@@ -352,5 +357,151 @@ bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize();
 // instantiated in CacheLocality.cpp
 extern template struct AccessSpreader<std::atomic>;
 
+/**
+ * A simple freelist allocator. Allocates things of size sz, from
+ * slabs of size allocSize. Takes a lock on each
+ * allocation/deallocation.
+ */
+class SimpleAllocator {
+  std::mutex m_;
+  uint8_t* mem_{nullptr};
+  uint8_t* end_{nullptr};
+  void* freelist_{nullptr};
+  size_t allocSize_;
+  size_t sz_;
+  std::vector<void*> blocks_;
+
+ public:
+  SimpleAllocator(size_t allocSize, size_t sz);
+  ~SimpleAllocator();
+  void* allocateHard();
+
+  // Inline fast-paths.
+  void* allocate() {
+    std::lock_guard<std::mutex> g(m_);
+    // Freelist allocation.
+    if (freelist_) {
+      auto mem = freelist_;
+      freelist_ = *static_cast<void**>(freelist_);
+      return mem;
+    }
+
+    // Bump-ptr allocation.
+    if (intptr_t(mem_) % 128 == 0) {
+      // Avoid allocating pointers that may look like malloc
+      // pointers.
+      mem_ += std::min(sz_, alignof(std::max_align_t));
+    }
+    if (mem_ && (mem_ + sz_ <= end_)) {
+      auto mem = mem_;
+      mem_ += sz_;
+
+      assert(intptr_t(mem) % 128 != 0);
+      return mem;
+    }
+
+    return allocateHard();
+  }
+  void deallocate(void* mem) {
+    std::lock_guard<std::mutex> g(m_);
+    *static_cast<void**>(mem) = freelist_;
+    freelist_ = mem;
+  }
+};
+
+/**
+ * An allocator that can be used with CacheLocality to allocate
+ * core-local memory.
+ *
+ * There is actually nothing special about the memory itself (it is
+ * not bound to NUMA nodes or anything), but the allocator guarantees
+ * that memory allocated from a given stripe will only come from cache
+ * lines also allocated to that stripe. This means multiple things
+ * using CacheLocality can allocate memory in smaller-than-cacheline
+ * increments, and be assured that it won't cause more false sharing
+ * than it otherwise would.
+ *
+ * Note that allocation and deallocation take a per-sizeclass lock.
+ */
+template <size_t Stripes>
+class CoreAllocator {
+ public:
+  class Allocator {
+    static constexpr size_t AllocSize{4096};
+
+    uint8_t sizeClass(size_t size) {
+      if (size <= 8) {
+        return 0;
+      } else if (size <= 16) {
+        return 1;
+      } else if (size <= 32) {
+        return 2;
+      } else if (size <= 64) {
+        return 3;
+      } else { // punt to malloc.
+        return 4;
+      }
+    }
+
+    std::array<SimpleAllocator, 4> allocators_{
+        {{AllocSize, 8}, {AllocSize, 16}, {AllocSize, 32}, {AllocSize, 64}}};
+
+   public:
+    void* allocate(size_t size) {
+      auto cl = sizeClass(size);
+      if (cl == 4) {
+        static_assert(
+            CacheLocality::kFalseSharingRange == 128,
+            "kFalseSharingRange changed");
+        // Align to a cacheline.
+        size = size + (CacheLocality::kFalseSharingRange - 1);
+        size &= ~size_t(CacheLocality::kFalseSharingRange - 1);
+        void* mem = aligned_malloc(size, CacheLocality::kFalseSharingRange);
+        if (!mem) {
+          std::__throw_bad_alloc();
+        }
+        return mem;
+      }
+      return allocators_[cl].allocate();
+    }
+    void deallocate(void* mem) {
+      if (!mem) {
+        return;
+      }
+
+      // See if it came from this allocator or malloc.
+      if (intptr_t(mem) % 128 != 0) {
+        auto addr =
+            reinterpret_cast<void*>(intptr_t(mem) & ~intptr_t(AllocSize - 1));
+        auto allocator = *static_cast<SimpleAllocator**>(addr);
+        allocator->deallocate(mem);
+      } else {
+        aligned_free(mem);
+      }
+    }
+  };
+
+  Allocator* get(size_t stripe) {
+    assert(stripe < Stripes);
+    return &allocators_[stripe];
+  }
+
+ private:
+  Allocator allocators_[Stripes];
+};
+
+template <size_t Stripes>
+typename CoreAllocator<Stripes>::Allocator* getCoreAllocator(size_t stripe) {
+  static CoreAllocator<Stripes> allocator;
+  return allocator.get(stripe);
+}
+
+template <typename T, size_t Stripes>
+StlAllocator<typename CoreAllocator<Stripes>::Allocator, T> getCoreAllocatorStl(
+    size_t stripe) {
+  auto alloc = getCoreAllocator<Stripes>(stripe);
+  return StlAllocator<typename CoreAllocator<Stripes>::Allocator, T>(alloc);
+}
+
 } // namespace detail
 } // namespace folly

diff --git a/folly/test/CacheLocalityTest.cpp b/folly/test/CacheLocalityTest.cpp
index 5ecd0eb1..cb18f14c 100644
--- a/folly/test/CacheLocalityTest.cpp
+++ b/folly/test/CacheLocalityTest.cpp
@@ -444,4 +444,33 @@ TEST(AccessSpreader, Wrapping) {
     }
   }
 }
+
+TEST(CoreAllocator, Basic) {
+  CoreAllocator<32> alloc;
+  auto a = alloc.get(0);
+  auto res = a->allocate(8);
+  memset(res, 0, 8);
+  a->deallocate(res);
+  res = a->allocate(8);
+  EXPECT_TRUE((intptr_t)res % 8 == 0); // check alignment
+  memset(res, 0, 8);
+  a->deallocate(res);
+  res = a->allocate(12);
+  EXPECT_TRUE((intptr_t)res % 16 == 0); // check alignment
+  memset(res, 0, 12);
+  a->deallocate(res);
+  res = a->allocate(257);
+  memset(res, 0, 257);
+  a->deallocate(res);
+
+  std::vector<void*> mems;
+  for (int i = 0; i < 10000; i++) {
+    mems.push_back(a->allocate(1));
+  }
+  for (auto& mem : mems) {
+    a->deallocate(mem);
+  }
+  mems.clear();
+}
+
 #endif
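
A usage sketch for code outside CoreCachedSharedPtr: pick a stripe with
AccessSpreader, then allocate from that stripe's CoreAllocator. This is a
minimal sketch, not code from the patch; kStripes, counters, and the function
names are illustrative, and it assumes the
AccessSpreader<std::atomic>::current(numStripes) entry point that
CacheLocality.cpp instantiates above.

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <new>

#include <folly/detail/CacheLocality.h>

using namespace folly::detail;

constexpr size_t kStripes = 32;

// One 8-byte counter per stripe. Each counter is much smaller than a
// cache line, but since it comes from its own stripe's allocator it
// never shares a cache line with another stripe's allocations.
std::atomic<uint64_t>* counters[kStripes];

void initCounters() {
  for (size_t i = 0; i < kStripes; ++i) {
    void* mem =
        getCoreAllocator<kStripes>(i)->allocate(sizeof(std::atomic<uint64_t>));
    counters[i] = new (mem) std::atomic<uint64_t>(0);
  }
}

void bumpLocalCounter() {
  // AccessSpreader maps the calling thread's CPU onto one of kStripes
  // stripes, so threads running near each other hit the same counter.
  auto stripe = AccessSpreader<std::atomic>::current(kStripes);
  counters[stripe]->fetch_add(1, std::memory_order_relaxed);
}

void destroyCounters() {
  for (size_t i = 0; i < kStripes; ++i) {
    counters[i]->~atomic();
    getCoreAllocator<kStripes>(i)->deallocate(counters[i]);
  }
}

Deallocation can be routed back to the owning SimpleAllocator because the slab
paths never hand out 128-byte-aligned pointers (the slab header holding the
SimpleAllocator* occupies the aligned start of each 4KB slab), while the
size-class-4 fallback always returns 128-byte-aligned memory from
aligned_malloc. Masking a non-aligned pointer down to the 4KB slab boundary
therefore recovers the owning allocator, as CoreAllocator::deallocate() does
above.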
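
For shared_ptr control blocks and containers, the STL adapter is the more
convenient entry point. The sketch below (again illustrative; Stats and
makePerStripeStats are invented names) mirrors what CoreCachedSharedPtr::reset()
does in the patch: allocate_shared() places the control block and the object in
a single allocation, and getCoreAllocatorStl() makes that allocation come from
the requested stripe.

#include <array>
#include <cstddef>
#include <cstdint>
#include <memory>

#include <folly/detail/CacheLocality.h>

constexpr size_t kStripes = 32;

struct Stats {
  uint64_t hits{0};
  uint64_t misses{0};
};

// One shared_ptr<Stats> per stripe. The combined control block plus
// Stats object is typically well under 64 bytes, so it is served by the
// stripe's SimpleAllocator size classes; anything larger falls back to
// the cacheline-aligned malloc path.
std::array<std::shared_ptr<Stats>, kStripes> makePerStripeStats() {
  std::array<std::shared_ptr<Stats>, kStripes> result;
  for (size_t i = 0; i < kStripes; ++i) {
    auto alloc = folly::detail::getCoreAllocatorStl<Stats, kStripes>(i);
    result[i] = std::allocate_shared<Stats>(alloc);
  }
  return result;
}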