From 433e4a6ef8e2eb9d040c699f0a393850374f89a6 Mon Sep 17 00:00:00 2001
From: Todd Lipcon
Date: Fri, 21 Apr 2017 16:01:47 -0700
Subject: [PATCH] urcu: add cacheline padding to URCU thread data

The thread-local data was previously smaller than a cacheline, so it
was possible for multiple threads' thread data to be allocated on the
same cacheline and cause false sharing.

I measured the offcore_response.all_data_rd.l3_hit.hitm_other_core perf
counter while 16 threads executed a read-only workload against a
URCU-protected split-list set. Before this change I saw about 5.6M
events per second; after it, essentially none. Additionally, 'perf c2c'
showed heavy cache-line bouncing in the URCU read-side critical section
before the change, and none after.

I also benchmarked the same read-only workload with 88 threads on a
machine with 88 logical cores (Intel(R) Xeon(R) CPU E5-2699 v4 @
2.20GHz); performance improved by more than 2x.

Fixes issue #75
---
 cds/urcu/details/gp_decl.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cds/urcu/details/gp_decl.h b/cds/urcu/details/gp_decl.h
index 52592c8a..dcfe0a22 100644
--- a/cds/urcu/details/gp_decl.h
+++ b/cds/urcu/details/gp_decl.h
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include

 //@cond
 namespace cds { namespace urcu { namespace details {
@@ -45,6 +46,7 @@ namespace cds { namespace urcu { namespace details {
     template <> struct thread_data { \
         atomics::atomic m_nAccessControl ; \
         thread_list_record< thread_data > m_list ; \
+        char pad_[cds::c_nCacheLineSize]; \
         thread_data(): m_nAccessControl(0) {} \
         explicit thread_data( OS::ThreadId owner ): m_nAccessControl(0), m_list(owner) {} \
         ~thread_data() {} \
--
2.34.1
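
Note (not part of the patch itself): the fix uses a standard false-sharing
mitigation, padding each per-thread structure out to a full cache line so
that no two threads' hot counters can occupy the same line. The minimal
standalone C++ sketch below illustrates the idea; PaddedThreadData and
kCacheLineSize are illustrative stand-ins for the patch's thread_data and
cds::c_nCacheLineSize, not names taken from libcds.

    #include <atomic>
    #include <cstddef>

    // Assumed cache-line size; the patch uses cds::c_nCacheLineSize instead.
    constexpr std::size_t kCacheLineSize = 64;

    // Without the pad_ member this struct would be only a few bytes, so
    // several threads' instances could be packed into one cache line and
    // every read-side update would bounce that line between cores.
    struct PaddedThreadData {
        std::atomic<unsigned> access_control{0};   // hot read-side counter
        char pad_[kCacheLineSize];                 // padding, as in the patch
    };

    static_assert(sizeof(PaddedThreadData) >= kCacheLineSize,
                  "each instance now spans at least one full cache line");

An alignas(kCacheLineSize) specifier on the struct is another common way to
achieve a similar effect; the patch takes the explicit-padding approach.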