From 433e4a6ef8e2eb9d040c699f0a393850374f89a6 Mon Sep 17 00:00:00 2001
From: Todd Lipcon
Date: Fri, 21 Apr 2017 16:01:47 -0700
Subject: [PATCH] urcu: add cacheline padding to URCU thread data

The thread-local data was previously smaller than a cacheline, so it
was possible for multiple threads' thread data to be allocated on the
same cacheline and cause false sharing.

I measured the offcore_response.all_data_rd.l3_hit.hitm_other_core perf
counter while 16 threads executed a read-only workload against a
URCU-protected split-list set. Before this change I saw about 5.6M
events per second; after it, essentially none. Additionally, 'perf c2c'
showed heavy cache-line bouncing in the URCU read-side critical section
before the change, and none after.

I also benchmarked the same read-only workload with 88 threads on a
machine with 88 logical cores (Intel(R) Xeon(R) CPU E5-2699 v4 @
2.20GHz); performance improved by more than 2x.

Fixes issue #75
---
 cds/urcu/details/gp_decl.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cds/urcu/details/gp_decl.h b/cds/urcu/details/gp_decl.h
index 52592c8a..dcfe0a22 100644
--- a/cds/urcu/details/gp_decl.h
+++ b/cds/urcu/details/gp_decl.h
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include

 //@cond
 namespace cds { namespace urcu { namespace details {
@@ -45,6 +46,7 @@ namespace cds { namespace urcu { namespace details {
     template <> struct thread_data { \
         atomics::atomic m_nAccessControl ; \
         thread_list_record< thread_data > m_list ; \
+        char pad_[cds::c_nCacheLineSize]; \
         thread_data(): m_nAccessControl(0) {} \
         explicit thread_data( OS::ThreadId owner ): m_nAccessControl(0), m_list(owner) {} \
         ~thread_data() {} \
--
2.34.1
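
Note (not part of the patch itself): the fix uses a standard false-sharing
mitigation, padding each per-thread structure out to a full cache line so
that no two threads' hot counters can occupy the same line. The minimal
standalone C++ sketch below illustrates the idea; PaddedThreadData and
kCacheLineSize are illustrative stand-ins for the patch's thread_data and
cds::c_nCacheLineSize, not names taken from libcds.

    #include <atomic>
    #include <cstddef>

    // Assumed cache-line size; the patch uses cds::c_nCacheLineSize instead.
    constexpr std::size_t kCacheLineSize = 64;

    // Without the pad_ member this struct would be only a few bytes, so
    // several threads' instances could be packed into one cache line and
    // every read-side update would bounce that line between cores.
    struct PaddedThreadData {
        std::atomic<unsigned> access_control{0};   // hot read-side counter
        char pad_[kCacheLineSize];                 // padding, as in the patch
    };

    static_assert(sizeof(PaddedThreadData) >= kCacheLineSize,
                  "each instance now spans at least one full cache line");

An alignas(kCacheLineSize) specifier on the struct is another common way to
achieve a similar effect; the patch takes the explicit-padding approach.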