2 * Copyright 2014 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include <folly/detail/CacheLocality.h>
22 #include <type_traits>
23 #include <unordered_map>
24 #include <glog/logging.h>
25 #include <gtest/gtest.h>
26 #include <folly/Benchmark.h>
28 using namespace folly::detail;
30 /// These are the relevant nodes from a production box's sysfs tree. If you
31 /// think this map is ugly you should see the version of this test that
32 /// used a real directory tree. To reduce the chance of testing error
33 /// I haven't tried to remove the common prefix.
34 static std::unordered_map<std::string,std::string> fakeSysfsTree = {
35 { "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0,17" },
36 { "/sys/devices/system/cpu/cpu0/cache/index0/type", "Data" },
37 { "/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list", "0,17" },
38 { "/sys/devices/system/cpu/cpu0/cache/index1/type", "Instruction" },
39 { "/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list", "0,17" },
40 { "/sys/devices/system/cpu/cpu0/cache/index2/type", "Unified" },
41 { "/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list", "0-8,17-23" },
42 { "/sys/devices/system/cpu/cpu0/cache/index3/type", "Unified" },
43 { "/sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_list", "1,18" },
44 { "/sys/devices/system/cpu/cpu1/cache/index0/type", "Data" },
45 { "/sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_list", "1,18" },
46 { "/sys/devices/system/cpu/cpu1/cache/index1/type", "Instruction" },
47 { "/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list", "1,18" },
48 { "/sys/devices/system/cpu/cpu1/cache/index2/type", "Unified" },
49 { "/sys/devices/system/cpu/cpu1/cache/index3/shared_cpu_list", "0-8,17-23" },
50 { "/sys/devices/system/cpu/cpu1/cache/index3/type", "Unified" },
51 { "/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list", "2,19" },
52 { "/sys/devices/system/cpu/cpu2/cache/index0/type", "Data" },
53 { "/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list", "2,19" },
54 { "/sys/devices/system/cpu/cpu2/cache/index1/type", "Instruction" },
55 { "/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list", "2,19" },
56 { "/sys/devices/system/cpu/cpu2/cache/index2/type", "Unified" },
57 { "/sys/devices/system/cpu/cpu2/cache/index3/shared_cpu_list", "0-8,17-23" },
58 { "/sys/devices/system/cpu/cpu2/cache/index3/type", "Unified" },
59 { "/sys/devices/system/cpu/cpu3/cache/index0/shared_cpu_list", "3,20" },
60 { "/sys/devices/system/cpu/cpu3/cache/index0/type", "Data" },
61 { "/sys/devices/system/cpu/cpu3/cache/index1/shared_cpu_list", "3,20" },
62 { "/sys/devices/system/cpu/cpu3/cache/index1/type", "Instruction" },
63 { "/sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_list", "3,20" },
64 { "/sys/devices/system/cpu/cpu3/cache/index2/type", "Unified" },
65 { "/sys/devices/system/cpu/cpu3/cache/index3/shared_cpu_list", "0-8,17-23" },
66 { "/sys/devices/system/cpu/cpu3/cache/index3/type", "Unified" },
67 { "/sys/devices/system/cpu/cpu4/cache/index0/shared_cpu_list", "4,21" },
68 { "/sys/devices/system/cpu/cpu4/cache/index0/type", "Data" },
69 { "/sys/devices/system/cpu/cpu4/cache/index1/shared_cpu_list", "4,21" },
70 { "/sys/devices/system/cpu/cpu4/cache/index1/type", "Instruction" },
71 { "/sys/devices/system/cpu/cpu4/cache/index2/shared_cpu_list", "4,21" },
72 { "/sys/devices/system/cpu/cpu4/cache/index2/type", "Unified" },
73 { "/sys/devices/system/cpu/cpu4/cache/index3/shared_cpu_list", "0-8,17-23" },
74 { "/sys/devices/system/cpu/cpu4/cache/index3/type", "Unified" },
75 { "/sys/devices/system/cpu/cpu5/cache/index0/shared_cpu_list", "5-6" },
76 { "/sys/devices/system/cpu/cpu5/cache/index0/type", "Data" },
77 { "/sys/devices/system/cpu/cpu5/cache/index1/shared_cpu_list", "5-6" },
78 { "/sys/devices/system/cpu/cpu5/cache/index1/type", "Instruction" },
79 { "/sys/devices/system/cpu/cpu5/cache/index2/shared_cpu_list", "5-6" },
80 { "/sys/devices/system/cpu/cpu5/cache/index2/type", "Unified" },
81 { "/sys/devices/system/cpu/cpu5/cache/index3/shared_cpu_list", "0-8,17-23" },
82 { "/sys/devices/system/cpu/cpu5/cache/index3/type", "Unified" },
83 { "/sys/devices/system/cpu/cpu6/cache/index0/shared_cpu_list", "5-6" },
84 { "/sys/devices/system/cpu/cpu6/cache/index0/type", "Data" },
85 { "/sys/devices/system/cpu/cpu6/cache/index1/shared_cpu_list", "5-6" },
86 { "/sys/devices/system/cpu/cpu6/cache/index1/type", "Instruction" },
87 { "/sys/devices/system/cpu/cpu6/cache/index2/shared_cpu_list", "5-6" },
88 { "/sys/devices/system/cpu/cpu6/cache/index2/type", "Unified" },
89 { "/sys/devices/system/cpu/cpu6/cache/index3/shared_cpu_list", "0-8,17-23" },
90 { "/sys/devices/system/cpu/cpu6/cache/index3/type", "Unified" },
91 { "/sys/devices/system/cpu/cpu7/cache/index0/shared_cpu_list", "7,22" },
92 { "/sys/devices/system/cpu/cpu7/cache/index0/type", "Data" },
93 { "/sys/devices/system/cpu/cpu7/cache/index1/shared_cpu_list", "7,22" },
94 { "/sys/devices/system/cpu/cpu7/cache/index1/type", "Instruction" },
95 { "/sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_list", "7,22" },
96 { "/sys/devices/system/cpu/cpu7/cache/index2/type", "Unified" },
97 { "/sys/devices/system/cpu/cpu7/cache/index3/shared_cpu_list", "0-8,17-23" },
98 { "/sys/devices/system/cpu/cpu7/cache/index3/type", "Unified" },
99 { "/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list", "8,23" },
100 { "/sys/devices/system/cpu/cpu8/cache/index0/type", "Data" },
101 { "/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list", "8,23" },
102 { "/sys/devices/system/cpu/cpu8/cache/index1/type", "Instruction" },
103 { "/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list", "8,23" },
104 { "/sys/devices/system/cpu/cpu8/cache/index2/type", "Unified" },
105 { "/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list", "0-8,17-23" },
106 { "/sys/devices/system/cpu/cpu8/cache/index3/type", "Unified" },
107 { "/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list", "9,24" },
108 { "/sys/devices/system/cpu/cpu9/cache/index0/type", "Data" },
109 { "/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list", "9,24" },
110 { "/sys/devices/system/cpu/cpu9/cache/index1/type", "Instruction" },
111 { "/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list", "9,24" },
112 { "/sys/devices/system/cpu/cpu9/cache/index2/type", "Unified" },
113 { "/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list", "9-16,24-31" },
114 { "/sys/devices/system/cpu/cpu9/cache/index3/type", "Unified" },
115 { "/sys/devices/system/cpu/cpu10/cache/index0/shared_cpu_list", "10,25" },
116 { "/sys/devices/system/cpu/cpu10/cache/index0/type", "Data" },
117 { "/sys/devices/system/cpu/cpu10/cache/index1/shared_cpu_list", "10,25" },
118 { "/sys/devices/system/cpu/cpu10/cache/index1/type", "Instruction" },
119 { "/sys/devices/system/cpu/cpu10/cache/index2/shared_cpu_list", "10,25" },
120 { "/sys/devices/system/cpu/cpu10/cache/index2/type", "Unified" },
121 { "/sys/devices/system/cpu/cpu10/cache/index3/shared_cpu_list", "9-16,24-31"},
122 { "/sys/devices/system/cpu/cpu10/cache/index3/type", "Unified" },
123 { "/sys/devices/system/cpu/cpu11/cache/index0/shared_cpu_list", "11,26" },
124 { "/sys/devices/system/cpu/cpu11/cache/index0/type", "Data" },
125 { "/sys/devices/system/cpu/cpu11/cache/index1/shared_cpu_list", "11,26" },
126 { "/sys/devices/system/cpu/cpu11/cache/index1/type", "Instruction" },
127 { "/sys/devices/system/cpu/cpu11/cache/index2/shared_cpu_list", "11,26" },
128 { "/sys/devices/system/cpu/cpu11/cache/index2/type", "Unified" },
129 { "/sys/devices/system/cpu/cpu11/cache/index3/shared_cpu_list", "9-16,24-31"},
130 { "/sys/devices/system/cpu/cpu11/cache/index3/type", "Unified" },
131 { "/sys/devices/system/cpu/cpu12/cache/index0/shared_cpu_list", "12,27" },
132 { "/sys/devices/system/cpu/cpu12/cache/index0/type", "Data" },
133 { "/sys/devices/system/cpu/cpu12/cache/index1/shared_cpu_list", "12,27" },
134 { "/sys/devices/system/cpu/cpu12/cache/index1/type", "Instruction" },
135 { "/sys/devices/system/cpu/cpu12/cache/index2/shared_cpu_list", "12,27" },
136 { "/sys/devices/system/cpu/cpu12/cache/index2/type", "Unified" },
137 { "/sys/devices/system/cpu/cpu12/cache/index3/shared_cpu_list", "9-16,24-31"},
138 { "/sys/devices/system/cpu/cpu12/cache/index3/type", "Unified" },
139 { "/sys/devices/system/cpu/cpu13/cache/index0/shared_cpu_list", "13,28" },
140 { "/sys/devices/system/cpu/cpu13/cache/index0/type", "Data" },
141 { "/sys/devices/system/cpu/cpu13/cache/index1/shared_cpu_list", "13,28" },
142 { "/sys/devices/system/cpu/cpu13/cache/index1/type", "Instruction" },
143 { "/sys/devices/system/cpu/cpu13/cache/index2/shared_cpu_list", "13,28" },
144 { "/sys/devices/system/cpu/cpu13/cache/index2/type", "Unified" },
145 { "/sys/devices/system/cpu/cpu13/cache/index3/shared_cpu_list", "9-16,24-31"},
146 { "/sys/devices/system/cpu/cpu13/cache/index3/type", "Unified" },
147 { "/sys/devices/system/cpu/cpu14/cache/index0/shared_cpu_list", "14,29" },
148 { "/sys/devices/system/cpu/cpu14/cache/index0/type", "Data" },
149 { "/sys/devices/system/cpu/cpu14/cache/index1/shared_cpu_list", "14,29" },
150 { "/sys/devices/system/cpu/cpu14/cache/index1/type", "Instruction" },
151 { "/sys/devices/system/cpu/cpu14/cache/index2/shared_cpu_list", "14,29" },
152 { "/sys/devices/system/cpu/cpu14/cache/index2/type", "Unified" },
153 { "/sys/devices/system/cpu/cpu14/cache/index3/shared_cpu_list", "9-16,24-31"},
154 { "/sys/devices/system/cpu/cpu14/cache/index3/type", "Unified" },
155 { "/sys/devices/system/cpu/cpu15/cache/index0/shared_cpu_list", "15,30" },
156 { "/sys/devices/system/cpu/cpu15/cache/index0/type", "Data" },
157 { "/sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_list", "15,30" },
158 { "/sys/devices/system/cpu/cpu15/cache/index1/type", "Instruction" },
159 { "/sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_list", "15,30" },
160 { "/sys/devices/system/cpu/cpu15/cache/index2/type", "Unified" },
161 { "/sys/devices/system/cpu/cpu15/cache/index3/shared_cpu_list", "9-16,24-31"},
162 { "/sys/devices/system/cpu/cpu15/cache/index3/type", "Unified" },
163 { "/sys/devices/system/cpu/cpu16/cache/index0/shared_cpu_list", "16,31" },
164 { "/sys/devices/system/cpu/cpu16/cache/index0/type", "Data" },
165 { "/sys/devices/system/cpu/cpu16/cache/index1/shared_cpu_list", "16,31" },
166 { "/sys/devices/system/cpu/cpu16/cache/index1/type", "Instruction" },
167 { "/sys/devices/system/cpu/cpu16/cache/index2/shared_cpu_list", "16,31" },
168 { "/sys/devices/system/cpu/cpu16/cache/index2/type", "Unified" },
169 { "/sys/devices/system/cpu/cpu16/cache/index3/shared_cpu_list", "9-16,24-31"},
170 { "/sys/devices/system/cpu/cpu16/cache/index3/type", "Unified" },
171 { "/sys/devices/system/cpu/cpu17/cache/index0/shared_cpu_list", "0,17" },
172 { "/sys/devices/system/cpu/cpu17/cache/index0/type", "Data" },
173 { "/sys/devices/system/cpu/cpu17/cache/index1/shared_cpu_list", "0,17" },
174 { "/sys/devices/system/cpu/cpu17/cache/index1/type", "Instruction" },
175 { "/sys/devices/system/cpu/cpu17/cache/index2/shared_cpu_list", "0,17" },
176 { "/sys/devices/system/cpu/cpu17/cache/index2/type", "Unified" },
177 { "/sys/devices/system/cpu/cpu17/cache/index3/shared_cpu_list", "0-8,17-23" },
178 { "/sys/devices/system/cpu/cpu17/cache/index3/type", "Unified" },
179 { "/sys/devices/system/cpu/cpu18/cache/index0/shared_cpu_list", "1,18" },
180 { "/sys/devices/system/cpu/cpu18/cache/index0/type", "Data" },
181 { "/sys/devices/system/cpu/cpu18/cache/index1/shared_cpu_list", "1,18" },
182 { "/sys/devices/system/cpu/cpu18/cache/index1/type", "Instruction" },
183 { "/sys/devices/system/cpu/cpu18/cache/index2/shared_cpu_list", "1,18" },
184 { "/sys/devices/system/cpu/cpu18/cache/index2/type", "Unified" },
185 { "/sys/devices/system/cpu/cpu18/cache/index3/shared_cpu_list", "0-8,17-23" },
186 { "/sys/devices/system/cpu/cpu18/cache/index3/type", "Unified" },
187 { "/sys/devices/system/cpu/cpu19/cache/index0/shared_cpu_list", "2,19" },
188 { "/sys/devices/system/cpu/cpu19/cache/index0/type", "Data" },
189 { "/sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_list", "2,19" },
190 { "/sys/devices/system/cpu/cpu19/cache/index1/type", "Instruction" },
191 { "/sys/devices/system/cpu/cpu19/cache/index2/shared_cpu_list", "2,19" },
192 { "/sys/devices/system/cpu/cpu19/cache/index2/type", "Unified" },
193 { "/sys/devices/system/cpu/cpu19/cache/index3/shared_cpu_list", "0-8,17-23" },
194 { "/sys/devices/system/cpu/cpu19/cache/index3/type", "Unified" },
195 { "/sys/devices/system/cpu/cpu20/cache/index0/shared_cpu_list", "3,20" },
196 { "/sys/devices/system/cpu/cpu20/cache/index0/type", "Data" },
197 { "/sys/devices/system/cpu/cpu20/cache/index1/shared_cpu_list", "3,20" },
198 { "/sys/devices/system/cpu/cpu20/cache/index1/type", "Instruction" },
199 { "/sys/devices/system/cpu/cpu20/cache/index2/shared_cpu_list", "3,20" },
200 { "/sys/devices/system/cpu/cpu20/cache/index2/type", "Unified" },
201 { "/sys/devices/system/cpu/cpu20/cache/index3/shared_cpu_list", "0-8,17-23" },
202 { "/sys/devices/system/cpu/cpu20/cache/index3/type", "Unified" },
203 { "/sys/devices/system/cpu/cpu21/cache/index0/shared_cpu_list", "4,21" },
204 { "/sys/devices/system/cpu/cpu21/cache/index0/type", "Data" },
205 { "/sys/devices/system/cpu/cpu21/cache/index1/shared_cpu_list", "4,21" },
206 { "/sys/devices/system/cpu/cpu21/cache/index1/type", "Instruction" },
207 { "/sys/devices/system/cpu/cpu21/cache/index2/shared_cpu_list", "4,21" },
208 { "/sys/devices/system/cpu/cpu21/cache/index2/type", "Unified" },
209 { "/sys/devices/system/cpu/cpu21/cache/index3/shared_cpu_list", "0-8,17-23" },
210 { "/sys/devices/system/cpu/cpu21/cache/index3/type", "Unified" },
211 { "/sys/devices/system/cpu/cpu22/cache/index0/shared_cpu_list", "7,22" },
212 { "/sys/devices/system/cpu/cpu22/cache/index0/type", "Data" },
213 { "/sys/devices/system/cpu/cpu22/cache/index1/shared_cpu_list", "7,22" },
214 { "/sys/devices/system/cpu/cpu22/cache/index1/type", "Instruction" },
215 { "/sys/devices/system/cpu/cpu22/cache/index2/shared_cpu_list", "7,22" },
216 { "/sys/devices/system/cpu/cpu22/cache/index2/type", "Unified" },
217 { "/sys/devices/system/cpu/cpu22/cache/index3/shared_cpu_list", "0-8,17-23" },
218 { "/sys/devices/system/cpu/cpu22/cache/index3/type", "Unified" },
219 { "/sys/devices/system/cpu/cpu23/cache/index0/shared_cpu_list", "8,23" },
220 { "/sys/devices/system/cpu/cpu23/cache/index0/type", "Data" },
221 { "/sys/devices/system/cpu/cpu23/cache/index1/shared_cpu_list", "8,23" },
222 { "/sys/devices/system/cpu/cpu23/cache/index1/type", "Instruction" },
223 { "/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_list", "8,23" },
224 { "/sys/devices/system/cpu/cpu23/cache/index2/type", "Unified" },
225 { "/sys/devices/system/cpu/cpu23/cache/index3/shared_cpu_list", "0-8,17-23" },
226 { "/sys/devices/system/cpu/cpu23/cache/index3/type", "Unified" },
227 { "/sys/devices/system/cpu/cpu24/cache/index0/shared_cpu_list", "9,24" },
228 { "/sys/devices/system/cpu/cpu24/cache/index0/type", "Data" },
229 { "/sys/devices/system/cpu/cpu24/cache/index1/shared_cpu_list", "9,24" },
230 { "/sys/devices/system/cpu/cpu24/cache/index1/type", "Instruction" },
231 { "/sys/devices/system/cpu/cpu24/cache/index2/shared_cpu_list", "9,24" },
232 { "/sys/devices/system/cpu/cpu24/cache/index2/type", "Unified" },
233 { "/sys/devices/system/cpu/cpu24/cache/index3/shared_cpu_list", "9-16,24-31"},
234 { "/sys/devices/system/cpu/cpu24/cache/index3/type", "Unified" },
235 { "/sys/devices/system/cpu/cpu25/cache/index0/shared_cpu_list", "10,25" },
236 { "/sys/devices/system/cpu/cpu25/cache/index0/type", "Data" },
237 { "/sys/devices/system/cpu/cpu25/cache/index1/shared_cpu_list", "10,25" },
238 { "/sys/devices/system/cpu/cpu25/cache/index1/type", "Instruction" },
239 { "/sys/devices/system/cpu/cpu25/cache/index2/shared_cpu_list", "10,25" },
240 { "/sys/devices/system/cpu/cpu25/cache/index2/type", "Unified" },
241 { "/sys/devices/system/cpu/cpu25/cache/index3/shared_cpu_list", "9-16,24-31"},
242 { "/sys/devices/system/cpu/cpu25/cache/index3/type", "Unified" },
243 { "/sys/devices/system/cpu/cpu26/cache/index0/shared_cpu_list", "11,26" },
244 { "/sys/devices/system/cpu/cpu26/cache/index0/type", "Data" },
245 { "/sys/devices/system/cpu/cpu26/cache/index1/shared_cpu_list", "11,26" },
246 { "/sys/devices/system/cpu/cpu26/cache/index1/type", "Instruction" },
247 { "/sys/devices/system/cpu/cpu26/cache/index2/shared_cpu_list", "11,26" },
248 { "/sys/devices/system/cpu/cpu26/cache/index2/type", "Unified" },
249 { "/sys/devices/system/cpu/cpu26/cache/index3/shared_cpu_list", "9-16,24-31"},
250 { "/sys/devices/system/cpu/cpu26/cache/index3/type", "Unified" },
251 { "/sys/devices/system/cpu/cpu27/cache/index0/shared_cpu_list", "12,27" },
252 { "/sys/devices/system/cpu/cpu27/cache/index0/type", "Data" },
253 { "/sys/devices/system/cpu/cpu27/cache/index1/shared_cpu_list", "12,27" },
254 { "/sys/devices/system/cpu/cpu27/cache/index1/type", "Instruction" },
255 { "/sys/devices/system/cpu/cpu27/cache/index2/shared_cpu_list", "12,27" },
256 { "/sys/devices/system/cpu/cpu27/cache/index2/type", "Unified" },
257 { "/sys/devices/system/cpu/cpu27/cache/index3/shared_cpu_list", "9-16,24-31"},
258 { "/sys/devices/system/cpu/cpu27/cache/index3/type", "Unified" },
259 { "/sys/devices/system/cpu/cpu28/cache/index0/shared_cpu_list", "13,28" },
260 { "/sys/devices/system/cpu/cpu28/cache/index0/type", "Data" },
261 { "/sys/devices/system/cpu/cpu28/cache/index1/shared_cpu_list", "13,28" },
262 { "/sys/devices/system/cpu/cpu28/cache/index1/type", "Instruction" },
263 { "/sys/devices/system/cpu/cpu28/cache/index2/shared_cpu_list", "13,28" },
264 { "/sys/devices/system/cpu/cpu28/cache/index2/type", "Unified" },
265 { "/sys/devices/system/cpu/cpu28/cache/index3/shared_cpu_list", "9-16,24-31"},
266 { "/sys/devices/system/cpu/cpu28/cache/index3/type", "Unified" },
267 { "/sys/devices/system/cpu/cpu29/cache/index0/shared_cpu_list", "14,29" },
268 { "/sys/devices/system/cpu/cpu29/cache/index0/type", "Data" },
269 { "/sys/devices/system/cpu/cpu29/cache/index1/shared_cpu_list", "14,29" },
270 { "/sys/devices/system/cpu/cpu29/cache/index1/type", "Instruction" },
271 { "/sys/devices/system/cpu/cpu29/cache/index2/shared_cpu_list", "14,29" },
272 { "/sys/devices/system/cpu/cpu29/cache/index2/type", "Unified" },
273 { "/sys/devices/system/cpu/cpu29/cache/index3/shared_cpu_list", "9-16,24-31"},
274 { "/sys/devices/system/cpu/cpu29/cache/index3/type", "Unified" },
275 { "/sys/devices/system/cpu/cpu30/cache/index0/shared_cpu_list", "15,30" },
276 { "/sys/devices/system/cpu/cpu30/cache/index0/type", "Data" },
277 { "/sys/devices/system/cpu/cpu30/cache/index1/shared_cpu_list", "15,30" },
278 { "/sys/devices/system/cpu/cpu30/cache/index1/type", "Instruction" },
279 { "/sys/devices/system/cpu/cpu30/cache/index2/shared_cpu_list", "15,30" },
280 { "/sys/devices/system/cpu/cpu30/cache/index2/type", "Unified" },
281 { "/sys/devices/system/cpu/cpu30/cache/index3/shared_cpu_list", "9-16,24-31"},
282 { "/sys/devices/system/cpu/cpu30/cache/index3/type", "Unified" },
283 { "/sys/devices/system/cpu/cpu31/cache/index0/shared_cpu_list", "16,31" },
284 { "/sys/devices/system/cpu/cpu31/cache/index0/type", "Data" },
285 { "/sys/devices/system/cpu/cpu31/cache/index1/shared_cpu_list", "16,31" },
286 { "/sys/devices/system/cpu/cpu31/cache/index1/type", "Instruction" },
287 { "/sys/devices/system/cpu/cpu31/cache/index2/shared_cpu_list", "16,31" },
288 { "/sys/devices/system/cpu/cpu31/cache/index2/type", "Unified" },
289 { "/sys/devices/system/cpu/cpu31/cache/index3/shared_cpu_list", "9-16,24-31"},
290 { "/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified" }
293 /// This is the expected CacheLocality structure for fakeSysfsTree
294 static const CacheLocality nonUniformExampleLocality = {
297 { 0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28,
298 30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 }
// Feeds fakeSysfsTree to CacheLocality::readFromSysfsTree through a lookup
// lambda and checks that the parsed numCpus, numCachesByLevel and
// localityIndexByCpu equal the corresponding fields of
// nonUniformExampleLocality.
301 TEST(CacheLocality, FakeSysfs) {
302 auto parsed = CacheLocality::readFromSysfsTree([](std::string name) {
303 auto iter = fakeSysfsTree.find(name);
// a path that is not present in the fake tree reads back as an empty string
304 return iter == fakeSysfsTree.end() ? std::string() : iter->second;
307 auto& expected = nonUniformExampleLocality;
308 EXPECT_EQ(expected.numCpus, parsed.numCpus);
309 EXPECT_EQ(expected.numCachesByLevel, parsed.numCachesByLevel);
310 EXPECT_EQ(expected.localityIndexByCpu, parsed.localityIndexByCpu);
// Invokes the function returned by Getcpu::vdsoFunc() and checks that the
// cpu number it reports is below CPU_SETSIZE.
// NOTE(review): the returned function is called without a null check;
// confirm vdsoFunc() cannot return nullptr on the platforms this test runs on.
313 TEST(Getcpu, VdsoGetcpu) {
315 Getcpu::vdsoFunc()(&cpu, nullptr, nullptr);
317 EXPECT_TRUE(cpu < CPU_SETSIZE);
// Calls SequentialThreadId<std::atomic>::getcpu twice from the same thread:
// the first reported id must be greater than zero, and the second call must
// report the same id as the first.
320 TEST(SequentialThreadId, Simple) {
322 auto rv = SequentialThreadId<std::atomic>::getcpu(&cpu, nullptr, nullptr);
324 EXPECT_TRUE(cpu > 0);
326 SequentialThreadId<std::atomic>::getcpu(&again, nullptr, nullptr);
327 EXPECT_EQ(cpu, again);
// Cpu number that the AccessSpreader tests assign before calling current()
// on a spreader that was built with &testingGetcpu.
// NOTE(review): FOLLY_TLS presumably makes this per-thread — confirm.
330 static FOLLY_TLS unsigned testingCpu = 0;
// Stand-in with a getcpu-style signature that the tests hand to
// AccessSpreader. Both out-pointers are optional: each one is guarded by a
// null check before it is used.
// NOTE(review): presumably reports testingCpu through *cpu — the
// assignments are not visible in this view, confirm.
332 static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) {
333 if (cpu != nullptr) {
336 if (node != nullptr) {
// Builds one AccessSpreader per stripe count s in [1, 100) on top of
// nonUniformExampleLocality and the testingGetcpu stub, then walks the 32
// cpus in locality order and expects the i-th cpu to map to stripe
// (i * s) / 32 for every s.
342 TEST(AccessSpreader, Stubbed) {
// slot 0 is left empty so that the index equals the stripe count
343 std::vector<std::unique_ptr<AccessSpreader<>>> spreaders(100);
344 for (size_t s = 1; s < spreaders.size(); ++s) {
345 spreaders[s].reset(new AccessSpreader<>(
346 s, nonUniformExampleLocality, &testingGetcpu));
348 std::vector<size_t> cpusInLocalityOrder = {
349 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 6, 7, 22, 8, 23, 9, 24, 10, 25,
350 11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31 };
351 for (size_t i = 0; i < 32; ++i) {
352 // extra i * 64 is to check wrapping behavior of impl
353 testingCpu = cpusInLocalityOrder[i] + i * 64;
354 for (size_t s = 1; s < spreaders.size(); ++s) {
355 EXPECT_EQ((i * s) / 32, spreaders[s]->current())
356 << "i=" << i << ", cpu=" << testingCpu << ", s=" << s;
// A spreader constructed with only a stripe count (16) must report a
// current() stripe below that count.
361 TEST(AccessSpreader, Default) {
362 AccessSpreader<> spreader(16);
363 EXPECT_LT(spreader.current(), 16);
// For every stripe count s in [1, 200), the instance returned by
// AccessSpreader<>::shared(s) must report a current() stripe below s.
366 TEST(AccessSpreader, Shared) {
367 for (size_t s = 1; s < 200; ++s) {
368 EXPECT_LT(AccessSpreader<>::shared(s).current(), s);
// Logs numStripes() of the static stripeByCore and stripeByChip members, then
// checks that the static AccessSpreader<>::current(s) stays below s for
// every s in [1, 200).
372 TEST(AccessSpreader, Statics) {
373 LOG(INFO) << "stripeByCore.numStripes() = "
374 << AccessSpreader<>::stripeByCore.numStripes();
375 LOG(INFO) << "stripeByChip.numStripes() = "
376 << AccessSpreader<>::stripeByChip.numStripes();
377 for (size_t s = 1; s < 200; ++s) {
378 EXPECT_LT(AccessSpreader<>::current(s), s);
// For every stripe count s in [1, 200) and every c in [0, 400), samples
// spreader.current() twice on a uniform locality: `observed` is read first,
// `expected` is read after testingCpu has been set to c % numCpus, and the
// two stripes must be equal.
// NOTE(review): presumably testingCpu holds the unreduced c when `observed`
// is taken — that assignment is not visible in this view, confirm.
382 TEST(AccessSpreader, Wrapping) {
383 // this test won't pass unless locality.numCpus divides kMaxCpus
385 auto locality = CacheLocality::uniform(numCpus);
386 for (size_t s = 1; s < 200; ++s) {
387 AccessSpreader<> spreader(s, locality, &testingGetcpu);
388 for (size_t c = 0; c < 400; ++c) {
390 auto observed = spreader.current();
391 testingCpu = c % numCpus;
392 auto expected = spreader.current();
393 EXPECT_EQ(expected, observed)
394 << "numCpus=" << numCpus << ", s=" << s << ", c=" << c;
399 // Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with
401 // ============================================================================
402 // folly/test/CacheLocalityTest.cpp relative time/iter iters/s
403 // ============================================================================
404 // LocalAccessSpreaderUse 20.77ns 48.16M
405 // SharedAccessSpreaderUse 21.95ns 45.55M
406 // AccessSpreaderConstruction 466.56ns 2.14M
407 // ============================================================================
// Measures current() on a locally constructed 16-stripe spreader. The
// spreader is built after the BenchmarkSuspender is created.
// NOTE(review): presumably the suspender is dismissed before the loop so
// that only the loop is timed — that statement is not visible here, confirm.
409 BENCHMARK(LocalAccessSpreaderUse, iters) {
410 folly::BenchmarkSuspender braces;
411 AccessSpreader<> spreader(16);
414 for (unsigned long i = 0; i < iters; ++i) {
415 auto x = spreader.current();
// hand the result to doNotOptimizeAway so the call is not elided
416 folly::doNotOptimizeAway(x);
// Measures the static AccessSpreader<>::current(16) entry point, with no
// locally constructed spreader.
420 BENCHMARK(SharedAccessSpreaderUse, iters) {
421 for (unsigned long i = 0; i < iters; ++i) {
422 auto x = AccessSpreader<>::current(16);
423 folly::doNotOptimizeAway(x);
// Measures constructing a 16-stripe AccessSpreader. Each iteration
// placement-constructs the object into the same suitably sized and aligned
// buffer and then runs its destructor explicitly, so no heap allocation for
// the object itself is part of the loop.
427 BENCHMARK(AccessSpreaderConstruction, iters) {
428 std::aligned_storage<sizeof(AccessSpreader<>),
429 std::alignment_of<AccessSpreader<>>::value>::type raw;
430 for (unsigned long i = 0; i < iters; ++i) {
431 auto x = new (&raw) AccessSpreader<>(16);
432 folly::doNotOptimizeAway(x);
// placement new needs a matching explicit destructor call
433 x->~AccessSpreader();
// Selects how contentionAtWidth picks a stripe: SHARED calls the static
// AccessSpreader<>::current(stripes) inside the hot loop; TLS_RR builds the
// local spreader with SequentialThreadId<std::atomic>::getcpu; any other
// value (GETCPU) passes nullptr as the local spreader's getcpu function.
437 enum class SpreaderType { GETCPU, SHARED, TLS_RR };
439 // Benchmark scores here reflect the time for 32 threads to perform an
440 // atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly,
441 // if we don't separate the counters onto unique 128 byte stripes the
442 // 1_stripe and 2_stripe results are identical, even though the L3 is
443 // claimed to have 64 byte cache lines.
445 // _stub means there was no call to getcpu or the tls round-robin
446 // implementation, because for a single stripe the cpu doesn't matter.
447 // _getcpu refers to the vdso getcpu implementation with a locally
448 // constructed AccessSpreader. _tls_rr refers to execution using
449 // SequentialThreadId, the fallback if the vdso getcpu isn't available.
450 // _shared refers to calling AccessSpreader<>::current(numStripes)
451 // inside the hot loop.
453 // At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
454 // so since the stripe selection is 21 nanos, the atomic increment in
455 // the L1 is ~15 nanos. At width 8_stripe_0_work the line is expected
456 // to ping-pong almost every operation, since the loops have the same
457 // duration. Widths 4 and 2 have the same behavior, but each tour of the
458 // cache line is 4 and 8 cores long, respectively. These all suggest a
459 // lower bound of 60 nanos for intra-chip handoff and increment between
462 // With 455 nanos (1K cycles) of busywork per contended increment, the
463 // system can hide all of the latency of a tour of length 4, but not
464 // quite one of length 8. I was a bit surprised at how much worse the
465 // non-striped version got. It seems that the inter-chip traffic also
466 // interferes with the L1-only localWork.load(). When the local work is
467 // doubled to about 1 microsecond we see that the inter-chip contention
468 // is still very important, but subdivisions on the same chip don't matter.
471 // _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
472 // ============================================================================
473 // folly/test/CacheLocalityTest.cpp relative time/iter iters/s
474 // ============================================================================
475 // contentionAtWidth(1_stripe_0_work_stub) 1.14us 873.64K
476 // contentionAtWidth(2_stripe_0_work_getcpu) 495.58ns 2.02M
477 // contentionAtWidth(4_stripe_0_work_getcpu) 232.99ns 4.29M
478 // contentionAtWidth(8_stripe_0_work_getcpu) 101.16ns 9.88M
479 // contentionAtWidth(16_stripe_0_work_getcpu) 41.93ns 23.85M
480 // contentionAtWidth(32_stripe_0_work_getcpu) 42.04ns 23.79M
481 // contentionAtWidth(64_stripe_0_work_getcpu) 41.94ns 23.84M
482 // contentionAtWidth(2_stripe_0_work_tls_rr) 1.00us 997.41K
483 // contentionAtWidth(4_stripe_0_work_tls_rr) 694.41ns 1.44M
484 // contentionAtWidth(8_stripe_0_work_tls_rr) 590.27ns 1.69M
485 // contentionAtWidth(16_stripe_0_work_tls_rr) 222.13ns 4.50M
486 // contentionAtWidth(32_stripe_0_work_tls_rr) 169.49ns 5.90M
487 // contentionAtWidth(64_stripe_0_work_tls_rr) 162.20ns 6.17M
488 // contentionAtWidth(2_stripe_0_work_shared) 495.54ns 2.02M
489 // contentionAtWidth(4_stripe_0_work_shared) 236.27ns 4.23M
490 // contentionAtWidth(8_stripe_0_work_shared) 114.81ns 8.71M
491 // contentionAtWidth(16_stripe_0_work_shared) 44.65ns 22.40M
492 // contentionAtWidth(32_stripe_0_work_shared) 41.76ns 23.94M
493 // contentionAtWidth(64_stripe_0_work_shared) 43.47ns 23.00M
494 // atomicIncrBaseline(local_incr_0_work) 20.39ns 49.06M
495 // ----------------------------------------------------------------------------
496 // contentionAtWidth(1_stripe_500_work_stub) 2.04us 491.13K
497 // contentionAtWidth(2_stripe_500_work_getcpu) 610.98ns 1.64M
498 // contentionAtWidth(4_stripe_500_work_getcpu) 507.72ns 1.97M
499 // contentionAtWidth(8_stripe_500_work_getcpu) 542.53ns 1.84M
500 // contentionAtWidth(16_stripe_500_work_getcpu) 496.55ns 2.01M
501 // contentionAtWidth(32_stripe_500_work_getcpu) 500.67ns 2.00M
502 // atomicIncrBaseline(local_incr_500_work) 484.69ns 2.06M
503 // ----------------------------------------------------------------------------
504 // contentionAtWidth(1_stripe_1000_work_stub) 2.11us 473.78K
505 // contentionAtWidth(2_stripe_1000_work_getcpu) 970.64ns 1.03M
506 // contentionAtWidth(4_stripe_1000_work_getcpu) 987.31ns 1.01M
507 // contentionAtWidth(8_stripe_1000_work_getcpu) 1.01us 985.52K
508 // contentionAtWidth(16_stripe_1000_work_getcpu) 986.09ns 1.01M
509 // contentionAtWidth(32_stripe_1000_work_getcpu) 960.23ns 1.04M
510 // atomicIncrBaseline(local_incr_1000_work) 950.63ns 1.05M
511 // ============================================================================
// Benchmark body: numThreads worker threads each perform `iters` atomic
// increments on one of `stripes` counters, running an inner loop of `work`
// iterations after every increment.
//
//   iters            - number of increments performed by each worker
//   stripes          - number of counters the increments are spread over
//   work             - trip count of the loop run after each increment
//   spreaderType     - how the stripe is chosen (see SpreaderType)
//   counterAlignment - distance in bytes between consecutive counters inside
//                      the raw buffer; must be at least
//                      sizeof(std::atomic<size_t>) (asserted below)
//   numThreads       - number of worker threads
512 static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
513 SpreaderType spreaderType,
514 size_t counterAlignment = 128,
515 size_t numThreads = 32) {
516 folly::BenchmarkSuspender braces;
518 AccessSpreader<> spreader(
520 CacheLocality::system<std::atomic>(),
521 spreaderType == SpreaderType::TLS_RR
522 ? SequentialThreadId<std::atomic>::getcpu : nullptr);
524 std::atomic<size_t> ready(0);
525 std::atomic<bool> go(false);
527 // while in theory the cache line size is 64 bytes, experiments show
528 // that we get contention on 128 byte boundaries for Ivy Bridge. The
529 // extra indirection adds 1 or 2 nanos
530 assert(counterAlignment >= sizeof(std::atomic<size_t>));
// backing storage: one counterAlignment-sized slot per stripe
// NOTE(review): slots are spaced from raw.data(); nothing visible here
// aligns raw.data() itself to counterAlignment — confirm that is acceptable.
531 std::vector<char> raw(counterAlignment * stripes);
533 // if we happen to be using the tlsRoundRobin, then sequentially
534 // assigning the thread identifiers is the unlikely best-case scenario.
535 // We don't want to unfairly benefit or penalize. Computing the exact
536 // maximum likelihood of the probability distributions is annoying, so
537 // I approximate as 2/5 of the ids that have no threads, 2/5 that have
538 // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
539 // wrapping back to slot 0 when we hit 1/15 and 1/5.
541 std::vector<std::thread> threads;
542 while (threads.size() < numThreads) {
543 threads.push_back(std::thread([&,iters,stripes,work]() {
// NOTE(review): the array bound is a runtime value, i.e. a variable-length
// array, which is a compiler extension rather than standard C++.
544 std::atomic<size_t>* counters[stripes];
545 for (size_t i = 0; i < stripes; ++i) {
// NOTE(review): every worker placement-constructs the counters at the same
// addresses inside the shared raw buffer.
547 = new (raw.data() + counterAlignment * i) std::atomic<size_t>();
555 std::atomic<int> localWork(0);
// SHARED: pick the stripe via the static AccessSpreader<>::current(stripes)
556 if (spreaderType == SpreaderType::SHARED) {
557 for (size_t i = iters; i > 0; --i) {
558 ++*(counters[AccessSpreader<>::current(stripes)]);
559 for (size_t j = work; j > 0; --j) {
// second hot loop: pick the stripe via the locally constructed spreader
564 for (size_t i = iters; i > 0; --i) {
565 ++*(counters[spreader.current()]);
566 for (size_t j = work; j > 0; --j) {
573 if (threads.size() == numThreads / 15 ||
574 threads.size() == numThreads / 5) {
575 // create a few dummy threads to wrap back around to 0 mod numCpus
576 for (size_t i = threads.size(); i != numThreads; ++i) {
// loop until the `ready` count has reached numThreads
584 while (ready < numThreads) {
590 for (auto& thr : threads) {
// Baseline for contentionAtWidth: each of the numThreads threads declares
// its own localCounter and localWork atomics inside the thread body, so no
// counter is shared between threads.
//
//   iters      - trip count of each thread's outer loop
//   work       - trip count of the inner loop run on every outer iteration
//   numThreads - number of threads started
595 static void atomicIncrBaseline(size_t iters, size_t work,
596 size_t numThreads = 32) {
597 folly::BenchmarkSuspender braces;
599 std::atomic<bool> go(false);
601 std::vector<std::thread> threads;
602 while (threads.size() < numThreads) {
603 threads.push_back(std::thread([&]() {
607 std::atomic<size_t> localCounter(0);
608 std::atomic<int> localWork(0);
609 for (size_t i = iters; i > 0; --i) {
611 for (size_t j = work; j > 0; --j) {
621 for (auto& thr : threads) {
626 BENCHMARK_DRAW_LINE()
// Each contentionAtWidth case is named <stripes>_stripe_<work>_work_<variant>;
// the arguments after the name are contentionAtWidth's stripes, work and
// spreaderType parameters, in that order.
628 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_0_work_stub,
629 1, 0, SpreaderType::GETCPU)
630 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_getcpu,
631 2, 0, SpreaderType::GETCPU)
632 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_getcpu,
633 4, 0, SpreaderType::GETCPU)
634 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_getcpu,
635 8, 0, SpreaderType::GETCPU)
636 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_getcpu,
637 16, 0, SpreaderType::GETCPU)
638 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_getcpu,
639 32, 0, SpreaderType::GETCPU)
640 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_getcpu,
641 64, 0, SpreaderType::GETCPU)
642 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_tls_rr,
643 2, 0, SpreaderType::TLS_RR)
644 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_tls_rr,
645 4, 0, SpreaderType::TLS_RR)
646 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_tls_rr,
647 8, 0, SpreaderType::TLS_RR)
648 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_tls_rr,
649 16, 0, SpreaderType::TLS_RR)
650 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
651 32, 0, SpreaderType::TLS_RR)
652 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
653 64, 0, SpreaderType::TLS_RR)
654 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
655 2, 0, SpreaderType::SHARED)
656 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
657 4, 0, SpreaderType::SHARED)
658 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_shared,
659 8, 0, SpreaderType::SHARED)
660 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_shared,
661 16, 0, SpreaderType::SHARED)
662 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_shared,
663 32, 0, SpreaderType::SHARED)
664 BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_shared,
665 64, 0, SpreaderType::SHARED)
666 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
667 BENCHMARK_DRAW_LINE()
668 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_500_work_stub,
669 1, 500, SpreaderType::GETCPU)
670 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_500_work_getcpu,
671 2, 500, SpreaderType::GETCPU)
672 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_500_work_getcpu,
673 4, 500, SpreaderType::GETCPU)
674 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_500_work_getcpu,
675 8, 500, SpreaderType::GETCPU)
676 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_500_work_getcpu,
677 16, 500, SpreaderType::GETCPU)
678 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_500_work_getcpu,
679 32, 500, SpreaderType::GETCPU)
680 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
681 BENCHMARK_DRAW_LINE()
682 BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_1000_work_stub,
683 1, 1000, SpreaderType::GETCPU)
684 BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_1000_work_getcpu,
685 2, 1000, SpreaderType::GETCPU)
686 BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_1000_work_getcpu,
687 4, 1000, SpreaderType::GETCPU)
688 BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_1000_work_getcpu,
689 8, 1000, SpreaderType::GETCPU)
690 BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_1000_work_getcpu,
691 16, 1000, SpreaderType::GETCPU)
692 BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_1000_work_getcpu,
693 32, 1000, SpreaderType::GETCPU)
694 BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
// Entry point: initializes googletest, parses the gflags command line, and
// runs every test. The benchmarks run only when all tests passed
// (RUN_ALL_TESTS() returned 0) and FLAGS_benchmark is set.
697 int main(int argc, char** argv) {
698 testing::InitGoogleTest(&argc, argv);
699 gflags::ParseCommandLineFlags(&argc, &argv, true);
700 auto ret = RUN_ALL_TESTS();
701 if (!ret && FLAGS_benchmark) {
702 folly::runBenchmarks();