2 * Copyright 2016 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 // @author: Xin Liu <xliux@fb.com>
23 #include <folly/Benchmark.h>
24 #include <folly/ConcurrentSkipList.h>
25 #include <folly/Hash.h>
26 #include <folly/RWSpinLock.h>
27 #include <folly/portability/GFlags.h>
28 #include <glog/logging.h>
30 DEFINE_int32(num_threads, 12, "num concurrent threads to test");
32 // In some case, we may want to test worker threads operating on multiple
33 // lists. For example in search, not all threads are visiting the same posting
34 // list, but for the ones with some popular terms, they do get multiple
35 // visitors at the same time.
36 DEFINE_int32(num_sets, 1, "num of set to operate on");
38 static const int kInitHeadHeight = 10;
39 static const int kMaxValue = 0x1000000;
43 using namespace folly;
45 typedef int ValueType;
46 typedef ConcurrentSkipList<ValueType> SkipListType;
47 typedef SkipListType::Accessor SkipListAccessor;
48 typedef std::set<ValueType> SetType;
50 static std::vector<ValueType> gData;
51 static void initData() {
52 gData.resize(kMaxValue);
53 for (int i = 0; i < kMaxValue; ++i) {
56 std::random_shuffle(gData.begin(), gData.end());
59 // single thread benchmarks
60 void BM_IterateOverSet(int iters, int size) {
65 for (int i = 0; i < size; ++i) {
66 a_set.insert(gData[rand() % kMaxValue]);
71 auto iter = a_set.begin();
72 for (int i = 0; i < iters; ++i) {
74 if (iter == a_set.end()) iter = a_set.begin();
77 // VLOG(20) << "sum = " << sum;
81 void BM_IterateSkipList(int iters, int size) {
82 BenchmarkSuspender susp;
84 auto skipList = SkipListType::create(kInitHeadHeight);
85 for (int i = 0; i < size; ++i) {
86 skipList.add(rand() % kMaxValue);
91 auto iter = skipList.begin();
92 for (int i = 0; i < iters; ++i) {
94 if (iter == skipList.end()) iter = skipList.begin();
98 // VLOG(20) << "sum = " << sum;
102 void BM_SetMerge(int iters, int size) {
103 BenchmarkSuspender susp;
106 for (int i = 0; i < iters; ++i) {
107 a_set.insert(rand() % kMaxValue);
109 for (int i = 0; i < size; ++i) {
110 b_set.insert(rand() % kMaxValue);
114 int64_t mergedSum = 0;
115 FOR_EACH(it, a_set) {
116 if (b_set.find(*it) != b_set.end()) mergedSum += *it;
119 // VLOG(20) << mergedSum;
123 void BM_CSLMergeLookup(int iters, int size) {
124 BenchmarkSuspender susp;
125 auto skipList = SkipListType::create(kInitHeadHeight);
126 auto skipList2 = SkipListType::create(kInitHeadHeight);
128 for (int i = 0; i < iters; ++i) {
129 skipList.add(rand() % kMaxValue);
131 for (int i = 0; i < size; ++i) {
132 skipList2.add(rand() % kMaxValue);
134 int64_t mergedSum = 0;
137 SkipListType::Skipper skipper(skipList2);
138 FOR_EACH(it, skipList) {
139 if (skipper.to(*it)) mergedSum += *it;
143 // VLOG(20) << mergedSum;
147 // merge by two skippers
148 void BM_CSLMergeIntersection(int iters, int size) {
149 BenchmarkSuspender susp;
150 auto skipList = SkipListType::create(kInitHeadHeight);
151 auto skipList2 = SkipListType::create(kInitHeadHeight);
152 for (int i = 0; i < iters; ++i) {
153 skipList.add(rand() % kMaxValue);
155 for (int i = 0; i < size; ++i) {
156 skipList2.add(rand() % kMaxValue);
160 SkipListType::Skipper s1(skipList);
161 SkipListType::Skipper s2(skipList2);
163 int64_t mergedSum = 0;
165 while (s1.good() && s2.good()) {
170 } else if (v1 > v2) {
180 // VLOG(20) << mergedSum;
184 void BM_SetContainsNotFound(int iters, int size) {
185 BenchmarkSuspender susp;
187 CHECK_LT(size, kMaxValue);
188 for (int i = 0; i < size; ++i) {
194 for (int i = 0; i < iters; ++i) {
195 sum += (aset.end() == aset.find(2 * i + 1));
203 void BM_SetContainsFound(int iters, int size) {
204 BenchmarkSuspender susp;
206 CHECK_LT(size, kMaxValue);
208 for (int i = 0; i < size; ++i) {
212 std::vector<int> values;
213 for (int i = 0; i < iters; ++i) {
214 values.push_back(rand() % size);
219 for (int i = 0; i < iters; ++i) {
220 sum += (aset.end() == aset.find(values[i]));
228 void BM_CSLContainsFound(int iters, int size) {
229 BenchmarkSuspender susp;
230 auto skipList = SkipListType::create(kInitHeadHeight);
231 CHECK_LT(size, kMaxValue);
233 for (int i = 0; i < size; ++i) {
236 std::vector<int> values;
237 for (int i = 0; i < iters; ++i) {
238 values.push_back(rand() % size);
243 for (int i = 0; i < iters; ++i) {
244 sum += skipList.contains(values[i]);
252 void BM_CSLContainsNotFound(int iters, int size) {
253 BenchmarkSuspender susp;
254 auto skipList = SkipListType::create(kInitHeadHeight);
255 CHECK_LT(size, kMaxValue);
257 for (int i = 0; i < size; ++i) {
263 for (int i = 0; i < iters; ++i) {
264 sum += skipList.contains(2 * i + 1);
272 void BM_AddSet(int iters, int size) {
273 BenchmarkSuspender susp;
275 for (int i = 0; i < size; ++i) {
276 aset.insert(gData[i]);
280 for (int i = size; i < size + iters; ++i) {
281 aset.insert(gData[i]);
285 void BM_AddSkipList(int iters, int size) {
286 BenchmarkSuspender susp;
287 auto skipList = SkipListType::create(kInitHeadHeight);
288 for (int i = 0; i < size; ++i) {
289 skipList.add(gData[i]);
293 for (int i = size; i < size + iters; ++i) {
294 skipList.add(gData[i]);
298 BENCHMARK(Accessor, iters) {
299 BenchmarkSuspender susp;
300 auto skiplist = SkipListType::createInstance(kInitHeadHeight);
301 auto sl = skiplist.get();
304 for (size_t i = 0; i < iters; ++i) {
305 SkipListAccessor accessor(sl);
309 // a benchmark to estimate the
310 // low bound of doing a ref counting for an Accessor
311 BENCHMARK(accessorBasicRefcounting, iters) {
312 BenchmarkSuspender susp;
313 auto* value = new std::atomic<int32_t>();
314 auto* dirty = new std::atomic<int32_t>();
316 folly::MicroSpinLock l;
320 for (size_t i = 0; i < iters; ++i) {
321 value->fetch_add(1, std::memory_order_relaxed);
322 if (dirty->load(std::memory_order_acquire) != 0) {
323 folly::MSLGuard g(l);
325 value->fetch_sub(1, std::memory_order_relaxed);
335 // Data For testing contention benchmark
336 class ConcurrentAccessData {
338 explicit ConcurrentAccessData(int size) :
339 skipList_(SkipListType::create(10)),
340 sets_(FLAGS_num_sets), locks_(FLAGS_num_sets) {
342 for (int i = 0; i < size; ++i) {
347 for (int i = 0; i < FLAGS_num_sets; ++i) {
348 locks_[i] = new RWSpinLock();
349 if (i > 0) sets_[i] = sets_[0];
352 // This requires knowledge of the C++ library internals. Only use it if we're
353 // using the GNU C++ library.
354 #ifdef _GLIBCXX_SYMVER
356 int64_t setMemorySize = sets_[0].size() * sizeof(*sets_[0].begin()._M_node);
357 int64_t cslMemorySize = 0;
358 for (auto it = skipList_.begin(); it != skipList_.end(); ++it) {
359 cslMemorySize += it.nodeSize();
362 LOG(INFO) << "size=" << sets_[0].size()
363 << "; std::set memory size=" << setMemorySize
364 << "; csl memory size=" << cslMemorySize;
367 readValues_.reserve(size);
368 deleteValues_.reserve(size);
369 writeValues_.reserve(size);
370 for (int i = size; i < 2 * size; ++i) {
371 readValues_.push_back(2 * i);
372 deleteValues_.push_back(2 * i);
374 // half new values and half already in the list
375 writeValues_.push_back((rand() % 2) + 2 * i);
377 std::random_shuffle(readValues_.begin(), readValues_.end());
378 std::random_shuffle(deleteValues_.begin(), deleteValues_.end());
379 std::random_shuffle(writeValues_.begin(), writeValues_.end());
382 ~ConcurrentAccessData() {
383 FOR_EACH(lock, locks_) delete *lock;
386 inline bool skipListFind(int /* idx */, ValueType val) {
387 return skipList_.contains(val);
389 inline void skipListInsert(int /* idx */, ValueType val) {
392 inline void skipListErase(int /* idx */, ValueType val) {
393 skipList_.remove(val);
396 inline bool setFind(int idx, ValueType val) {
397 RWSpinLock::ReadHolder g(locks_[idx]);
398 return sets_[idx].find(val) == sets_[idx].end();
400 inline void setInsert(int idx, ValueType val) {
401 RWSpinLock::WriteHolder g(locks_[idx]);
402 sets_[idx].insert(val);
404 inline void setErase(int idx, ValueType val) {
405 RWSpinLock::WriteHolder g(locks_[idx]);
406 sets_[idx].erase(val);
409 void runSkipList(int id, size_t iters) {
411 for (size_t i = 0; i < iters; ++i) {
412 sum += accessSkipList(id, i);
417 void runSet(size_t id, size_t iters) {
419 for (size_t i = 0; i < iters; ++i) {
420 sum += accessSet(id, i);
425 bool accessSkipList(int64_t id, size_t t) {
426 if (t > readValues_.size()) {
427 t = t % readValues_.size();
429 uint32_t h = folly::hash::twang_32from64(t * id);
432 if ((h & 0x31) == 0) { // 1/4 chance to delete
433 skipListErase(0, deleteValues_[t]);
435 skipListInsert(0, writeValues_[t]);
439 return skipListFind(0, readValues_[t]);
443 bool accessSet(int64_t id, size_t t) {
444 if (t > readValues_.size()) {
445 t = t % readValues_.size();
447 uint32_t h = folly::hash::twang_32from64(t * id);
448 int idx = (h % FLAGS_num_sets);
449 switch (h % 8) { // 1/8 chance to write
451 if ((h & 0x31) == 0) { // 1/32 chance to delete
452 setErase(idx, deleteValues_[t]);
454 setInsert(idx, writeValues_[t]);
458 return setFind(idx, readValues_[t]);
463 SkipListType::Accessor skipList_;
464 std::vector<SetType> sets_;
465 std::vector<RWSpinLock*> locks_;
467 std::vector<ValueType> readValues_;
468 std::vector<ValueType> writeValues_;
469 std::vector<ValueType> deleteValues_;
472 static std::map<int, std::shared_ptr<ConcurrentAccessData> > g_data;
474 static ConcurrentAccessData *mayInitTestData(int size) {
475 auto it = g_data.find(size);
476 if (it == g_data.end()) {
477 auto ptr = std::shared_ptr<ConcurrentAccessData>(
478 new ConcurrentAccessData(size));
482 return it->second.get();
485 void BM_ContentionCSL(int iters, int size) {
486 BenchmarkSuspender susp;
487 auto data = mayInitTestData(size);
488 std::vector<std::thread> threads;
491 for (int i = 0; i < FLAGS_num_threads; ++i) {
492 threads.push_back(std::thread(
493 &ConcurrentAccessData::runSkipList, data, i, iters));
495 FOR_EACH(t, threads) {
500 void BM_ContentionStdSet(int iters, int size) {
501 BenchmarkSuspender susp;
502 auto data = mayInitTestData(size);
503 std::vector<std::thread> threads;
506 for (int i = 0; i < FLAGS_num_threads; ++i) {
507 threads.push_back(std::thread(
508 &ConcurrentAccessData::runSet, data, i, iters));
510 FOR_EACH(t, threads) {
517 // Single-thread benchmarking
519 BENCHMARK_DRAW_LINE();
521 BENCHMARK_PARAM(BM_IterateOverSet, 1000);
522 BENCHMARK_PARAM(BM_IterateSkipList, 1000);
523 BENCHMARK_DRAW_LINE();
524 BENCHMARK_PARAM(BM_IterateOverSet, 1000000);
525 BENCHMARK_PARAM(BM_IterateSkipList, 1000000);
526 BENCHMARK_DRAW_LINE();
528 // find with keys in the set
529 BENCHMARK_PARAM(BM_SetContainsFound, 1000);
530 BENCHMARK_PARAM(BM_CSLContainsFound, 1000);
531 BENCHMARK_DRAW_LINE();
532 BENCHMARK_PARAM(BM_SetContainsFound, 100000);
533 BENCHMARK_PARAM(BM_CSLContainsFound, 100000);
534 BENCHMARK_DRAW_LINE();
535 BENCHMARK_PARAM(BM_SetContainsFound, 1000000);
536 BENCHMARK_PARAM(BM_CSLContainsFound, 1000000);
537 BENCHMARK_DRAW_LINE();
538 BENCHMARK_PARAM(BM_SetContainsFound, 10000000);
539 BENCHMARK_PARAM(BM_CSLContainsFound, 10000000);
540 BENCHMARK_DRAW_LINE();
543 // find with keys not in the set
544 BENCHMARK_PARAM(BM_SetContainsNotFound, 1000);
545 BENCHMARK_PARAM(BM_CSLContainsNotFound, 1000);
546 BENCHMARK_DRAW_LINE();
547 BENCHMARK_PARAM(BM_SetContainsNotFound, 100000);
548 BENCHMARK_PARAM(BM_CSLContainsNotFound, 100000);
549 BENCHMARK_DRAW_LINE();
550 BENCHMARK_PARAM(BM_SetContainsNotFound, 1000000);
551 BENCHMARK_PARAM(BM_CSLContainsNotFound, 1000000);
552 BENCHMARK_DRAW_LINE();
555 BENCHMARK_PARAM(BM_AddSet, 1000);
556 BENCHMARK_PARAM(BM_AddSkipList, 1000);
557 BENCHMARK_DRAW_LINE();
559 BENCHMARK_PARAM(BM_AddSet, 65536);
560 BENCHMARK_PARAM(BM_AddSkipList, 65536);
561 BENCHMARK_DRAW_LINE();
563 BENCHMARK_PARAM(BM_AddSet, 1000000);
564 BENCHMARK_PARAM(BM_AddSkipList, 1000000);
565 BENCHMARK_DRAW_LINE();
567 BENCHMARK_PARAM(BM_SetMerge, 1000);
568 BENCHMARK_PARAM(BM_CSLMergeIntersection, 1000);
569 BENCHMARK_PARAM(BM_CSLMergeLookup, 1000);
570 BENCHMARK_DRAW_LINE();
572 BENCHMARK_PARAM(BM_SetMerge, 65536);
573 BENCHMARK_PARAM(BM_CSLMergeIntersection, 65536);
574 BENCHMARK_PARAM(BM_CSLMergeLookup, 65536);
575 BENCHMARK_DRAW_LINE();
577 BENCHMARK_PARAM(BM_SetMerge, 1000000);
578 BENCHMARK_PARAM(BM_CSLMergeIntersection, 1000000);
579 BENCHMARK_PARAM(BM_CSLMergeLookup, 1000000);
580 BENCHMARK_DRAW_LINE();
583 // multithreaded benchmarking
585 BENCHMARK_PARAM(BM_ContentionStdSet, 1024);
586 BENCHMARK_PARAM(BM_ContentionCSL, 1024);
587 BENCHMARK_DRAW_LINE();
589 BENCHMARK_PARAM(BM_ContentionStdSet, 65536);
590 BENCHMARK_PARAM(BM_ContentionCSL, 65536);
591 BENCHMARK_DRAW_LINE();
593 BENCHMARK_PARAM(BM_ContentionStdSet, 1048576);
594 BENCHMARK_PARAM(BM_ContentionCSL, 1048576);
595 BENCHMARK_DRAW_LINE();
599 int main(int argc, char** argv) {
600 google::InitGoogleLogging(argv[0]);
601 gflags::ParseCommandLineFlags(&argc, &argv, true);
610 Benchmark on Intel(R) Xeon(R) CPU X5650 @2.67GHz
612 ==============================================================================
613 1 thread Benchmark Iters Total t t/iter iter/sec
614 ------------------------------------------------------------------------------
615 +37.0% BM_Accessor 100000 1.958 ms 19.58 ns 48.71 M
616 * BM_AccessorBasicRefcounting 100000 1.429 ms 14.29 ns 66.74 M
617 ------------------------------------------------------------------------------
618 + 603% BM_IterateOverSet/1000 100000 1.589 ms 15.89 ns 60.02 M
619 * BM_IterateSkipList/1000 100000 226 us 2.26 ns 422 M
620 ------------------------------------------------------------------------------
621 + 107% BM_IterateOverSet/976.6k 100000 8.324 ms 83.24 ns 11.46 M
622 * BM_IterateSkipList/976.6k 100000 4.016 ms 40.16 ns 23.75 M
623 ------------------------------------------------------------------------------
624 * BM_SetContainsFound/1000 100000 7.082 ms 70.82 ns 13.47 M
625 +39.9% BM_CSLContainsFound/1000 100000 9.908 ms 99.08 ns 9.625 M
626 ------------------------------------------------------------------------------
627 * BM_SetContainsFound/97.66k 100000 23.8 ms 238 ns 4.006 M
628 +5.97% BM_CSLContainsFound/97.66k 100000 25.23 ms 252.3 ns 3.781 M
629 ------------------------------------------------------------------------------
630 +33.6% BM_SetContainsFound/976.6k 100000 64.3 ms 643 ns 1.483 M
631 * BM_CSLContainsFound/976.6k 100000 48.13 ms 481.3 ns 1.981 M
632 ------------------------------------------------------------------------------
633 +30.3% BM_SetContainsFound/9.537M 100000 115.1 ms 1.151 us 848.6 k
634 * BM_CSLContainsFound/9.537M 100000 88.33 ms 883.3 ns 1.08 M
635 ------------------------------------------------------------------------------
636 * BM_SetContainsNotFound/1000 100000 2.081 ms 20.81 ns 45.83 M
637 +76.2% BM_CSLContainsNotFound/1000 100000 3.667 ms 36.67 ns 26.01 M
638 ------------------------------------------------------------------------------
639 * BM_SetContainsNotFound/97.66k 100000 6.049 ms 60.49 ns 15.77 M
640 +32.7% BM_CSLContainsNotFound/97.66k 100000 8.025 ms 80.25 ns 11.88 M
641 ------------------------------------------------------------------------------
642 * BM_SetContainsNotFound/976.6k 100000 7.464 ms 74.64 ns 12.78 M
643 +12.8% BM_CSLContainsNotFound/976.6k 100000 8.417 ms 84.17 ns 11.33 M
644 ------------------------------------------------------------------------------
645 * BM_AddSet/1000 100000 29.26 ms 292.6 ns 3.259 M
646 +70.0% BM_AddSkipList/1000 100000 49.75 ms 497.5 ns 1.917 M
647 ------------------------------------------------------------------------------
648 * BM_AddSet/64k 100000 38.73 ms 387.3 ns 2.462 M
649 +55.7% BM_AddSkipList/64k 100000 60.3 ms 603 ns 1.581 M
650 ------------------------------------------------------------------------------
651 * BM_AddSet/976.6k 100000 75.71 ms 757.1 ns 1.26 M
652 +33.6% BM_AddSkipList/976.6k 100000 101.2 ms 1.012 us 965.3 k
653 ------------------------------------------------------------------------------
654 + 716% BM_SetMerge/1000 100000 6.872 ms 68.72 ns 13.88 M
655 * BM_CSLMergeIntersection/1000 100000 842 us 8.42 ns 113.3 M
656 + 268% BM_CSLMergeLookup/1000 100000 3.1 ms 31 ns 30.76 M
657 ------------------------------------------------------------------------------
658 +36.3% BM_SetMerge/64k 100000 14.03 ms 140.3 ns 6.798 M
659 +39.4% BM_CSLMergeIntersection/64k 100000 14.35 ms 143.5 ns 6.645 M
660 * BM_CSLMergeLookup/64k 100000 10.29 ms 102.9 ns 9.266 M
661 ------------------------------------------------------------------------------
662 +10.3% BM_SetMerge/976.6k 100000 46.24 ms 462.4 ns 2.062 M
663 +25.1% BM_CSLMergeIntersection/976.6k 100000 52.47 ms 524.7 ns 1.818 M
664 * BM_CSLMergeLookup/976.6k 100000 41.94 ms 419.3 ns 2.274 M
665 ------------------------------------------------------------------------------
668 ==============================================================================
669 Contention benchmark 7/8 find, 3/32 insert, 1/32 erase
671 4 threads Benchmark Iters Total t t/iter iter/sec
672 ------------------------------------------------------------------------------
673 + 269% BM_ContentionStdSet/1k 100000 75.66 ms 756.6 ns 1.26 M
674 * BM_ContentionCSL/1k 100000 20.47 ms 204.7 ns 4.658 M
675 ------------------------------------------------------------------------------
676 + 228% BM_ContentionStdSet/64k 100000 105.6 ms 1.056 us 924.9 k
677 * BM_ContentionCSL/64k 100000 32.18 ms 321.8 ns 2.963 M
678 ------------------------------------------------------------------------------
679 + 224% BM_ContentionStdSet/1M 100000 117.4 ms 1.174 us 832.2 k
680 * BM_ContentionCSL/1M 100000 36.18 ms 361.8 ns 2.636 M
681 ------------------------------------------------------------------------------
684 12 threads Benchmark Iters Total t t/iter iter/sec
685 ------------------------------------------------------------------------------
686 + 697% BM_ContentionStdSet/1k 100000 455.3 ms 4.553 us 214.5 k
687 * BM_ContentionCSL/1k 100000 57.12 ms 571.2 ns 1.67 M
688 ------------------------------------------------------------------------------
689 +1257% BM_ContentionStdSet/64k 100000 654.9 ms 6.549 us 149.1 k
690 * BM_ContentionCSL/64k 100000 48.24 ms 482.4 ns 1.977 M
691 ------------------------------------------------------------------------------
692 +1262% BM_ContentionStdSet/1M 100000 657.3 ms 6.573 us 148.6 k
693 * BM_ContentionCSL/1M 100000 48.25 ms 482.5 ns 1.977 M
694 ------------------------------------------------------------------------------