3 * git+ssh://amsterdam.csail.mit.edu/home/am1/prof/proftools.git
4 * under spmc/lib/scopedperf.hh
6 * Modified by stephentu to disable for non-C++11 builds
9 #ifndef _SCOPED_PERF_H_
10 #define _SCOPED_PERF_H_
24 namespace scopedperf {
32 * statically enable/disable most of the generated code for profiling.
/*
 * Enabler policies: each exposes enabled(), which the profiling templates
 * below consult so the generated instrumentation can be compiled in or out.
 * NOTE(review): this listing is elided -- access specifiers and closing
 * braces are not visible; code lines are reproduced verbatim.
 */
34 class default_enabler {
/* Default policy: instrumentation active. */
36 bool enabled() const { return true; }
39 class always_enabled {
/* Explicit always-on policy (same behavior as default_enabler). */
41 bool enabled() const { return true; }
44 class always_disabled {
/* Always-off policy: enabled() is a constant false. */
46 bool enabled() const { return false; }
50 * get CPU id function type
/* Pointer-to-function type for obtaining the calling CPU's id; the region
 * factories below default an argument of this type to sched_getcpu. */
52 typedef int(*getcpu_fn)(void);
55 * spinlock: mostly to avoid pthread mutex sleeping.
57 #if !defined(XV6_KERNEL)
/* Acquire by busy-waiting on a 0 -> 1 compare-and-swap of the lock word. */
63 while (!__sync_bool_compare_and_swap(&x, 0, 1))
76 #if defined(XV6_KERNEL)
/* Local sched_getcpu() replacement for xv6 kernel builds, presumably
 * because the libc version is unavailable there (body elided). */
79 static inline int sched_getcpu() {
/* RAII guard: holds the given spinlock for the enclosing scope. */
84 class scoped_spinlock {
86 scoped_spinlock(spinlock *larg) : l(larg) {
/* Destructor releases via release().  NOTE(review): release() body is
 * elided here -- confirm it is safe against double-release if callers
 * release early. */
97 ~scoped_spinlock() { release(); }
106 * vector & pair: for portability.
/*
 * Minimal stand-ins for std::vector / std::pair with a fixed-capacity
 * inline buffer (the buffer declaration itself is elided in this listing).
 */
108 template<class A, class B>
114 template<class A, class B>
116 make_pair(const A &a, const B &b)
/* Start empty; capacity is sizeof(_buf)/sizeof(T), asserted on insert. */
129 vector() : _cnt(0) {}
/* Prepend by shifting existing elements up one slot with memmove.
 * NOTE(review): memmove is only valid for trivially copyable T; the uses
 * visible in this file store pointers and uint64_t pairs. */
130 void insert_front(T e) {
131 assert(_cnt < sizeof(_buf) / sizeof(T));
132 memmove(&_buf[1], &_buf[0], _cnt * sizeof(T));
136 void push_back(T e) {
137 assert(_cnt < sizeof(_buf) / sizeof(T));
/*
 * viter: just enough of an iterator (!=, ++, *) for range-for loops,
 * which is how the rest of this file consumes vector<T>.
 */
148 viter(const vector<T> *v, int pos) : _v(v), _pos(pos) {}
149 bool operator!=(const viter &other) const { return _pos != other._pos; }
150 void operator++() { _pos++; }
151 T operator*() { return _v->_buf[_pos]; }
156 begin(const vector<T> &v)
158 return viter<T>(&v, 0);
163 end(const vector<T> &v)
165 return viter<T>(&v, v._cnt);
170 * fast log-base-2, for histograms.
/*
 * 256-entry lookup table: log2table[i] is the number of bits needed to
 * represent i (0->0, 1->1, 2..3->2, ...), built with doubling macros.
 * NOTE(review): the base R2() macro definition is elided from this listing.
 */
172 static const uint8_t log2table[256] = {
174 #define R4(x) R2(x), R2(x)
175 #define R8(x) R4(x), R4(x)
176 #define R16(x) R8(x), R8(x)
177 #define R32(x) R16(x), R16(x)
178 #define R64(x) R32(x), R32(x)
179 #define R128(x) R64(x), R64(x)
180 0, 1, R2(2), R4(3), R8(4), R16(5), R32(6), R64(7), R128(8)
/*
 * log2r: binary-search recursion -- if the top half of the value is
 * nonzero, recurse on it and add Nbits/2, else recurse on the bottom half;
 * each step halves Nbits (the table-lookup base case is elided).
 */
190 template<class T, int Nbits>
197 T hi = v >> (Nbits/2);
199 return Nbits/2 + log2r<T, Nbits/2>(hi);
201 return log2r<T, Nbits/2>(v);
/* Public entry: dispatch on the full bit width of T. */
209 return log2r<T, sizeof(T)*8>(v);
214 * ctrgroup: a group of performance counters.
/*
 * ctrgroup_chain: a compile-time linked list of heterogeneous counters,
 * one template-recursion level per counter type.
 */
217 template<typename... Counters>
218 class ctrgroup_chain;
/* Base case: empty group -- zero counters, no-op sampling. */
221 class ctrgroup_chain<> {
224 static const uint cg_nctr = 0;
225 void cg_get_samples(uint64_t *v) const {}
226 void cg_get_delta(uint64_t *delta, uint64_t *prev) const {}
227 vector<const char*> get_names() const { return {}; }
/* Recursive case: holds one counter and inherits the rest of the chain. */
230 template<typename One, typename... Others>
231 class ctrgroup_chain<One, Others...> : ctrgroup_chain<Others...> {
233 ctrgroup_chain(One *x, Others*... y)
234 : ctrgroup_chain<Others...>(y...), ctr(x)
/* Total number of counters in the whole chain. */
239 static const uint cg_nctr = 1 + ctrgroup_chain<Others...>::cg_nctr;
/* Fill v[0..cg_nctr) with a raw sample from each counter, in order. */
241 void cg_get_samples(uint64_t *v) const {
242 v[0] = ctr->sample();
243 ctrgroup_chain<Others...>::cg_get_samples(v+1);
/* Per-counter delta since *prev; masking with ctr->mask makes the
 * subtraction wrap correctly for counters narrower than 64 bits. */
246 void cg_get_delta(uint64_t *delta, uint64_t *prev) const {
247 uint64_t x = ctr->sample();
248 *delta = (x - *prev) & ctr->mask;
250 ctrgroup_chain<Others...>::cg_get_delta(delta+1, prev+1);
/* Names in declaration order: recurse first, then prepend our own. */
253 vector<const char*> get_names() const {
254 vector<const char*> v = ctrgroup_chain<Others...>::get_names();
255 v.insert_front(ctr->name);
260 const One *const ctr;
/* Factory: deduces the counter types from the argument list. */
263 template<typename... Counters>
264 ctrgroup_chain<Counters...>
265 ctrgroup(Counters*... args)
267 return ctrgroup_chain<Counters...>(args...);
272 * perfsum: aggregating counter deltas across multiple CPUs.
/*
 * perfsum_base: non-template base keeping a global, spinlock-protected
 * registry of every live perfsum so printall()/resetall() can reach them.
 */
276 enum display_opt { show, hide };
/* Constructor self-registers into the global list under the lock. */
278 perfsum_base(const char *n, display_opt d) : name(n), disp(d) {
279 scoped_spinlock x(get_sums_lock());
280 get_sums()->push_back(this);
/* Print every registered perfsum marked `show`; w0/w are column widths. */
283 static void printall(int w0 = 17, int w = 13) {
284 scoped_spinlock x(get_sums_lock());
285 auto sums = get_sums();
286 for (perfsum_base *ps: *sums)
287 if (ps->disp == show)
/* Reset every registered perfsum (hidden ones included). */
291 static void resetall() {
292 scoped_spinlock x(get_sums_lock());
293 for (perfsum_base *ps: *get_sums())
297 virtual void print(int w0, int w) const = 0;
298 virtual void reset() = 0;
/* Emit one formatted table row: label column, then f(elem) per element. */
301 template<class Row, class Callback>
302 static void print_row(const char *rowname, const Row &r,
303 int w0, int w, Callback f)
305 std::cout << std::left << std::setw(w0) << rowname;
306 for (const auto &elem: r)
307 std::cout << std::left << std::setw(w) << f(elem) << " ";
308 std::cout << std::endl;
312 const display_opt disp;
/* Registry and its lock are function-local statics, presumably to avoid
 * static-initialization-order issues across translation units -- confirm. */
315 static vector<perfsum_base*> *get_sums() {
316 static vector<perfsum_base*> v;
320 static spinlock *get_sums_lock() {
329 /* Avoid compile-time reordering across performance counter reads */
330 __asm __volatile("" ::: "memory");
/*
 * perfsum_tmpl: common state for counter-group-based perfsums -- holds the
 * group pointer and per-CPU sizing; Enabler mixes in enabled().
 */
333 template<typename Enabler, typename... Counters>
334 class perfsum_tmpl : public perfsum_base, public Enabler {
336 perfsum_tmpl(const ctrgroup_chain<Counters...> *c,
337 const char *n, perfsum_base::display_opt d)
338 : perfsum_base(n, d), cg(c)
/* Number of counters in the attached group. */
342 static const uint ps_nctr = ctrgroup_chain<Counters...>::cg_nctr;
345 const struct ctrgroup_chain<Counters...> *const cg;
/* Fixed upper bound on CPU ids indexed into the per-CPU stats arrays. */
346 enum { maxcpu = 256 };
/* Fold one field (selected by f) of the per-CPU stats across all CPUs. */
348 template<class Stats, class T>
349 static uint64_t addcpus(const Stats stat[], T f) {
351 for (uint i = 0; i < maxcpu; i++)
358 * perfsum_ctr: aggregate counts of performance events.
360 template<typename Enabler, typename... Counters>
361 class perfsum_ctr : public perfsum_tmpl<Enabler, Counters...> {
/* Stand-alone sum over a counter group; no base ratio. */
363 perfsum_ctr(const ctrgroup_chain<Counters...> *c,
364 const char *n, perfsum_base::display_opt d)
365 : perfsum_tmpl<Enabler, Counters...>(c, n, d), base(0)
/* Fraction variant: shares basesum's group and reports its own sums
 * relative to basesum's (see perfsum_frac below). */
370 perfsum_ctr(const char *n,
371 const perfsum_ctr<Enabler, Counters...> *basesum,
372 perfsum_base::display_opt d)
373 : perfsum_tmpl<Enabler, Counters...>(basesum->cg, n, d), base(basesum)
/* Snapshot raw samples for all counters into s (start of a region). */
378 void get_samples(uint64_t *s) const {
380 perfsum_tmpl<Enabler, Counters...>::cg->cg_get_samples(s);
/* Accumulate the deltas since snapshot s into this CPU's slot.
 * NOTE(review): cpuid is used unchecked as an index; ids >= maxcpu (256)
 * would overflow stat[] -- confirm callers guarantee the bound. */
384 void record(uint cpuid, uint64_t *s) {
385 uint64_t delta[perfsum_tmpl<Enabler, Counters...>::ps_nctr];
388 perfsum_tmpl<Enabler, Counters...>::cg->cg_get_delta(delta, s);
391 for (uint i = 0; i < perfsum_tmpl<Enabler, Counters...>::ps_nctr; i++)
392 stat[cpuid].sum[i] += delta[i];
/* Render avg/total/count rows; for fraction sums the denominator is the
 * base sum, otherwise the region invocation count. */
396 void print(int w0, int w) const /* override */ {
397 if (!Enabler::enabled())
400 auto &cg = perfsum_tmpl<Enabler, Counters...>::cg;
401 vector<pair<uint64_t, uint64_t> > p;
402 for (uint i = 0; i < cg->cg_nctr; i++) {
404 base ? this->addcpus(base->stat, [&](const stats *s) { return s->sum[i]; })
405 : this->addcpus(stat, [&](const stats *s) { return s->count; });
406 p.push_back(make_pair(b,
407 this->addcpus(stat, [i](const stats *s) { return s->sum[i]; })));
410 this->print_row(perfsum_base::name, cg->get_names(), w0, w, [](const char *name)
/* Two avg formattings (floating-point vs integer divide) -- the lines
 * selecting between them are elided from this listing. */
412 this->print_row(" avg", p, w0, w, [](const pair<uint64_t, uint64_t> &e)
414 { return ((double) e.second) / (double) e.first; }
416 { return e.second / e.first; }
419 this->print_row(" total", p, w0, w, [](const pair<uint64_t, uint64_t> &e)
420 { return e.second; });
421 this->print_row(" count", p, w0, w, [](const pair<uint64_t, uint64_t> &e)
422 { return e.first; });
/* Clear all per-CPU accumulators. */
425 void reset() /* override */ {
426 memset(stat, 0, sizeof(stat));
/* Per-CPU accumulator slot; aligned(64), presumably so each CPU's slot
 * occupies its own cache line (avoiding false sharing) -- confirm. */
432 uint64_t sum[perfsum_tmpl<Enabler, Counters...>::ps_nctr];
433 } __attribute__((aligned (64)));
435 struct stats stat[perfsum_tmpl<Enabler, Counters...>::maxcpu];
436 const struct perfsum_ctr<Enabler, Counters...> *const base;
/*
 * perfsum_ctr_inlinegroup: owns its counter group inline (constructed from
 * the counters) instead of referencing an external ctrgroup_chain.
 */
439 template<typename Enabler, typename... Counters>
440 class perfsum_ctr_inlinegroup :
441 public ctrgroup_chain<Counters...>,
442 public perfsum_ctr<Enabler, Counters...>
445 perfsum_ctr_inlinegroup(const char *n, perfsum_base::display_opt d,
447 : ctrgroup_chain<Counters...>(ctrs...),
448 perfsum_ctr<Enabler, Counters...>(this, n, d) {}
/*
 * Factory helpers.  NOTE(review): these return the perfsum by value, yet
 * perfsum_base's constructor registered `this` in the global list;
 * correctness relies on copy elision at the (typically static-variable)
 * call sites -- confirm.
 */
451 template<typename Enabler = default_enabler, typename... Counters>
452 perfsum_ctr<Enabler, Counters...>
453 perfsum(const char *name, const ctrgroup_chain<Counters...> *c,
454 const perfsum_base::display_opt d = perfsum_base::show)
456 return perfsum_ctr<Enabler, Counters...>(c, name, d);
/* A perfsum that bundles its own counter group; always shown. */
459 template<typename Enabler = default_enabler, typename... Counters>
460 perfsum_ctr_inlinegroup<Enabler, Counters...>
461 perfsum_group(const char *name, Counters*... c)
463 return perfsum_ctr_inlinegroup<Enabler, Counters...>(name, perfsum_base::show, c...);
/* A ratio-style perfsum reported relative to `base` (see print()). */
466 template<typename Enabler, typename... Counters>
467 perfsum_ctr<Enabler, Counters...>
468 perfsum_frac(const char *name,
469 const perfsum_ctr<Enabler, Counters...> *base)
471 return perfsum_ctr<Enabler, Counters...>(name, base, perfsum_base::show);
475 * perfsum_hist: histogram-based aggregates.
477 template<typename Enabler, typename... Counters>
478 class perfsum_hist_tmpl : public perfsum_tmpl<Enabler, Counters...> {
480 perfsum_hist_tmpl(const ctrgroup_chain<Counters...> *c,
481 const char *n, perfsum_base::display_opt d)
482 : perfsum_tmpl<Enabler, Counters...>(c, n, d)
/* Snapshot raw samples (start of a region), as in perfsum_ctr. */
487 void get_samples(uint64_t *s) const {
489 perfsum_tmpl<Enabler, Counters...>::cg->cg_get_samples(s);
/* Bucket each counter delta by log2 into this CPU's histogram.
 * NOTE(review): as in perfsum_ctr::record, cpuid is unchecked against
 * maxcpu -- confirm callers guarantee the bound. */
493 void record(uint cpuid, uint64_t *s) {
494 uint64_t delta[perfsum_tmpl<Enabler, Counters...>::ps_nctr];
497 perfsum_tmpl<Enabler, Counters...>::cg->cg_get_delta(delta, s);
500 for (uint i = 0; i < perfsum_tmpl<Enabler, Counters...>::ps_nctr; i++)
501 stat[cpuid].hist[i].count[log2(delta[i])]++;
/* Print the histogram, trimmed to the populated [first, last] range. */
504 void print(int w0, int w) const /* override */ {
505 if (!Enabler::enabled())
508 uint first = nbuckets, last = 0;
510 auto &cg = perfsum_tmpl<Enabler, Counters...>::cg;
512 for (uint i = 0; i < cg->cg_nctr; i++) {
/* Sum each bucket across CPUs while tracking the populated range; the
 * non-empty test guarding first/last is elided from this listing. */
514 for (uint j = 0; j < nbuckets; j++) {
515 v.count[j] = this->addcpus(stat, [&](const stats *s) { return s->hist[i].count[j]; });
517 if (j < first) first = j;
518 if (j > last) last = j;
524 this->print_row(perfsum_base::name, cg->get_names(), w0, w, [](const char *name)
/* One row per populated power-of-two bucket, labeled " < 2^i". */
526 for (uint i = first; i <= last; i++) {
528 snprintf(n, sizeof(n), " < 2^%d", i);
529 this->print_row(n, p, w0, w, [&](const buckets &b) { return b.count[i]; });
531 this->print_row(" total", p, w0, w, [](const buckets &b)
532 { uint64_t s = 0; for (auto x: b.count) s += x; return s; });
/* Clear all per-CPU histograms. */
535 void reset() /* override */ {
536 memset(stat, 0, sizeof(stat));
/* One bucket per possible log2 value of a 64-bit delta (0..64). */
540 enum { nbuckets = sizeof(uint64_t)*8 + 1 };
543 uint64_t count[nbuckets];
/* Per-CPU histogram slot; aligned(64), presumably to keep CPUs' slots on
 * separate cache lines -- confirm. */
547 struct buckets hist[perfsum_tmpl<Enabler, Counters...>::ps_nctr];
548 } __attribute__((aligned (64)));
550 struct stats stat[perfsum_tmpl<Enabler, Counters...>::maxcpu];
/* Factory for histogram perfsums, mirroring perfsum(). */
553 template<typename Enabler = default_enabler, typename... Counters>
554 perfsum_hist_tmpl<Enabler, Counters...>
555 perfsum_hist(const char *name, const ctrgroup_chain<Counters...> *c,
556 const perfsum_base::display_opt d = perfsum_base::show)
558 return perfsum_hist_tmpl<Enabler, Counters...>(c, name, d);
563 * namedctr &c: actual counter implementations.
/* Base for all counters: a display name plus the counter's width mask. */
565 template<uint64_t CounterWidth>
568 namedctr(const char *n) : name(n) {}
/* All-ones mask of CounterWidth bits.  Built in two steps so that
 * CounterWidth == 64 avoids the undefined full-width shift 1ULL << 64. */
571 static const uint64_t mask =
572 ((1ULL << (CounterWidth - 1)) - 1) << 1 | 1;
/* tsc_ctr: x86 time-stamp counter via RDTSC (result in EDX:EAX). */
575 class tsc_ctr : public namedctr<64> {
577 tsc_ctr() : namedctr("tsc") {}
578 static uint64_t sample() {
580 __asm __volatile("rdtsc" : "=a" (a), "=d" (d));
/* NOTE(review): `a | (d << 32)` needs a and d to be 64-bit; their
 * declarations are elided from this listing -- confirm. */
581 return a | (d << 32);
/* tscp_ctr: RDTSCP variant; the instruction also writes ECX ("=c"(c)). */
585 class tscp_ctr : public namedctr<64> {
587 tscp_ctr() : namedctr("tscp") {}
588 static uint64_t sample() {
590 __asm __volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c));
591 return a | (d << 32);
/* pmc_ctr: read a programmable performance counter via RDPMC. */
595 template<uint64_t CounterWidth>
596 class pmc_ctr : public namedctr<CounterWidth> {
/* Either a fixed counter number n (named "pmc<n>") or a name with the
 * counter number assigned later (cn starts at -1). */
598 pmc_ctr(int n) : namedctr<CounterWidth>(mkname(n)), cn(n) {}
599 pmc_ctr(const char *nm) : namedctr<CounterWidth>(nm), cn(-1) {}
/* RDPMC selects the counter via ECX ("c"(cn)); result in EDX:EAX. */
601 uint64_t sample() const {
603 __asm __volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (cn));
604 return a | (d << 32);
/* Heap-format "pmc<n>".  NOTE(review): the buffer is never freed -- an
 * intentional-looking leak for a program-lifetime name, but confirm. */
610 static const char* mkname(int n) {
611 char *buf = new char[32];
612 snprintf(buf, 32, "pmc%d", n);
/*
 * pmc_setup: a pmc_ctr that also programs the event selector (pmc_v) into
 * a free hardware counter slot via the spmc sysfs interface.
 */
617 template<uint64_t CounterWidth = 64>
618 class pmc_setup : public pmc_ctr<CounterWidth> {
620 pmc_setup(uint64_t v, const char *nm)
621 : pmc_ctr<CounterWidth>(nm), pmc_v(v) {}
// Already configured? (surrounding control flow elided in this listing)
624 if (pmc_ctr<CounterWidth>::cn >= 0)
628 * XXX detect how many counters the hardware has
630 static bool pmcuse[4];
631 static spinlock pmcuselock;
// Claim the first unused of the 4 assumed counter slots, under the lock.
634 scoped_spinlock x(&pmcuselock);
635 while (n < 4 && pmcuse[n])
642 // ugly but effective
// Program the event on every CPU by shelling out a loop that echoes pmc_v
// (in hex) into each per-CPU spmc sysfs file; abort if the command fails.
643 std::stringstream ss;
644 ss << "for f in /sys/kernel/spmc/cpu*/" << n << "; do "
645 << "echo " << std::hex << pmc_v << " > $f; done";
646 assert(0 == system(ss.str().c_str()));
// Remember the claimed slot so sample() reads the right counter.
649 pmc_ctr<CounterWidth>::cn = n;
/* tod_ctr: wall-clock "counter" in microseconds via gettimeofday. */
657 class tod_ctr : public namedctr<64> {
659 tod_ctr() : namedctr("tod-usec") {}
660 uint64_t sample() const {
662 gettimeofday(&tv, 0);
663 return ((uint64_t) tv.tv_usec) + ((uint64_t) tv.tv_sec) * 1000000;
/* zero_ctr: always samples 0 -- presumably a placeholder counter. */
668 class zero_ctr : public namedctr<64> {
670 zero_ctr() : namedctr("zero") {}
671 uint64_t sample() const { return 0; }
676 * scoped performance-counting regions, which record samples into a perfsum.
/*
 * base_perf_region: common region state.  On construction, caches whether
 * the perfsum's Enabler is on and (if so) the CPU id; lap() records the
 * deltas since the stored samples into the perfsum.
 */
678 template<typename Perfsum>
679 class base_perf_region {
681 base_perf_region(Perfsum *psarg, getcpu_fn getcpu)
682 : ps(psarg), enabled(ps->enabled()), cpuid(enabled ? getcpu() : 0)
688 // invoke lap multiple times to precisely measure iterations
689 // (use same measurement for end of one & start of next round)
692 ps->record(cpuid, s);
/* Starting samples, one slot per counter in the perfsum's group. */
699 uint64_t s[Perfsum::ps_nctr];
/* scoped_perf_region: RAII region; destructor performs the measurement. */
702 template<typename Perfsum>
703 class scoped_perf_region : public base_perf_region<Perfsum> {
705 scoped_perf_region(Perfsum *psarg, getcpu_fn getcpu)
706 : base_perf_region<Perfsum>(psarg, getcpu) {}
707 ~scoped_perf_region() { base_perf_region<Perfsum>::lap(); }
/* killable_perf_region: like scoped_perf_region, but the measurement can
 * be finalized early (stop) or suppressed entirely (kill). */
710 template<typename Perfsum>
711 class killable_perf_region : public base_perf_region<Perfsum> {
713 killable_perf_region(Perfsum *psarg, getcpu_fn getcpu)
714 : base_perf_region<Perfsum>(psarg, getcpu), active(true) {}
715 ~killable_perf_region() { stop(); }
717 // perform a final measurement, if needed before destructor
720 base_perf_region<Perfsum>::lap();
724 // prevent destructor from performing a measurement
725 void kill() { active = false; }
/* Factories: deduce Perfsum and default the CPU-id getter. */
731 template<typename Perfsum>
732 scoped_perf_region<Perfsum>
733 perf_region(Perfsum *ps, getcpu_fn getcpu = sched_getcpu)
735 return scoped_perf_region<Perfsum>(ps, getcpu);
738 template<typename Perfsum>
739 killable_perf_region<Perfsum>
740 killable_region(Perfsum *ps, getcpu_fn getcpu = sched_getcpu)
742 return killable_perf_region<Perfsum>(ps, getcpu);
747 * macros for the common case of putting in a scoped perf-counting region.
/* Token pasting through two levels so __COUNTER__ expands before ##. */
749 #define __PERF_CONCAT2(a, b) a ## b
750 #define __PERF_CONCAT(a, b) __PERF_CONCAT2(a, b)
/* A fresh identifier per expansion, via __COUNTER__. */
751 #define __PERF_ANON __PERF_CONCAT(__anon_id_, __COUNTER__)
/* Declare a function-local static perfsum for this call site, then open a
 * region object bound to it. */
753 #define __PERF_REGION(region_var, sum_var, region_type, text, group) \
754 static auto __PERF_CONCAT(sum_var, _sum) = scopedperf::perfsum(text, group); \
755 auto region_var = region_type(&__PERF_CONCAT(sum_var, _sum));
/* Region covering the rest of the scope; no handle kept. */
757 #define ANON_REGION(text, group) \
758 __PERF_REGION(__PERF_ANON, __PERF_ANON, scopedperf::perf_region, text, group)
/* Same, but `var` names the region object. */
759 #define PERF_REGION(var, text, group) \
760 __PERF_REGION(var, __PERF_ANON, scopedperf::perf_region, text, group)
/* Named region that can be stopped or killed before scope exit. */
761 #define KILLABLE_REGION(var, text, group) \
762 __PERF_REGION(var, __PERF_ANON, scopedperf::killable_region, text, group)
/* Declare a file-scope counter plus a one-counter group wrapping it. */
764 #define STATIC_COUNTER_DECL(ctrtype, ctrname, groupname) \
765 static ctrtype ctrname; \
766 static ::scopedperf::ctrgroup_chain< ctrtype > groupname(&ctrname);
/* Pass-throughs when profiling is compiled in. */
767 #define PERF_EXPR(expr) expr
768 #define PERF_DECL(decl) decl
/* Class-static counter: the in-class declaration... */
770 #define CLASS_STATIC_COUNTER_DECL(ctrtype, ctrname, groupname) \
771 static ctrtype ctrname; \
772 static ::scopedperf::ctrgroup_chain< ctrtype > groupname;
/* ...and the matching out-of-class definition. */
773 #define CLASS_STATIC_COUNTER_IMPL(clsname, ctrtype, ctrname, groupname) \
774 ctrtype clsname::ctrname; \
775 ::scopedperf::ctrgroup_chain< ctrtype > clsname::groupname(&ctrname) ;
777 } /* namespace scopedperf */
779 #else /* !USE_PERF_CTRS */
/*
 * Profiling compiled out: every macro becomes a no-op so call sites need
 * no #ifdefs.  Statement-position macros expand to ((void)0); pure
 * declaration macros expand to nothing.
 */
781 #define ANON_REGION(text, group) ((void)0)
782 #define PERF_REGION(var, text, group) ((void)0)
783 #define KILLABLE_REGION(var, text, group) ((void)0)
785 #define STATIC_COUNTER_DECL(ctrtype, ctrname, groupname)
786 #define PERF_EXPR(expr) ((void)0)
787 #define PERF_DECL(decl)
789 #define CLASS_STATIC_COUNTER_DECL(ctrtype, ctrname, groupname)
790 #define CLASS_STATIC_COUNTER_IMPL(clsname, ctrtype, ctrname, groupname)
792 #endif /* USE_PERF_CTRS */
794 #endif /* _SCOPED_PERF_H_ */