perf trace: Put syscall formatter parms into struct
[firefly-linux-kernel-4.4.55.git] / tools / perf / builtin-trace.c
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/machine.h"
7 #include "util/session.h"
8 #include "util/thread.h"
9 #include "util/parse-options.h"
10 #include "util/strlist.h"
11 #include "util/intlist.h"
12 #include "util/thread_map.h"
13
14 #include <libaudit.h>
15 #include <stdlib.h>
16 #include <sys/mman.h>
17 #include <linux/futex.h>
18
19 /* For older distros: */
20 #ifndef MAP_STACK
21 # define MAP_STACK              0x20000
22 #endif
23
24 #ifndef MADV_HWPOISON
25 # define MADV_HWPOISON          100
26 #endif
27
28 #ifndef MADV_MERGEABLE
29 # define MADV_MERGEABLE         12
30 #endif
31
32 #ifndef MADV_UNMERGEABLE
33 # define MADV_UNMERGEABLE       13
34 #endif
35
36 struct syscall_arg {
37         unsigned long val;
38         u8            idx;
39         u8            mask;
40 };
41
42 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
43                                          struct syscall_arg *arg)
44 {
45         return scnprintf(bf, size, "%#lx", arg->val);
46 }
47
48 #define SCA_HEX syscall_arg__scnprintf_hex
49
50 static size_t syscall_arg__scnprintf_whence(char *bf, size_t size,
51                                             struct syscall_arg *arg)
52 {
53         int whence = arg->val;
54
55         switch (whence) {
56 #define P_WHENCE(n) case SEEK_##n: return scnprintf(bf, size, #n)
57         P_WHENCE(SET);
58         P_WHENCE(CUR);
59         P_WHENCE(END);
60 #ifdef SEEK_DATA
61         P_WHENCE(DATA);
62 #endif
63 #ifdef SEEK_HOLE
64         P_WHENCE(HOLE);
65 #endif
66 #undef P_WHENCE
67         default: break;
68         }
69
70         return scnprintf(bf, size, "%#x", whence);
71 }
72
73 #define SCA_WHENCE syscall_arg__scnprintf_whence
74
75 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
76                                                struct syscall_arg *arg)
77 {
78         int printed = 0, prot = arg->val;
79
80         if (prot == PROT_NONE)
81                 return scnprintf(bf, size, "NONE");
82 #define P_MMAP_PROT(n) \
83         if (prot & PROT_##n) { \
84                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
85                 prot &= ~PROT_##n; \
86         }
87
88         P_MMAP_PROT(EXEC);
89         P_MMAP_PROT(READ);
90         P_MMAP_PROT(WRITE);
91 #ifdef PROT_SEM
92         P_MMAP_PROT(SEM);
93 #endif
94         P_MMAP_PROT(GROWSDOWN);
95         P_MMAP_PROT(GROWSUP);
96 #undef P_MMAP_PROT
97
98         if (prot)
99                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
100
101         return printed;
102 }
103
104 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
105
106 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
107                                                 struct syscall_arg *arg)
108 {
109         int printed = 0, flags = arg->val;
110
111 #define P_MMAP_FLAG(n) \
112         if (flags & MAP_##n) { \
113                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
114                 flags &= ~MAP_##n; \
115         }
116
117         P_MMAP_FLAG(SHARED);
118         P_MMAP_FLAG(PRIVATE);
119 #ifdef MAP_32BIT
120         P_MMAP_FLAG(32BIT);
121 #endif
122         P_MMAP_FLAG(ANONYMOUS);
123         P_MMAP_FLAG(DENYWRITE);
124         P_MMAP_FLAG(EXECUTABLE);
125         P_MMAP_FLAG(FILE);
126         P_MMAP_FLAG(FIXED);
127         P_MMAP_FLAG(GROWSDOWN);
128 #ifdef MAP_HUGETLB
129         P_MMAP_FLAG(HUGETLB);
130 #endif
131         P_MMAP_FLAG(LOCKED);
132         P_MMAP_FLAG(NONBLOCK);
133         P_MMAP_FLAG(NORESERVE);
134         P_MMAP_FLAG(POPULATE);
135         P_MMAP_FLAG(STACK);
136 #ifdef MAP_UNINITIALIZED
137         P_MMAP_FLAG(UNINITIALIZED);
138 #endif
139 #undef P_MMAP_FLAG
140
141         if (flags)
142                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
143
144         return printed;
145 }
146
147 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
148
149 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
150                                                       struct syscall_arg *arg)
151 {
152         int behavior = arg->val;
153
154         switch (behavior) {
155 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
156         P_MADV_BHV(NORMAL);
157         P_MADV_BHV(RANDOM);
158         P_MADV_BHV(SEQUENTIAL);
159         P_MADV_BHV(WILLNEED);
160         P_MADV_BHV(DONTNEED);
161         P_MADV_BHV(REMOVE);
162         P_MADV_BHV(DONTFORK);
163         P_MADV_BHV(DOFORK);
164         P_MADV_BHV(HWPOISON);
165 #ifdef MADV_SOFT_OFFLINE
166         P_MADV_BHV(SOFT_OFFLINE);
167 #endif
168         P_MADV_BHV(MERGEABLE);
169         P_MADV_BHV(UNMERGEABLE);
170 #ifdef MADV_HUGEPAGE
171         P_MADV_BHV(HUGEPAGE);
172 #endif
173 #ifdef MADV_NOHUGEPAGE
174         P_MADV_BHV(NOHUGEPAGE);
175 #endif
176 #ifdef MADV_DONTDUMP
177         P_MADV_BHV(DONTDUMP);
178 #endif
179 #ifdef MADV_DODUMP
180         P_MADV_BHV(DODUMP);
181 #endif
182 #undef P_MADV_PHV
183         default: break;
184         }
185
186         return scnprintf(bf, size, "%#x", behavior);
187 }
188
189 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
190
191 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
192 {
193         enum syscall_futex_args {
194                 SCF_UADDR   = (1 << 0),
195                 SCF_OP      = (1 << 1),
196                 SCF_VAL     = (1 << 2),
197                 SCF_TIMEOUT = (1 << 3),
198                 SCF_UADDR2  = (1 << 4),
199                 SCF_VAL3    = (1 << 5),
200         };
201         int op = arg->val;
202         int cmd = op & FUTEX_CMD_MASK;
203         size_t printed = 0;
204
205         switch (cmd) {
206 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
207         P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
208         P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
209         P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
210         P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
211         P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
212         P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
213         P_FUTEX_OP(WAKE_OP);                                                      break;
214         P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
215         P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
216         P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
217         P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
218         P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
219         P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
220         default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
221         }
222
223         if (op & FUTEX_PRIVATE_FLAG)
224                 printed += scnprintf(bf + printed, size - printed, "|PRIV");
225
226         if (op & FUTEX_CLOCK_REALTIME)
227                 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
228
229         return printed;
230 }
231
232 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
233
234 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
235                                                struct syscall_arg *arg)
236 {
237         int printed = 0, flags = arg->val;
238
239         if (!(flags & O_CREAT))
240                 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
241
242         if (flags == 0)
243                 return scnprintf(bf, size, "RDONLY");
244 #define P_FLAG(n) \
245         if (flags & O_##n) { \
246                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
247                 flags &= ~O_##n; \
248         }
249
250         P_FLAG(APPEND);
251         P_FLAG(ASYNC);
252         P_FLAG(CLOEXEC);
253         P_FLAG(CREAT);
254         P_FLAG(DIRECT);
255         P_FLAG(DIRECTORY);
256         P_FLAG(EXCL);
257         P_FLAG(LARGEFILE);
258         P_FLAG(NOATIME);
259         P_FLAG(NOCTTY);
260 #ifdef O_NONBLOCK
261         P_FLAG(NONBLOCK);
262 #elif O_NDELAY
263         P_FLAG(NDELAY);
264 #endif
265 #ifdef O_PATH
266         P_FLAG(PATH);
267 #endif
268         P_FLAG(RDWR);
269 #ifdef O_DSYNC
270         if ((flags & O_SYNC) == O_SYNC)
271                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
272         else {
273                 P_FLAG(DSYNC);
274         }
275 #else
276         P_FLAG(SYNC);
277 #endif
278         P_FLAG(TRUNC);
279         P_FLAG(WRONLY);
280 #undef P_FLAG
281
282         if (flags)
283                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
284
285         return printed;
286 }
287
288 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
289
290 static struct syscall_fmt {
291         const char *name;
292         const char *alias;
293         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
294         bool       errmsg;
295         bool       timeout;
296         bool       hexret;
297 } syscall_fmts[] = {
298         { .name     = "access",     .errmsg = true, },
299         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
300         { .name     = "brk",        .hexret = true,
301           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
302         { .name     = "mmap",       .hexret = true, },
303         { .name     = "connect",    .errmsg = true, },
304         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
305         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
306         { .name     = "futex",      .errmsg = true,
307           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
308         { .name     = "ioctl",      .errmsg = true,
309           .arg_scnprintf = { [2] = SCA_HEX, /* arg */ }, },
310         { .name     = "lseek",      .errmsg = true,
311           .arg_scnprintf = { [2] = SCA_WHENCE, /* whence */ }, },
312         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
313         { .name     = "madvise",    .errmsg = true,
314           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
315                              [2] = SCA_MADV_BHV, /* behavior */ }, },
316         { .name     = "mmap",       .hexret = true,
317           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
318                              [2] = SCA_MMAP_PROT, /* prot */
319                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
320         { .name     = "mprotect",   .errmsg = true,
321           .arg_scnprintf = { [0] = SCA_HEX, /* start */
322                              [2] = SCA_MMAP_PROT, /* prot */ }, },
323         { .name     = "mremap",     .hexret = true,
324           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
325                              [4] = SCA_HEX, /* new_addr */ }, },
326         { .name     = "munmap",     .errmsg = true,
327           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
328         { .name     = "open",       .errmsg = true,
329           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
330         { .name     = "open_by_handle_at", .errmsg = true,
331           .arg_scnprintf = { [2] = SCA_OPEN_FLAGS, /* flags */ }, },
332         { .name     = "openat",     .errmsg = true,
333           .arg_scnprintf = { [2] = SCA_OPEN_FLAGS, /* flags */ }, },
334         { .name     = "poll",       .errmsg = true, .timeout = true, },
335         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
336         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
337         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
338         { .name     = "read",       .errmsg = true, },
339         { .name     = "recvfrom",   .errmsg = true, },
340         { .name     = "select",     .errmsg = true, .timeout = true, },
341         { .name     = "socket",     .errmsg = true, },
342         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
343         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
344 };
345
346 static int syscall_fmt__cmp(const void *name, const void *fmtp)
347 {
348         const struct syscall_fmt *fmt = fmtp;
349         return strcmp(name, fmt->name);
350 }
351
352 static struct syscall_fmt *syscall_fmt__find(const char *name)
353 {
354         const int nmemb = ARRAY_SIZE(syscall_fmts);
355         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
356 }
357
358 struct syscall {
359         struct event_format *tp_format;
360         const char          *name;
361         bool                filtered;
362         struct syscall_fmt  *fmt;
363         size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
364 };
365
366 static size_t fprintf_duration(unsigned long t, FILE *fp)
367 {
368         double duration = (double)t / NSEC_PER_MSEC;
369         size_t printed = fprintf(fp, "(");
370
371         if (duration >= 1.0)
372                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
373         else if (duration >= 0.01)
374                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
375         else
376                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
377         return printed + fprintf(fp, "): ");
378 }
379
380 struct thread_trace {
381         u64               entry_time;
382         u64               exit_time;
383         bool              entry_pending;
384         unsigned long     nr_events;
385         char              *entry_str;
386         double            runtime_ms;
387 };
388
389 static struct thread_trace *thread_trace__new(void)
390 {
391         return zalloc(sizeof(struct thread_trace));
392 }
393
394 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
395 {
396         struct thread_trace *ttrace;
397
398         if (thread == NULL)
399                 goto fail;
400
401         if (thread->priv == NULL)
402                 thread->priv = thread_trace__new();
403                 
404         if (thread->priv == NULL)
405                 goto fail;
406
407         ttrace = thread->priv;
408         ++ttrace->nr_events;
409
410         return ttrace;
411 fail:
412         color_fprintf(fp, PERF_COLOR_RED,
413                       "WARNING: not enough memory, dropping samples!\n");
414         return NULL;
415 }
416
417 struct trace {
418         struct perf_tool        tool;
419         int                     audit_machine;
420         struct {
421                 int             max;
422                 struct syscall  *table;
423         } syscalls;
424         struct perf_record_opts opts;
425         struct machine          host;
426         u64                     base_time;
427         FILE                    *output;
428         unsigned long           nr_events;
429         struct strlist          *ev_qualifier;
430         bool                    not_ev_qualifier;
431         struct intlist          *tid_list;
432         struct intlist          *pid_list;
433         bool                    sched;
434         bool                    multiple_threads;
435         double                  duration_filter;
436         double                  runtime_ms;
437 };
438
439 static bool trace__filter_duration(struct trace *trace, double t)
440 {
441         return t < (trace->duration_filter * NSEC_PER_MSEC);
442 }
443
444 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
445 {
446         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
447
448         return fprintf(fp, "%10.3f ", ts);
449 }
450
451 static bool done = false;
452
453 static void sig_handler(int sig __maybe_unused)
454 {
455         done = true;
456 }
457
458 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
459                                         u64 duration, u64 tstamp, FILE *fp)
460 {
461         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
462         printed += fprintf_duration(duration, fp);
463
464         if (trace->multiple_threads)
465                 printed += fprintf(fp, "%d ", thread->tid);
466
467         return printed;
468 }
469
470 static int trace__process_event(struct trace *trace, struct machine *machine,
471                                 union perf_event *event)
472 {
473         int ret = 0;
474
475         switch (event->header.type) {
476         case PERF_RECORD_LOST:
477                 color_fprintf(trace->output, PERF_COLOR_RED,
478                               "LOST %" PRIu64 " events!\n", event->lost.lost);
479                 ret = machine__process_lost_event(machine, event);
480         default:
481                 ret = machine__process_event(machine, event);
482                 break;
483         }
484
485         return ret;
486 }
487
488 static int trace__tool_process(struct perf_tool *tool,
489                                union perf_event *event,
490                                struct perf_sample *sample __maybe_unused,
491                                struct machine *machine)
492 {
493         struct trace *trace = container_of(tool, struct trace, tool);
494         return trace__process_event(trace, machine, event);
495 }
496
497 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
498 {
499         int err = symbol__init();
500
501         if (err)
502                 return err;
503
504         machine__init(&trace->host, "", HOST_KERNEL_ID);
505         machine__create_kernel_maps(&trace->host);
506
507         if (perf_target__has_task(&trace->opts.target)) {
508                 err = perf_event__synthesize_thread_map(&trace->tool, evlist->threads,
509                                                         trace__tool_process,
510                                                         &trace->host);
511         } else {
512                 err = perf_event__synthesize_threads(&trace->tool, trace__tool_process,
513                                                      &trace->host);
514         }
515
516         if (err)
517                 symbol__exit();
518
519         return err;
520 }
521
522 static int syscall__set_arg_fmts(struct syscall *sc)
523 {
524         struct format_field *field;
525         int idx = 0;
526
527         sc->arg_scnprintf = calloc(sc->tp_format->format.nr_fields - 1, sizeof(void *));
528         if (sc->arg_scnprintf == NULL)
529                 return -1;
530
531         for (field = sc->tp_format->format.fields->next; field; field = field->next) {
532                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
533                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
534                 else if (field->flags & FIELD_IS_POINTER)
535                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
536                 ++idx;
537         }
538
539         return 0;
540 }
541
542 static int trace__read_syscall_info(struct trace *trace, int id)
543 {
544         char tp_name[128];
545         struct syscall *sc;
546         const char *name = audit_syscall_to_name(id, trace->audit_machine);
547
548         if (name == NULL)
549                 return -1;
550
551         if (id > trace->syscalls.max) {
552                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
553
554                 if (nsyscalls == NULL)
555                         return -1;
556
557                 if (trace->syscalls.max != -1) {
558                         memset(nsyscalls + trace->syscalls.max + 1, 0,
559                                (id - trace->syscalls.max) * sizeof(*sc));
560                 } else {
561                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
562                 }
563
564                 trace->syscalls.table = nsyscalls;
565                 trace->syscalls.max   = id;
566         }
567
568         sc = trace->syscalls.table + id;
569         sc->name = name;
570
571         if (trace->ev_qualifier) {
572                 bool in = strlist__find(trace->ev_qualifier, name) != NULL;
573
574                 if (!(in ^ trace->not_ev_qualifier)) {
575                         sc->filtered = true;
576                         /*
577                          * No need to do read tracepoint information since this will be
578                          * filtered out.
579                          */
580                         return 0;
581                 }
582         }
583
584         sc->fmt  = syscall_fmt__find(sc->name);
585
586         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
587         sc->tp_format = event_format__new("syscalls", tp_name);
588
589         if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
590                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
591                 sc->tp_format = event_format__new("syscalls", tp_name);
592         }
593
594         if (sc->tp_format == NULL)
595                 return -1;
596
597         return syscall__set_arg_fmts(sc);
598 }
599
600 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
601                                       unsigned long *args)
602 {
603         size_t printed = 0;
604
605         if (sc->tp_format != NULL) {
606                 struct format_field *field;
607                 u8 bit = 1;
608                 struct syscall_arg arg = {
609                         .idx  = 0,
610                         .mask = 0,
611                 };
612
613                 for (field = sc->tp_format->format.fields->next; field;
614                      field = field->next, ++arg.idx, bit <<= 1) {
615                         if (arg.mask & bit)
616                                 continue;
617
618                         printed += scnprintf(bf + printed, size - printed,
619                                              "%s%s: ", printed ? ", " : "", field->name);
620                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
621                                 arg.val = args[arg.idx];
622                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
623                                                                       size - printed, &arg);
624                         } else {
625                                 printed += scnprintf(bf + printed, size - printed,
626                                                      "%ld", args[arg.idx]);
627                         }
628                 }
629         } else {
630                 int i = 0;
631
632                 while (i < 6) {
633                         printed += scnprintf(bf + printed, size - printed,
634                                              "%sarg%d: %ld",
635                                              printed ? ", " : "", i, args[i]);
636                         ++i;
637                 }
638         }
639
640         return printed;
641 }
642
643 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
644                                   struct perf_sample *sample);
645
646 static struct syscall *trace__syscall_info(struct trace *trace,
647                                            struct perf_evsel *evsel,
648                                            struct perf_sample *sample)
649 {
650         int id = perf_evsel__intval(evsel, sample, "id");
651
652         if (id < 0) {
653
654                 /*
655                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
656                  * before that, leaving at a higher verbosity level till that is
657                  * explained. Reproduced with plain ftrace with:
658                  *
659                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
660                  * grep "NR -1 " /t/trace_pipe
661                  *
662                  * After generating some load on the machine.
663                  */
664                 if (verbose > 1) {
665                         static u64 n;
666                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
667                                 id, perf_evsel__name(evsel), ++n);
668                 }
669                 return NULL;
670         }
671
672         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
673             trace__read_syscall_info(trace, id))
674                 goto out_cant_read;
675
676         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
677                 goto out_cant_read;
678
679         return &trace->syscalls.table[id];
680
681 out_cant_read:
682         if (verbose) {
683                 fprintf(trace->output, "Problems reading syscall %d", id);
684                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
685                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
686                 fputs(" information\n", trace->output);
687         }
688         return NULL;
689 }
690
691 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
692                             struct perf_sample *sample)
693 {
694         char *msg;
695         void *args;
696         size_t printed = 0;
697         struct thread *thread;
698         struct syscall *sc = trace__syscall_info(trace, evsel, sample);
699         struct thread_trace *ttrace;
700
701         if (sc == NULL)
702                 return -1;
703
704         if (sc->filtered)
705                 return 0;
706
707         thread = machine__findnew_thread(&trace->host, sample->pid,
708                                          sample->tid);
709         ttrace = thread__trace(thread, trace->output);
710         if (ttrace == NULL)
711                 return -1;
712
713         args = perf_evsel__rawptr(evsel, sample, "args");
714         if (args == NULL) {
715                 fprintf(trace->output, "Problems reading syscall arguments\n");
716                 return -1;
717         }
718
719         ttrace = thread->priv;
720
721         if (ttrace->entry_str == NULL) {
722                 ttrace->entry_str = malloc(1024);
723                 if (!ttrace->entry_str)
724                         return -1;
725         }
726
727         ttrace->entry_time = sample->time;
728         msg = ttrace->entry_str;
729         printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
730
731         printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,  args);
732
733         if (!strcmp(sc->name, "exit_group") || !strcmp(sc->name, "exit")) {
734                 if (!trace->duration_filter) {
735                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
736                         fprintf(trace->output, "%-70s\n", ttrace->entry_str);
737                 }
738         } else
739                 ttrace->entry_pending = true;
740
741         return 0;
742 }
743
744 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
745                            struct perf_sample *sample)
746 {
747         int ret;
748         u64 duration = 0;
749         struct thread *thread;
750         struct syscall *sc = trace__syscall_info(trace, evsel, sample);
751         struct thread_trace *ttrace;
752
753         if (sc == NULL)
754                 return -1;
755
756         if (sc->filtered)
757                 return 0;
758
759         thread = machine__findnew_thread(&trace->host, sample->pid,
760                                          sample->tid);
761         ttrace = thread__trace(thread, trace->output);
762         if (ttrace == NULL)
763                 return -1;
764
765         ret = perf_evsel__intval(evsel, sample, "ret");
766
767         ttrace = thread->priv;
768
769         ttrace->exit_time = sample->time;
770
771         if (ttrace->entry_time) {
772                 duration = sample->time - ttrace->entry_time;
773                 if (trace__filter_duration(trace, duration))
774                         goto out;
775         } else if (trace->duration_filter)
776                 goto out;
777
778         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
779
780         if (ttrace->entry_pending) {
781                 fprintf(trace->output, "%-70s", ttrace->entry_str);
782         } else {
783                 fprintf(trace->output, " ... [");
784                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
785                 fprintf(trace->output, "]: %s()", sc->name);
786         }
787
788         if (sc->fmt == NULL) {
789 signed_print:
790                 fprintf(trace->output, ") = %d", ret);
791         } else if (ret < 0 && sc->fmt->errmsg) {
792                 char bf[256];
793                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
794                            *e = audit_errno_to_name(-ret);
795
796                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
797         } else if (ret == 0 && sc->fmt->timeout)
798                 fprintf(trace->output, ") = 0 Timeout");
799         else if (sc->fmt->hexret)
800                 fprintf(trace->output, ") = %#x", ret);
801         else
802                 goto signed_print;
803
804         fputc('\n', trace->output);
805 out:
806         ttrace->entry_pending = false;
807
808         return 0;
809 }
810
811 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
812                                      struct perf_sample *sample)
813 {
814         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
815         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
816         struct thread *thread = machine__findnew_thread(&trace->host,
817                                                         sample->pid,
818                                                         sample->tid);
819         struct thread_trace *ttrace = thread__trace(thread, trace->output);
820
821         if (ttrace == NULL)
822                 goto out_dump;
823
824         ttrace->runtime_ms += runtime_ms;
825         trace->runtime_ms += runtime_ms;
826         return 0;
827
828 out_dump:
829         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
830                evsel->name,
831                perf_evsel__strval(evsel, sample, "comm"),
832                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
833                runtime,
834                perf_evsel__intval(evsel, sample, "vruntime"));
835         return 0;
836 }
837
838 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
839 {
840         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
841             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
842                 return false;
843
844         if (trace->pid_list || trace->tid_list)
845                 return true;
846
847         return false;
848 }
849
850 static int trace__process_sample(struct perf_tool *tool,
851                                  union perf_event *event __maybe_unused,
852                                  struct perf_sample *sample,
853                                  struct perf_evsel *evsel,
854                                  struct machine *machine __maybe_unused)
855 {
856         struct trace *trace = container_of(tool, struct trace, tool);
857         int err = 0;
858
859         tracepoint_handler handler = evsel->handler.func;
860
861         if (skip_sample(trace, sample))
862                 return 0;
863
864         if (trace->base_time == 0)
865                 trace->base_time = sample->time;
866
867         if (handler)
868                 handler(trace, evsel, sample);
869
870         return err;
871 }
872
873 static bool
874 perf_session__has_tp(struct perf_session *session, const char *name)
875 {
876         struct perf_evsel *evsel;
877
878         evsel = perf_evlist__find_tracepoint_by_name(session->evlist, name);
879
880         return evsel != NULL;
881 }
882
883 static int parse_target_str(struct trace *trace)
884 {
885         if (trace->opts.target.pid) {
886                 trace->pid_list = intlist__new(trace->opts.target.pid);
887                 if (trace->pid_list == NULL) {
888                         pr_err("Error parsing process id string\n");
889                         return -EINVAL;
890                 }
891         }
892
893         if (trace->opts.target.tid) {
894                 trace->tid_list = intlist__new(trace->opts.target.tid);
895                 if (trace->tid_list == NULL) {
896                         pr_err("Error parsing thread id string\n");
897                         return -EINVAL;
898                 }
899         }
900
901         return 0;
902 }
903
904 static int trace__run(struct trace *trace, int argc, const char **argv)
905 {
906         struct perf_evlist *evlist = perf_evlist__new();
907         struct perf_evsel *evsel;
908         int err = -1, i;
909         unsigned long before;
910         const bool forks = argc > 0;
911
912         if (evlist == NULL) {
913                 fprintf(trace->output, "Not enough memory to run!\n");
914                 goto out;
915         }
916
917         if (perf_evlist__add_newtp(evlist, "raw_syscalls", "sys_enter", trace__sys_enter) ||
918             perf_evlist__add_newtp(evlist, "raw_syscalls", "sys_exit", trace__sys_exit)) {
919                 fprintf(trace->output, "Couldn't read the raw_syscalls tracepoints information!\n");
920                 goto out_delete_evlist;
921         }
922
923         if (trace->sched &&
924             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
925                                    trace__sched_stat_runtime)) {
926                 fprintf(trace->output, "Couldn't read the sched_stat_runtime tracepoint information!\n");
927                 goto out_delete_evlist;
928         }
929
930         err = perf_evlist__create_maps(evlist, &trace->opts.target);
931         if (err < 0) {
932                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
933                 goto out_delete_evlist;
934         }
935
936         err = trace__symbols_init(trace, evlist);
937         if (err < 0) {
938                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
939                 goto out_delete_maps;
940         }
941
942         perf_evlist__config(evlist, &trace->opts);
943
944         signal(SIGCHLD, sig_handler);
945         signal(SIGINT, sig_handler);
946
947         if (forks) {
948                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
949                                                     argv, false, false);
950                 if (err < 0) {
951                         fprintf(trace->output, "Couldn't run the workload!\n");
952                         goto out_delete_maps;
953                 }
954         }
955
956         err = perf_evlist__open(evlist);
957         if (err < 0) {
958                 fprintf(trace->output, "Couldn't create the events: %s\n", strerror(errno));
959                 goto out_delete_maps;
960         }
961
962         err = perf_evlist__mmap(evlist, UINT_MAX, false);
963         if (err < 0) {
964                 fprintf(trace->output, "Couldn't mmap the events: %s\n", strerror(errno));
965                 goto out_close_evlist;
966         }
967
968         perf_evlist__enable(evlist);
969
970         if (forks)
971                 perf_evlist__start_workload(evlist);
972
973         trace->multiple_threads = evlist->threads->map[0] == -1 || evlist->threads->nr > 1;
974 again:
975         before = trace->nr_events;
976
977         for (i = 0; i < evlist->nr_mmaps; i++) {
978                 union perf_event *event;
979
980                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
981                         const u32 type = event->header.type;
982                         tracepoint_handler handler;
983                         struct perf_sample sample;
984
985                         ++trace->nr_events;
986
987                         err = perf_evlist__parse_sample(evlist, event, &sample);
988                         if (err) {
989                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
990                                 continue;
991                         }
992
993                         if (trace->base_time == 0)
994                                 trace->base_time = sample.time;
995
996                         if (type != PERF_RECORD_SAMPLE) {
997                                 trace__process_event(trace, &trace->host, event);
998                                 continue;
999                         }
1000
1001                         evsel = perf_evlist__id2evsel(evlist, sample.id);
1002                         if (evsel == NULL) {
1003                                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample.id);
1004                                 continue;
1005                         }
1006
1007                         if (sample.raw_data == NULL) {
1008                                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
1009                                        perf_evsel__name(evsel), sample.tid,
1010                                        sample.cpu, sample.raw_size);
1011                                 continue;
1012                         }
1013
1014                         handler = evsel->handler.func;
1015                         handler(trace, evsel, &sample);
1016
1017                         if (done)
1018                                 goto out_unmap_evlist;
1019                 }
1020         }
1021
1022         if (trace->nr_events == before) {
1023                 if (done)
1024                         goto out_unmap_evlist;
1025
1026                 poll(evlist->pollfd, evlist->nr_fds, -1);
1027         }
1028
1029         if (done)
1030                 perf_evlist__disable(evlist);
1031
1032         goto again;
1033
1034 out_unmap_evlist:
1035         perf_evlist__munmap(evlist);
1036 out_close_evlist:
1037         perf_evlist__close(evlist);
1038 out_delete_maps:
1039         perf_evlist__delete_maps(evlist);
1040 out_delete_evlist:
1041         perf_evlist__delete(evlist);
1042 out:
1043         return err;
1044 }
1045
1046 static int trace__replay(struct trace *trace)
1047 {
1048         const struct perf_evsel_str_handler handlers[] = {
1049                 { "raw_syscalls:sys_enter",  trace__sys_enter, },
1050                 { "raw_syscalls:sys_exit",   trace__sys_exit, },
1051         };
1052
1053         struct perf_session *session;
1054         int err = -1;
1055
1056         trace->tool.sample        = trace__process_sample;
1057         trace->tool.mmap          = perf_event__process_mmap;
1058         trace->tool.mmap2         = perf_event__process_mmap2;
1059         trace->tool.comm          = perf_event__process_comm;
1060         trace->tool.exit          = perf_event__process_exit;
1061         trace->tool.fork          = perf_event__process_fork;
1062         trace->tool.attr          = perf_event__process_attr;
1063         trace->tool.tracing_data = perf_event__process_tracing_data;
1064         trace->tool.build_id      = perf_event__process_build_id;
1065
1066         trace->tool.ordered_samples = true;
1067         trace->tool.ordering_requires_timestamps = true;
1068
1069         /* add tid to output */
1070         trace->multiple_threads = true;
1071
1072         if (symbol__init() < 0)
1073                 return -1;
1074
1075         session = perf_session__new(input_name, O_RDONLY, 0, false,
1076                                     &trace->tool);
1077         if (session == NULL)
1078                 return -ENOMEM;
1079
1080         err = perf_session__set_tracepoints_handlers(session, handlers);
1081         if (err)
1082                 goto out;
1083
1084         if (!perf_session__has_tp(session, "raw_syscalls:sys_enter")) {
1085                 pr_err("Data file does not have raw_syscalls:sys_enter events\n");
1086                 goto out;
1087         }
1088
1089         if (!perf_session__has_tp(session, "raw_syscalls:sys_exit")) {
1090                 pr_err("Data file does not have raw_syscalls:sys_exit events\n");
1091                 goto out;
1092         }
1093
1094         err = parse_target_str(trace);
1095         if (err != 0)
1096                 goto out;
1097
1098         setup_pager();
1099
1100         err = perf_session__process_events(session, &trace->tool);
1101         if (err)
1102                 pr_err("Failed to process events, error %d", err);
1103
1104 out:
1105         perf_session__delete(session);
1106
1107         return err;
1108 }
1109
1110 static size_t trace__fprintf_threads_header(FILE *fp)
1111 {
1112         size_t printed;
1113
1114         printed  = fprintf(fp, "\n _____________________________________________________________________\n");
1115         printed += fprintf(fp," __)    Summary of events    (__\n\n");
1116         printed += fprintf(fp,"              [ task - pid ]     [ events ] [ ratio ]  [ runtime ]\n");
1117         printed += fprintf(fp," _____________________________________________________________________\n\n");
1118
1119         return printed;
1120 }
1121
1122 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
1123 {
1124         size_t printed = trace__fprintf_threads_header(fp);
1125         struct rb_node *nd;
1126
1127         for (nd = rb_first(&trace->host.threads); nd; nd = rb_next(nd)) {
1128                 struct thread *thread = rb_entry(nd, struct thread, rb_node);
1129                 struct thread_trace *ttrace = thread->priv;
1130                 const char *color;
1131                 double ratio;
1132
1133                 if (ttrace == NULL)
1134                         continue;
1135
1136                 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
1137
1138                 color = PERF_COLOR_NORMAL;
1139                 if (ratio > 50.0)
1140                         color = PERF_COLOR_RED;
1141                 else if (ratio > 25.0)
1142                         color = PERF_COLOR_GREEN;
1143                 else if (ratio > 5.0)
1144                         color = PERF_COLOR_YELLOW;
1145
1146                 printed += color_fprintf(fp, color, "%20s", thread->comm);
1147                 printed += fprintf(fp, " - %-5d :%11lu   [", thread->tid, ttrace->nr_events);
1148                 printed += color_fprintf(fp, color, "%5.1f%%", ratio);
1149                 printed += fprintf(fp, " ] %10.3f ms\n", ttrace->runtime_ms);
1150         }
1151
1152         return printed;
1153 }
1154
1155 static int trace__set_duration(const struct option *opt, const char *str,
1156                                int unset __maybe_unused)
1157 {
1158         struct trace *trace = opt->value;
1159
1160         trace->duration_filter = atof(str);
1161         return 0;
1162 }
1163
1164 static int trace__open_output(struct trace *trace, const char *filename)
1165 {
1166         struct stat st;
1167
1168         if (!stat(filename, &st) && st.st_size) {
1169                 char oldname[PATH_MAX];
1170
1171                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
1172                 unlink(oldname);
1173                 rename(filename, oldname);
1174         }
1175
1176         trace->output = fopen(filename, "w");
1177
1178         return trace->output == NULL ? -errno : 0;
1179 }
1180
1181 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
1182 {
1183         const char * const trace_usage[] = {
1184                 "perf trace [<options>] [<command>]",
1185                 "perf trace [<options>] -- <command> [<options>]",
1186                 NULL
1187         };
1188         struct trace trace = {
1189                 .audit_machine = audit_detect_machine(),
1190                 .syscalls = {
1191                         . max = -1,
1192                 },
1193                 .opts = {
1194                         .target = {
1195                                 .uid       = UINT_MAX,
1196                                 .uses_mmap = true,
1197                         },
1198                         .user_freq     = UINT_MAX,
1199                         .user_interval = ULLONG_MAX,
1200                         .no_delay      = true,
1201                         .mmap_pages    = 1024,
1202                 },
1203                 .output = stdout,
1204         };
1205         const char *output_name = NULL;
1206         const char *ev_qualifier_str = NULL;
1207         const struct option trace_options[] = {
1208         OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
1209                     "list of events to trace"),
1210         OPT_STRING('o', "output", &output_name, "file", "output file name"),
1211         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
1212         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
1213                     "trace events on existing process id"),
1214         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
1215                     "trace events on existing thread id"),
1216         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
1217                     "system-wide collection from all CPUs"),
1218         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
1219                     "list of cpus to monitor"),
1220         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
1221                     "child tasks do not inherit counters"),
1222         OPT_UINTEGER('m', "mmap-pages", &trace.opts.mmap_pages,
1223                      "number of mmap data pages"),
1224         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
1225                    "user to profile"),
1226         OPT_CALLBACK(0, "duration", &trace, "float",
1227                      "show only events with duration > N.M ms",
1228                      trace__set_duration),
1229         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
1230         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
1231         OPT_END()
1232         };
1233         int err;
1234         char bf[BUFSIZ];
1235
1236         argc = parse_options(argc, argv, trace_options, trace_usage, 0);
1237
1238         if (output_name != NULL) {
1239                 err = trace__open_output(&trace, output_name);
1240                 if (err < 0) {
1241                         perror("failed to create output file");
1242                         goto out;
1243                 }
1244         }
1245
1246         if (ev_qualifier_str != NULL) {
1247                 const char *s = ev_qualifier_str;
1248
1249                 trace.not_ev_qualifier = *s == '!';
1250                 if (trace.not_ev_qualifier)
1251                         ++s;
1252                 trace.ev_qualifier = strlist__new(true, s);
1253                 if (trace.ev_qualifier == NULL) {
1254                         fputs("Not enough memory to parse event qualifier",
1255                               trace.output);
1256                         err = -ENOMEM;
1257                         goto out_close;
1258                 }
1259         }
1260
1261         err = perf_target__validate(&trace.opts.target);
1262         if (err) {
1263                 perf_target__strerror(&trace.opts.target, err, bf, sizeof(bf));
1264                 fprintf(trace.output, "%s", bf);
1265                 goto out_close;
1266         }
1267
1268         err = perf_target__parse_uid(&trace.opts.target);
1269         if (err) {
1270                 perf_target__strerror(&trace.opts.target, err, bf, sizeof(bf));
1271                 fprintf(trace.output, "%s", bf);
1272                 goto out_close;
1273         }
1274
1275         if (!argc && perf_target__none(&trace.opts.target))
1276                 trace.opts.target.system_wide = true;
1277
1278         if (input_name)
1279                 err = trace__replay(&trace);
1280         else
1281                 err = trace__run(&trace, argc, argv);
1282
1283         if (trace.sched && !err)
1284                 trace__fprintf_thread_summary(&trace, trace.output);
1285
1286 out_close:
1287         if (output_name != NULL)
1288                 fclose(trace.output);
1289 out:
1290         return err;
1291 }