Merge branch 'perf/urgent' into perf/core, to merge fixes before pulling more changes
[firefly-linux-kernel-4.4.55.git] / tools / perf / builtin-trace.c
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/exec_cmd.h"
7 #include "util/machine.h"
8 #include "util/session.h"
9 #include "util/thread.h"
10 #include "util/parse-options.h"
11 #include "util/strlist.h"
12 #include "util/intlist.h"
13 #include "util/thread_map.h"
14 #include "util/stat.h"
15 #include "trace-event.h"
16 #include "util/parse-events.h"
17
18 #include <libaudit.h>
19 #include <stdlib.h>
20 #include <sys/mman.h>
21 #include <linux/futex.h>
22
23 /* For older distros: */
24 #ifndef MAP_STACK
25 # define MAP_STACK              0x20000
26 #endif
27
28 #ifndef MADV_HWPOISON
29 # define MADV_HWPOISON          100
30 #endif
31
32 #ifndef MADV_MERGEABLE
33 # define MADV_MERGEABLE         12
34 #endif
35
36 #ifndef MADV_UNMERGEABLE
37 # define MADV_UNMERGEABLE       13
38 #endif
39
40 #ifndef EFD_SEMAPHORE
41 # define EFD_SEMAPHORE          1
42 #endif
43
44 #ifndef EFD_NONBLOCK
45 # define EFD_NONBLOCK           00004000
46 #endif
47
48 #ifndef EFD_CLOEXEC
49 # define EFD_CLOEXEC            02000000
50 #endif
51
52 #ifndef O_CLOEXEC
53 # define O_CLOEXEC              02000000
54 #endif
55
56 #ifndef SOCK_DCCP
57 # define SOCK_DCCP              6
58 #endif
59
60 #ifndef SOCK_CLOEXEC
61 # define SOCK_CLOEXEC           02000000
62 #endif
63
64 #ifndef SOCK_NONBLOCK
65 # define SOCK_NONBLOCK          00004000
66 #endif
67
68 #ifndef MSG_CMSG_CLOEXEC
69 # define MSG_CMSG_CLOEXEC       0x40000000
70 #endif
71
72 #ifndef PERF_FLAG_FD_NO_GROUP
73 # define PERF_FLAG_FD_NO_GROUP          (1UL << 0)
74 #endif
75
76 #ifndef PERF_FLAG_FD_OUTPUT
77 # define PERF_FLAG_FD_OUTPUT            (1UL << 1)
78 #endif
79
80 #ifndef PERF_FLAG_PID_CGROUP
81 # define PERF_FLAG_PID_CGROUP           (1UL << 2) /* pid=cgroup id, per-cpu mode only */
82 #endif
83
84 #ifndef PERF_FLAG_FD_CLOEXEC
85 # define PERF_FLAG_FD_CLOEXEC           (1UL << 3) /* O_CLOEXEC */
86 #endif
87
88
89 struct tp_field {
90         int offset;
91         union {
92                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
93                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
94         };
95 };
96
97 #define TP_UINT_FIELD(bits) \
98 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
99 { \
100         u##bits value; \
101         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
102         return value;  \
103 }
104
105 TP_UINT_FIELD(8);
106 TP_UINT_FIELD(16);
107 TP_UINT_FIELD(32);
108 TP_UINT_FIELD(64);
109
110 #define TP_UINT_FIELD__SWAPPED(bits) \
111 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
112 { \
113         u##bits value; \
114         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
115         return bswap_##bits(value);\
116 }
117
118 TP_UINT_FIELD__SWAPPED(16);
119 TP_UINT_FIELD__SWAPPED(32);
120 TP_UINT_FIELD__SWAPPED(64);
121
122 static int tp_field__init_uint(struct tp_field *field,
123                                struct format_field *format_field,
124                                bool needs_swap)
125 {
126         field->offset = format_field->offset;
127
128         switch (format_field->size) {
129         case 1:
130                 field->integer = tp_field__u8;
131                 break;
132         case 2:
133                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
134                 break;
135         case 4:
136                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
137                 break;
138         case 8:
139                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
140                 break;
141         default:
142                 return -1;
143         }
144
145         return 0;
146 }
147
148 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
149 {
150         return sample->raw_data + field->offset;
151 }
152
153 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
154 {
155         field->offset = format_field->offset;
156         field->pointer = tp_field__ptr;
157         return 0;
158 }
159
160 struct syscall_tp {
161         struct tp_field id;
162         union {
163                 struct tp_field args, ret;
164         };
165 };
166
167 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
168                                           struct tp_field *field,
169                                           const char *name)
170 {
171         struct format_field *format_field = perf_evsel__field(evsel, name);
172
173         if (format_field == NULL)
174                 return -1;
175
176         return tp_field__init_uint(field, format_field, evsel->needs_swap);
177 }
178
/*
 * Initialise the integer accessor for member @name of the struct syscall_tp
 * stored in @evsel->priv, looking the tracepoint field up under that same
 * name.  @evsel is parenthesised before being dereferenced so that any
 * pointer expression can be passed; note that it is evaluated twice.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_uint_field((evsel), &sc->name, #name); })
182
183 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
184                                          struct tp_field *field,
185                                          const char *name)
186 {
187         struct format_field *format_field = perf_evsel__field(evsel, name);
188
189         if (format_field == NULL)
190                 return -1;
191
192         return tp_field__init_ptr(field, format_field);
193 }
194
/*
 * Initialise the pointer accessor for member @name of the struct syscall_tp
 * stored in @evsel->priv, looking the tracepoint field up under that same
 * name.  @evsel is parenthesised before being dereferenced so that any
 * pointer expression can be passed; note that it is evaluated twice.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_ptr_field((evsel), &sc->name, #name); })
198
199 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
200 {
201         zfree(&evsel->priv);
202         perf_evsel__delete(evsel);
203 }
204
205 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
206 {
207         evsel->priv = malloc(sizeof(struct syscall_tp));
208         if (evsel->priv != NULL) {
209                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
210                         goto out_delete;
211
212                 evsel->handler = handler;
213                 return 0;
214         }
215
216         return -ENOMEM;
217
218 out_delete:
219         zfree(&evsel->priv);
220         return -ENOENT;
221 }
222
/*
 * Create the evsel for the syscall tracepoint named by @direction, trying
 * the "raw_syscalls" system first and "syscalls" second, then initialise its
 * syscall_tp private data with @handler.
 *
 * Returns the new evsel, or NULL if neither tracepoint could be created or
 * the initialisation failed (the evsel is deleted in the latter case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel;

	evsel = perf_evsel__newtp("raw_syscalls", direction);
	if (evsel == NULL) {
		/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
	}

	if (evsel == NULL)
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
242
/*
 * Read member @name of the struct syscall_tp stored in @evsel->priv from
 * @sample, through the integer reader installed for that member.
 * @evsel is parenthesised so that any pointer expression can be passed.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.integer(&fields->name, sample); })

/*
 * Same as perf_evsel__sc_tp_uint(), but goes through the pointer reader of
 * member @name.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.pointer(&fields->name, sample); })
250
251 struct syscall_arg {
252         unsigned long val;
253         struct thread *thread;
254         struct trace  *trace;
255         void          *parm;
256         u8            idx;
257         u8            mask;
258 };
259
/*
 * Table of names for a run of integer values: entries[0] names the value
 * @offset, entries[1] names @offset + 1, and so on for @nr_entries slots.
 */
struct strarray {
	int         offset;
	int         nr_entries;
	const char **entries;
};
265
/*
 * Define strarray__<array>, a struct strarray wrapping the string table
 * @array.  The offset is left at its zero default.
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}

/* As DEFINE_STRARRAY(), for tables whose first entry names value @off. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset     = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}
276
277 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
278                                                 const char *intfmt,
279                                                 struct syscall_arg *arg)
280 {
281         struct strarray *sa = arg->parm;
282         int idx = arg->val - sa->offset;
283
284         if (idx < 0 || idx >= sa->nr_entries)
285                 return scnprintf(bf, size, intfmt, arg->val);
286
287         return scnprintf(bf, size, "%s", sa->entries[idx]);
288 }
289
/* strarray beautifier whose numeric fallback is signed decimal ("%d"). */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	static const char decfmt[] = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, decfmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
297
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 *
 * strarray beautifier whose numeric fallback is alternate-form hex ("%#x").
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	static const char hexfmt[] = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, hexfmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
311
312 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
313                                         struct syscall_arg *arg);
314
315 #define SCA_FD syscall_arg__scnprintf_fd
316
317 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
318                                            struct syscall_arg *arg)
319 {
320         int fd = arg->val;
321
322         if (fd == AT_FDCWD)
323                 return scnprintf(bf, size, "CWD");
324
325         return syscall_arg__scnprintf_fd(bf, size, arg);
326 }
327
328 #define SCA_FDAT syscall_arg__scnprintf_fd_at
329
/* Forward declaration; the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
334
335 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
336                                          struct syscall_arg *arg)
337 {
338         return scnprintf(bf, size, "%#lx", arg->val);
339 }
340
341 #define SCA_HEX syscall_arg__scnprintf_hex
342
343 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
344                                          struct syscall_arg *arg)
345 {
346         return scnprintf(bf, size, "%d", arg->val);
347 }
348
349 #define SCA_INT syscall_arg__scnprintf_int
350
351 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
352                                                struct syscall_arg *arg)
353 {
354         int printed = 0, prot = arg->val;
355
356         if (prot == PROT_NONE)
357                 return scnprintf(bf, size, "NONE");
358 #define P_MMAP_PROT(n) \
359         if (prot & PROT_##n) { \
360                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
361                 prot &= ~PROT_##n; \
362         }
363
364         P_MMAP_PROT(EXEC);
365         P_MMAP_PROT(READ);
366         P_MMAP_PROT(WRITE);
367 #ifdef PROT_SEM
368         P_MMAP_PROT(SEM);
369 #endif
370         P_MMAP_PROT(GROWSDOWN);
371         P_MMAP_PROT(GROWSUP);
372 #undef P_MMAP_PROT
373
374         if (prot)
375                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
376
377         return printed;
378 }
379
380 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
381
382 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
383                                                 struct syscall_arg *arg)
384 {
385         int printed = 0, flags = arg->val;
386
387 #define P_MMAP_FLAG(n) \
388         if (flags & MAP_##n) { \
389                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
390                 flags &= ~MAP_##n; \
391         }
392
393         P_MMAP_FLAG(SHARED);
394         P_MMAP_FLAG(PRIVATE);
395 #ifdef MAP_32BIT
396         P_MMAP_FLAG(32BIT);
397 #endif
398         P_MMAP_FLAG(ANONYMOUS);
399         P_MMAP_FLAG(DENYWRITE);
400         P_MMAP_FLAG(EXECUTABLE);
401         P_MMAP_FLAG(FILE);
402         P_MMAP_FLAG(FIXED);
403         P_MMAP_FLAG(GROWSDOWN);
404 #ifdef MAP_HUGETLB
405         P_MMAP_FLAG(HUGETLB);
406 #endif
407         P_MMAP_FLAG(LOCKED);
408         P_MMAP_FLAG(NONBLOCK);
409         P_MMAP_FLAG(NORESERVE);
410         P_MMAP_FLAG(POPULATE);
411         P_MMAP_FLAG(STACK);
412 #ifdef MAP_UNINITIALIZED
413         P_MMAP_FLAG(UNINITIALIZED);
414 #endif
415 #undef P_MMAP_FLAG
416
417         if (flags)
418                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
419
420         return printed;
421 }
422
423 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
424
425 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
426                                                   struct syscall_arg *arg)
427 {
428         int printed = 0, flags = arg->val;
429
430 #define P_MREMAP_FLAG(n) \
431         if (flags & MREMAP_##n) { \
432                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
433                 flags &= ~MREMAP_##n; \
434         }
435
436         P_MREMAP_FLAG(MAYMOVE);
437 #ifdef MREMAP_FIXED
438         P_MREMAP_FLAG(FIXED);
439 #endif
440 #undef P_MREMAP_FLAG
441
442         if (flags)
443                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
444
445         return printed;
446 }
447
448 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
449
450 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
451                                                       struct syscall_arg *arg)
452 {
453         int behavior = arg->val;
454
455         switch (behavior) {
456 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
457         P_MADV_BHV(NORMAL);
458         P_MADV_BHV(RANDOM);
459         P_MADV_BHV(SEQUENTIAL);
460         P_MADV_BHV(WILLNEED);
461         P_MADV_BHV(DONTNEED);
462         P_MADV_BHV(REMOVE);
463         P_MADV_BHV(DONTFORK);
464         P_MADV_BHV(DOFORK);
465         P_MADV_BHV(HWPOISON);
466 #ifdef MADV_SOFT_OFFLINE
467         P_MADV_BHV(SOFT_OFFLINE);
468 #endif
469         P_MADV_BHV(MERGEABLE);
470         P_MADV_BHV(UNMERGEABLE);
471 #ifdef MADV_HUGEPAGE
472         P_MADV_BHV(HUGEPAGE);
473 #endif
474 #ifdef MADV_NOHUGEPAGE
475         P_MADV_BHV(NOHUGEPAGE);
476 #endif
477 #ifdef MADV_DONTDUMP
478         P_MADV_BHV(DONTDUMP);
479 #endif
480 #ifdef MADV_DODUMP
481         P_MADV_BHV(DODUMP);
482 #endif
483 #undef P_MADV_PHV
484         default: break;
485         }
486
487         return scnprintf(bf, size, "%#x", behavior);
488 }
489
490 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
491
492 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
493                                            struct syscall_arg *arg)
494 {
495         int printed = 0, op = arg->val;
496
497         if (op == 0)
498                 return scnprintf(bf, size, "NONE");
499 #define P_CMD(cmd) \
500         if ((op & LOCK_##cmd) == LOCK_##cmd) { \
501                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
502                 op &= ~LOCK_##cmd; \
503         }
504
505         P_CMD(SH);
506         P_CMD(EX);
507         P_CMD(NB);
508         P_CMD(UN);
509         P_CMD(MAND);
510         P_CMD(RW);
511         P_CMD(READ);
512         P_CMD(WRITE);
513 #undef P_OP
514
515         if (op)
516                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
517
518         return printed;
519 }
520
521 #define SCA_FLOCK syscall_arg__scnprintf_flock
522
523 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
524 {
525         enum syscall_futex_args {
526                 SCF_UADDR   = (1 << 0),
527                 SCF_OP      = (1 << 1),
528                 SCF_VAL     = (1 << 2),
529                 SCF_TIMEOUT = (1 << 3),
530                 SCF_UADDR2  = (1 << 4),
531                 SCF_VAL3    = (1 << 5),
532         };
533         int op = arg->val;
534         int cmd = op & FUTEX_CMD_MASK;
535         size_t printed = 0;
536
537         switch (cmd) {
538 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
539         P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
540         P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
541         P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
542         P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
543         P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
544         P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
545         P_FUTEX_OP(WAKE_OP);                                                      break;
546         P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
547         P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
548         P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
549         P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
550         P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
551         P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
552         default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
553         }
554
555         if (op & FUTEX_PRIVATE_FLAG)
556                 printed += scnprintf(bf + printed, size - printed, "|PRIV");
557
558         if (op & FUTEX_CLOCK_REALTIME)
559                 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
560
561         return printed;
562 }
563
564 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
565
566 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
567 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
568
569 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
570 static DEFINE_STRARRAY(itimers);
571
572 static const char *whences[] = { "SET", "CUR", "END",
573 #ifdef SEEK_DATA
574 "DATA",
575 #endif
576 #ifdef SEEK_HOLE
577 "HOLE",
578 #endif
579 };
580 static DEFINE_STRARRAY(whences);
581
582 static const char *fcntl_cmds[] = {
583         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
584         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
585         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
586         "F_GETOWNER_UIDS",
587 };
588 static DEFINE_STRARRAY(fcntl_cmds);
589
590 static const char *rlimit_resources[] = {
591         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
592         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
593         "RTTIME",
594 };
595 static DEFINE_STRARRAY(rlimit_resources);
596
597 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
598 static DEFINE_STRARRAY(sighow);
599
600 static const char *clockid[] = {
601         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
602         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
603 };
604 static DEFINE_STRARRAY(clockid);
605
/*
 * Address family names, indexed by the AF_* value (the AF_ prefix is left
 * out).  Every slot up to the last one must be present, since the table is
 * looked up by position: "MPLS" (AF_MPLS = 28) sits between "IB" (27) and
 * "CAN" (29); leaving it out shifts every later name down by one.
 */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS", "CAN",
	"TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154",
	"CAIF", "ALG", "NFC", "VSOCK",
};
614 static DEFINE_STRARRAY(socket_families);
615
616 #ifndef SOCK_TYPE_MASK
617 #define SOCK_TYPE_MASK 0xf
618 #endif
619
620 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
621                                                       struct syscall_arg *arg)
622 {
623         size_t printed;
624         int type = arg->val,
625             flags = type & ~SOCK_TYPE_MASK;
626
627         type &= SOCK_TYPE_MASK;
628         /*
629          * Can't use a strarray, MIPS may override for ABI reasons.
630          */
631         switch (type) {
632 #define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
633         P_SK_TYPE(STREAM);
634         P_SK_TYPE(DGRAM);
635         P_SK_TYPE(RAW);
636         P_SK_TYPE(RDM);
637         P_SK_TYPE(SEQPACKET);
638         P_SK_TYPE(DCCP);
639         P_SK_TYPE(PACKET);
640 #undef P_SK_TYPE
641         default:
642                 printed = scnprintf(bf, size, "%#x", type);
643         }
644
645 #define P_SK_FLAG(n) \
646         if (flags & SOCK_##n) { \
647                 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
648                 flags &= ~SOCK_##n; \
649         }
650
651         P_SK_FLAG(CLOEXEC);
652         P_SK_FLAG(NONBLOCK);
653 #undef P_SK_FLAG
654
655         if (flags)
656                 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
657
658         return printed;
659 }
660
661 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
662
663 #ifndef MSG_PROBE
664 #define MSG_PROBE            0x10
665 #endif
666 #ifndef MSG_WAITFORONE
667 #define MSG_WAITFORONE  0x10000
668 #endif
669 #ifndef MSG_SENDPAGE_NOTLAST
670 #define MSG_SENDPAGE_NOTLAST 0x20000
671 #endif
672 #ifndef MSG_FASTOPEN
673 #define MSG_FASTOPEN         0x20000000
674 #endif
675
676 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
677                                                struct syscall_arg *arg)
678 {
679         int printed = 0, flags = arg->val;
680
681         if (flags == 0)
682                 return scnprintf(bf, size, "NONE");
683 #define P_MSG_FLAG(n) \
684         if (flags & MSG_##n) { \
685                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
686                 flags &= ~MSG_##n; \
687         }
688
689         P_MSG_FLAG(OOB);
690         P_MSG_FLAG(PEEK);
691         P_MSG_FLAG(DONTROUTE);
692         P_MSG_FLAG(TRYHARD);
693         P_MSG_FLAG(CTRUNC);
694         P_MSG_FLAG(PROBE);
695         P_MSG_FLAG(TRUNC);
696         P_MSG_FLAG(DONTWAIT);
697         P_MSG_FLAG(EOR);
698         P_MSG_FLAG(WAITALL);
699         P_MSG_FLAG(FIN);
700         P_MSG_FLAG(SYN);
701         P_MSG_FLAG(CONFIRM);
702         P_MSG_FLAG(RST);
703         P_MSG_FLAG(ERRQUEUE);
704         P_MSG_FLAG(NOSIGNAL);
705         P_MSG_FLAG(MORE);
706         P_MSG_FLAG(WAITFORONE);
707         P_MSG_FLAG(SENDPAGE_NOTLAST);
708         P_MSG_FLAG(FASTOPEN);
709         P_MSG_FLAG(CMSG_CLOEXEC);
710 #undef P_MSG_FLAG
711
712         if (flags)
713                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
714
715         return printed;
716 }
717
718 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
719
720 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
721                                                  struct syscall_arg *arg)
722 {
723         size_t printed = 0;
724         int mode = arg->val;
725
726         if (mode == F_OK) /* 0 */
727                 return scnprintf(bf, size, "F");
728 #define P_MODE(n) \
729         if (mode & n##_OK) { \
730                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
731                 mode &= ~n##_OK; \
732         }
733
734         P_MODE(R);
735         P_MODE(W);
736         P_MODE(X);
737 #undef P_MODE
738
739         if (mode)
740                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
741
742         return printed;
743 }
744
745 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
746
747 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
748                                                struct syscall_arg *arg)
749 {
750         int printed = 0, flags = arg->val;
751
752         if (!(flags & O_CREAT))
753                 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
754
755         if (flags == 0)
756                 return scnprintf(bf, size, "RDONLY");
757 #define P_FLAG(n) \
758         if (flags & O_##n) { \
759                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
760                 flags &= ~O_##n; \
761         }
762
763         P_FLAG(APPEND);
764         P_FLAG(ASYNC);
765         P_FLAG(CLOEXEC);
766         P_FLAG(CREAT);
767         P_FLAG(DIRECT);
768         P_FLAG(DIRECTORY);
769         P_FLAG(EXCL);
770         P_FLAG(LARGEFILE);
771         P_FLAG(NOATIME);
772         P_FLAG(NOCTTY);
773 #ifdef O_NONBLOCK
774         P_FLAG(NONBLOCK);
775 #elif O_NDELAY
776         P_FLAG(NDELAY);
777 #endif
778 #ifdef O_PATH
779         P_FLAG(PATH);
780 #endif
781         P_FLAG(RDWR);
782 #ifdef O_DSYNC
783         if ((flags & O_SYNC) == O_SYNC)
784                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
785         else {
786                 P_FLAG(DSYNC);
787         }
788 #else
789         P_FLAG(SYNC);
790 #endif
791         P_FLAG(TRUNC);
792         P_FLAG(WRONLY);
793 #undef P_FLAG
794
795         if (flags)
796                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
797
798         return printed;
799 }
800
801 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
802
803 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
804                                                 struct syscall_arg *arg)
805 {
806         int printed = 0, flags = arg->val;
807
808         if (flags == 0)
809                 return 0;
810
811 #define P_FLAG(n) \
812         if (flags & PERF_FLAG_##n) { \
813                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
814                 flags &= ~PERF_FLAG_##n; \
815         }
816
817         P_FLAG(FD_NO_GROUP);
818         P_FLAG(FD_OUTPUT);
819         P_FLAG(PID_CGROUP);
820         P_FLAG(FD_CLOEXEC);
821 #undef P_FLAG
822
823         if (flags)
824                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
825
826         return printed;
827 }
828
829 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
830
831 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
832                                                    struct syscall_arg *arg)
833 {
834         int printed = 0, flags = arg->val;
835
836         if (flags == 0)
837                 return scnprintf(bf, size, "NONE");
838 #define P_FLAG(n) \
839         if (flags & EFD_##n) { \
840                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
841                 flags &= ~EFD_##n; \
842         }
843
844         P_FLAG(SEMAPHORE);
845         P_FLAG(CLOEXEC);
846         P_FLAG(NONBLOCK);
847 #undef P_FLAG
848
849         if (flags)
850                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
851
852         return printed;
853 }
854
855 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
856
857 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
858                                                 struct syscall_arg *arg)
859 {
860         int printed = 0, flags = arg->val;
861
862 #define P_FLAG(n) \
863         if (flags & O_##n) { \
864                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
865                 flags &= ~O_##n; \
866         }
867
868         P_FLAG(CLOEXEC);
869         P_FLAG(NONBLOCK);
870 #undef P_FLAG
871
872         if (flags)
873                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
874
875         return printed;
876 }
877
878 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
879
880 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
881 {
882         int sig = arg->val;
883
884         switch (sig) {
885 #define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
886         P_SIGNUM(HUP);
887         P_SIGNUM(INT);
888         P_SIGNUM(QUIT);
889         P_SIGNUM(ILL);
890         P_SIGNUM(TRAP);
891         P_SIGNUM(ABRT);
892         P_SIGNUM(BUS);
893         P_SIGNUM(FPE);
894         P_SIGNUM(KILL);
895         P_SIGNUM(USR1);
896         P_SIGNUM(SEGV);
897         P_SIGNUM(USR2);
898         P_SIGNUM(PIPE);
899         P_SIGNUM(ALRM);
900         P_SIGNUM(TERM);
901         P_SIGNUM(CHLD);
902         P_SIGNUM(CONT);
903         P_SIGNUM(STOP);
904         P_SIGNUM(TSTP);
905         P_SIGNUM(TTIN);
906         P_SIGNUM(TTOU);
907         P_SIGNUM(URG);
908         P_SIGNUM(XCPU);
909         P_SIGNUM(XFSZ);
910         P_SIGNUM(VTALRM);
911         P_SIGNUM(PROF);
912         P_SIGNUM(WINCH);
913         P_SIGNUM(IO);
914         P_SIGNUM(PWR);
915         P_SIGNUM(SYS);
916 #ifdef SIGEMT
917         P_SIGNUM(EMT);
918 #endif
919 #ifdef SIGSTKFLT
920         P_SIGNUM(STKFLT);
921 #endif
922 #ifdef SIGSWI
923         P_SIGNUM(SWI);
924 #endif
925         default: break;
926         }
927
928         return scnprintf(bf, size, "%#x", sig);
929 }
930
931 #define SCA_SIGNUM syscall_arg__scnprintf_signum
932
933 #if defined(__i386__) || defined(__x86_64__)
934 /*
935  * FIXME: Make this available to all arches.
936  */
937 #define TCGETS          0x5401
938
/*
 * Names of the tty ioctl commands.
 *
 * The strarray built on this table uses 0x5401 (TCGETS) as its offset, so
 * slot i names command 0x5401 + i.  The designators are therefore written as
 * "command - 0x5401"; using the bare low byte of the command (0x27, 0x50,
 * 0x60) would place those runs one slot too high.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP",
	[0x5427 - 0x5401] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x5450 - 0x5401] = "FIONCLEX",
	"FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT",
	[0x5460 - 0x5401] = "FIOQSIZE",
};
956
957 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
958 #endif /* defined(__i386__) || defined(__x86_64__) */
959
/*
 * Initializer fragment for a struct syscall_fmt entry: argument number @arg
 * is printed through SCA_STRARRAY using the table strarray__<array>.
 * @name is not used in the expansion; it only labels the argument at the
 * point of use.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
963
964 static struct syscall_fmt {
965         const char *name;
966         const char *alias;
967         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
968         void       *arg_parm[6];
969         bool       errmsg;
970         bool       timeout;
971         bool       hexret;
972 } syscall_fmts[] = {
973         { .name     = "access",     .errmsg = true,
974           .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
975         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
976         { .name     = "brk",        .hexret = true,
977           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
978         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
979         { .name     = "close",      .errmsg = true,
980           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
981         { .name     = "connect",    .errmsg = true, },
982         { .name     = "dup",        .errmsg = true,
983           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
984         { .name     = "dup2",       .errmsg = true,
985           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
986         { .name     = "dup3",       .errmsg = true,
987           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
988         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
989         { .name     = "eventfd2",   .errmsg = true,
990           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
991         { .name     = "faccessat",  .errmsg = true,
992           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
993         { .name     = "fadvise64",  .errmsg = true,
994           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
995         { .name     = "fallocate",  .errmsg = true,
996           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
997         { .name     = "fchdir",     .errmsg = true,
998           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
999         { .name     = "fchmod",     .errmsg = true,
1000           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1001         { .name     = "fchmodat",   .errmsg = true,
1002           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1003         { .name     = "fchown",     .errmsg = true,
1004           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1005         { .name     = "fchownat",   .errmsg = true,
1006           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1007         { .name     = "fcntl",      .errmsg = true,
1008           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1009                              [1] = SCA_STRARRAY, /* cmd */ },
1010           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1011         { .name     = "fdatasync",  .errmsg = true,
1012           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1013         { .name     = "flock",      .errmsg = true,
1014           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1015                              [1] = SCA_FLOCK, /* cmd */ }, },
1016         { .name     = "fsetxattr",  .errmsg = true,
1017           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1018         { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
1019           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1020         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
1021           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1022         { .name     = "fstatfs",    .errmsg = true,
1023           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1024         { .name     = "fsync",    .errmsg = true,
1025           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1026         { .name     = "ftruncate", .errmsg = true,
1027           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1028         { .name     = "futex",      .errmsg = true,
1029           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1030         { .name     = "futimesat", .errmsg = true,
1031           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1032         { .name     = "getdents",   .errmsg = true,
1033           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1034         { .name     = "getdents64", .errmsg = true,
1035           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1036         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1037         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1038         { .name     = "ioctl",      .errmsg = true,
1039           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1040 #if defined(__i386__) || defined(__x86_64__)
1041 /*
1042  * FIXME: Make this available to all arches.
1043  */
1044                              [1] = SCA_STRHEXARRAY, /* cmd */
1045                              [2] = SCA_HEX, /* arg */ },
1046           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
1047 #else
1048                              [2] = SCA_HEX, /* arg */ }, },
1049 #endif
1050         { .name     = "kill",       .errmsg = true,
1051           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1052         { .name     = "linkat",     .errmsg = true,
1053           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1054         { .name     = "lseek",      .errmsg = true,
1055           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1056                              [2] = SCA_STRARRAY, /* whence */ },
1057           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
1058         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
1059         { .name     = "madvise",    .errmsg = true,
1060           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
1061                              [2] = SCA_MADV_BHV, /* behavior */ }, },
1062         { .name     = "mkdirat",    .errmsg = true,
1063           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1064         { .name     = "mknodat",    .errmsg = true,
1065           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1066         { .name     = "mlock",      .errmsg = true,
1067           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1068         { .name     = "mlockall",   .errmsg = true,
1069           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1070         { .name     = "mmap",       .hexret = true,
1071           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
1072                              [2] = SCA_MMAP_PROT, /* prot */
1073                              [3] = SCA_MMAP_FLAGS, /* flags */
1074                              [4] = SCA_FD,        /* fd */ }, },
1075         { .name     = "mprotect",   .errmsg = true,
1076           .arg_scnprintf = { [0] = SCA_HEX, /* start */
1077                              [2] = SCA_MMAP_PROT, /* prot */ }, },
1078         { .name     = "mremap",     .hexret = true,
1079           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1080                              [3] = SCA_MREMAP_FLAGS, /* flags */
1081                              [4] = SCA_HEX, /* new_addr */ }, },
1082         { .name     = "munlock",    .errmsg = true,
1083           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1084         { .name     = "munmap",     .errmsg = true,
1085           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1086         { .name     = "name_to_handle_at", .errmsg = true,
1087           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1088         { .name     = "newfstatat", .errmsg = true,
1089           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1090         { .name     = "open",       .errmsg = true,
1091           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1092         { .name     = "open_by_handle_at", .errmsg = true,
1093           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1094                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1095         { .name     = "openat",     .errmsg = true,
1096           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1097                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1098         { .name     = "perf_event_open", .errmsg = true,
1099           .arg_scnprintf = { [1] = SCA_INT, /* pid */
1100                              [2] = SCA_INT, /* cpu */
1101                              [3] = SCA_FD,  /* group_fd */
1102                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1103         { .name     = "pipe2",      .errmsg = true,
1104           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1105         { .name     = "poll",       .errmsg = true, .timeout = true, },
1106         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
1107         { .name     = "pread",      .errmsg = true, .alias = "pread64",
1108           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1109         { .name     = "preadv",     .errmsg = true, .alias = "pread",
1110           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1111         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1112         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
1113           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1114         { .name     = "pwritev",    .errmsg = true,
1115           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1116         { .name     = "read",       .errmsg = true,
1117           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1118         { .name     = "readlinkat", .errmsg = true,
1119           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1120         { .name     = "readv",      .errmsg = true,
1121           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1122         { .name     = "recvfrom",   .errmsg = true,
1123           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1124         { .name     = "recvmmsg",   .errmsg = true,
1125           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1126         { .name     = "recvmsg",    .errmsg = true,
1127           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1128         { .name     = "renameat",   .errmsg = true,
1129           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1130         { .name     = "rt_sigaction", .errmsg = true,
1131           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1132         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1133         { .name     = "rt_sigqueueinfo", .errmsg = true,
1134           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1135         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
1136           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1137         { .name     = "select",     .errmsg = true, .timeout = true, },
1138         { .name     = "sendmmsg",    .errmsg = true,
1139           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1140         { .name     = "sendmsg",    .errmsg = true,
1141           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1142         { .name     = "sendto",     .errmsg = true,
1143           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1144         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1145         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1146         { .name     = "shutdown",   .errmsg = true,
1147           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1148         { .name     = "socket",     .errmsg = true,
1149           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1150                              [1] = SCA_SK_TYPE, /* type */ },
1151           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1152         { .name     = "socketpair", .errmsg = true,
1153           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1154                              [1] = SCA_SK_TYPE, /* type */ },
1155           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1156         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
1157         { .name     = "symlinkat",  .errmsg = true,
1158           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1159         { .name     = "tgkill",     .errmsg = true,
1160           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1161         { .name     = "tkill",      .errmsg = true,
1162           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1163         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
1164         { .name     = "unlinkat",   .errmsg = true,
1165           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1166         { .name     = "utimensat",  .errmsg = true,
1167           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
1168         { .name     = "write",      .errmsg = true,
1169           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1170         { .name     = "writev",     .errmsg = true,
1171           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1172 };
1173
1174 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1175 {
1176         const struct syscall_fmt *fmt = fmtp;
1177         return strcmp(name, fmt->name);
1178 }
1179
1180 static struct syscall_fmt *syscall_fmt__find(const char *name)
1181 {
1182         const int nmemb = ARRAY_SIZE(syscall_fmts);
1183         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1184 }
1185
/*
 * Per-syscall information, one slot per syscall id in trace->syscalls.table.
 * Filled in by trace__read_syscall_info().
 */
struct syscall {
	/* format of the "syscalls:sys_enter_<name>" tracepoint */
	struct event_format *tp_format;
	/* number of fields in 'args' (the leading "nr" field is not counted) */
	int		    nr_args;
	/* first argument field of tp_format, with any "nr" field skipped */
	struct format_field *args;
	/* syscall name as returned by audit_syscall_to_name() */
	const char	    *name;
	/* true for "exit" and "exit_group" */
	bool		    is_exit;
	/* matching syscall_fmts[] entry, NULL if the syscall has none */
	struct syscall_fmt  *fmt;
	/* per-argument formatter, nr_args entries; NULL entry = no formatter */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* per-argument formatter parameter; points into fmt, NULL without fmt */
	void		    **arg_parm;
};
1196
1197 static size_t fprintf_duration(unsigned long t, FILE *fp)
1198 {
1199         double duration = (double)t / NSEC_PER_MSEC;
1200         size_t printed = fprintf(fp, "(");
1201
1202         if (duration >= 1.0)
1203                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1204         else if (duration >= 0.01)
1205                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1206         else
1207                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1208         return printed + fprintf(fp, "): ");
1209 }
1210
1211 struct thread_trace {
1212         u64               entry_time;
1213         u64               exit_time;
1214         bool              entry_pending;
1215         unsigned long     nr_events;
1216         unsigned long     pfmaj, pfmin;
1217         char              *entry_str;
1218         double            runtime_ms;
1219         struct {
1220                 int       max;
1221                 char      **table;
1222         } paths;
1223
1224         struct intlist *syscall_stats;
1225 };
1226
1227 static struct thread_trace *thread_trace__new(void)
1228 {
1229         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1230
1231         if (ttrace)
1232                 ttrace->paths.max = -1;
1233
1234         ttrace->syscall_stats = intlist__new(NULL);
1235
1236         return ttrace;
1237 }
1238
1239 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1240 {
1241         struct thread_trace *ttrace;
1242
1243         if (thread == NULL)
1244                 goto fail;
1245
1246         if (thread__priv(thread) == NULL)
1247                 thread__set_priv(thread, thread_trace__new());
1248
1249         if (thread__priv(thread) == NULL)
1250                 goto fail;
1251
1252         ttrace = thread__priv(thread);
1253         ++ttrace->nr_events;
1254
1255         return ttrace;
1256 fail:
1257         color_fprintf(fp, PERF_COLOR_RED,
1258                       "WARNING: not enough memory, dropping samples!\n");
1259         return NULL;
1260 }
1261
1262 #define TRACE_PFMAJ             (1 << 0)
1263 #define TRACE_PFMIN             (1 << 1)
1264
1265 struct trace {
1266         struct perf_tool        tool;
1267         struct {
1268                 int             machine;
1269                 int             open_id;
1270         }                       audit;
1271         struct {
1272                 int             max;
1273                 struct syscall  *table;
1274                 struct {
1275                         struct perf_evsel *sys_enter,
1276                                           *sys_exit;
1277                 }               events;
1278         } syscalls;
1279         struct record_opts      opts;
1280         struct perf_evlist      *evlist;
1281         struct machine          *host;
1282         struct thread           *current;
1283         u64                     base_time;
1284         FILE                    *output;
1285         unsigned long           nr_events;
1286         struct strlist          *ev_qualifier;
1287         struct {
1288                 size_t          nr;
1289                 int             *entries;
1290         }                       ev_qualifier_ids;
1291         const char              *last_vfs_getname;
1292         struct intlist          *tid_list;
1293         struct intlist          *pid_list;
1294         struct {
1295                 size_t          nr;
1296                 pid_t           *entries;
1297         }                       filter_pids;
1298         double                  duration_filter;
1299         double                  runtime_ms;
1300         struct {
1301                 u64             vfs_getname,
1302                                 proc_getname;
1303         } stats;
1304         bool                    not_ev_qualifier;
1305         bool                    live;
1306         bool                    full_time;
1307         bool                    sched;
1308         bool                    multiple_threads;
1309         bool                    summary;
1310         bool                    summary_only;
1311         bool                    show_comm;
1312         bool                    show_tool_stats;
1313         bool                    trace_syscalls;
1314         bool                    force;
1315         int                     trace_pgfaults;
1316 };
1317
1318 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1319 {
1320         struct thread_trace *ttrace = thread__priv(thread);
1321
1322         if (fd > ttrace->paths.max) {
1323                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1324
1325                 if (npath == NULL)
1326                         return -1;
1327
1328                 if (ttrace->paths.max != -1) {
1329                         memset(npath + ttrace->paths.max + 1, 0,
1330                                (fd - ttrace->paths.max) * sizeof(char *));
1331                 } else {
1332                         memset(npath, 0, (fd + 1) * sizeof(char *));
1333                 }
1334
1335                 ttrace->paths.table = npath;
1336                 ttrace->paths.max   = fd;
1337         }
1338
1339         ttrace->paths.table[fd] = strdup(pathname);
1340
1341         return ttrace->paths.table[fd] != NULL ? 0 : -1;
1342 }
1343
1344 static int thread__read_fd_path(struct thread *thread, int fd)
1345 {
1346         char linkname[PATH_MAX], pathname[PATH_MAX];
1347         struct stat st;
1348         int ret;
1349
1350         if (thread->pid_ == thread->tid) {
1351                 scnprintf(linkname, sizeof(linkname),
1352                           "/proc/%d/fd/%d", thread->pid_, fd);
1353         } else {
1354                 scnprintf(linkname, sizeof(linkname),
1355                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1356         }
1357
1358         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1359                 return -1;
1360
1361         ret = readlink(linkname, pathname, sizeof(pathname));
1362
1363         if (ret < 0 || ret > st.st_size)
1364                 return -1;
1365
1366         pathname[ret] = '\0';
1367         return trace__set_fd_pathname(thread, fd, pathname);
1368 }
1369
1370 static const char *thread__fd_path(struct thread *thread, int fd,
1371                                    struct trace *trace)
1372 {
1373         struct thread_trace *ttrace = thread__priv(thread);
1374
1375         if (ttrace == NULL)
1376                 return NULL;
1377
1378         if (fd < 0)
1379                 return NULL;
1380
1381         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1382                 if (!trace->live)
1383                         return NULL;
1384                 ++trace->stats.proc_getname;
1385                 if (thread__read_fd_path(thread, fd))
1386                         return NULL;
1387         }
1388
1389         return ttrace->paths.table[fd];
1390 }
1391
1392 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1393                                         struct syscall_arg *arg)
1394 {
1395         int fd = arg->val;
1396         size_t printed = scnprintf(bf, size, "%d", fd);
1397         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1398
1399         if (path)
1400                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1401
1402         return printed;
1403 }
1404
1405 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1406                                               struct syscall_arg *arg)
1407 {
1408         int fd = arg->val;
1409         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1410         struct thread_trace *ttrace = thread__priv(arg->thread);
1411
1412         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1413                 zfree(&ttrace->paths.table[fd]);
1414
1415         return printed;
1416 }
1417
1418 static bool trace__filter_duration(struct trace *trace, double t)
1419 {
1420         return t < (trace->duration_filter * NSEC_PER_MSEC);
1421 }
1422
1423 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1424 {
1425         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1426
1427         return fprintf(fp, "%10.3f ", ts);
1428 }
1429
/*
 * Flags set from sig_handler().  They are written asynchronously from a
 * signal handler, so they are volatile sig_atomic_t: that is the object
 * type the C standard allows a handler to store to, and volatile keeps the
 * compiler from caching the value across the loops that poll them.
 */
static volatile sig_atomic_t done = 0;
static volatile sig_atomic_t interrupted = 0;

/*
 * Signal handler: request termination, and record whether the request came
 * from SIGINT.
 */
static void sig_handler(int sig)
{
	done = 1;
	interrupted = (sig == SIGINT);
}
1438
1439 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1440                                         u64 duration, u64 tstamp, FILE *fp)
1441 {
1442         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1443         printed += fprintf_duration(duration, fp);
1444
1445         if (trace->multiple_threads) {
1446                 if (trace->show_comm)
1447                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1448                 printed += fprintf(fp, "%d ", thread->tid);
1449         }
1450
1451         return printed;
1452 }
1453
1454 static int trace__process_event(struct trace *trace, struct machine *machine,
1455                                 union perf_event *event, struct perf_sample *sample)
1456 {
1457         int ret = 0;
1458
1459         switch (event->header.type) {
1460         case PERF_RECORD_LOST:
1461                 color_fprintf(trace->output, PERF_COLOR_RED,
1462                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1463                 ret = machine__process_lost_event(machine, event, sample);
1464         default:
1465                 ret = machine__process_event(machine, event, sample);
1466                 break;
1467         }
1468
1469         return ret;
1470 }
1471
1472 static int trace__tool_process(struct perf_tool *tool,
1473                                union perf_event *event,
1474                                struct perf_sample *sample,
1475                                struct machine *machine)
1476 {
1477         struct trace *trace = container_of(tool, struct trace, tool);
1478         return trace__process_event(trace, machine, event, sample);
1479 }
1480
1481 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1482 {
1483         int err = symbol__init(NULL);
1484
1485         if (err)
1486                 return err;
1487
1488         trace->host = machine__new_host();
1489         if (trace->host == NULL)
1490                 return -ENOMEM;
1491
1492         if (trace_event__register_resolver(trace->host) < 0)
1493                 return -errno;
1494
1495         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1496                                             evlist->threads, trace__tool_process, false,
1497                                             trace->opts.proc_map_timeout);
1498         if (err)
1499                 symbol__exit();
1500
1501         return err;
1502 }
1503
1504 static int syscall__set_arg_fmts(struct syscall *sc)
1505 {
1506         struct format_field *field;
1507         int idx = 0;
1508
1509         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1510         if (sc->arg_scnprintf == NULL)
1511                 return -1;
1512
1513         if (sc->fmt)
1514                 sc->arg_parm = sc->fmt->arg_parm;
1515
1516         for (field = sc->args; field; field = field->next) {
1517                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1518                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1519                 else if (field->flags & FIELD_IS_POINTER)
1520                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1521                 ++idx;
1522         }
1523
1524         return 0;
1525 }
1526
1527 static int trace__read_syscall_info(struct trace *trace, int id)
1528 {
1529         char tp_name[128];
1530         struct syscall *sc;
1531         const char *name = audit_syscall_to_name(id, trace->audit.machine);
1532
1533         if (name == NULL)
1534                 return -1;
1535
1536         if (id > trace->syscalls.max) {
1537                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1538
1539                 if (nsyscalls == NULL)
1540                         return -1;
1541
1542                 if (trace->syscalls.max != -1) {
1543                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1544                                (id - trace->syscalls.max) * sizeof(*sc));
1545                 } else {
1546                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1547                 }
1548
1549                 trace->syscalls.table = nsyscalls;
1550                 trace->syscalls.max   = id;
1551         }
1552
1553         sc = trace->syscalls.table + id;
1554         sc->name = name;
1555
1556         sc->fmt  = syscall_fmt__find(sc->name);
1557
1558         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1559         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1560
1561         if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1562                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1563                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1564         }
1565
1566         if (sc->tp_format == NULL)
1567                 return -1;
1568
1569         sc->args = sc->tp_format->format.fields;
1570         sc->nr_args = sc->tp_format->format.nr_fields;
1571         /* drop nr field - not relevant here; does not exist on older kernels */
1572         if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1573                 sc->args = sc->args->next;
1574                 --sc->nr_args;
1575         }
1576
1577         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1578
1579         return syscall__set_arg_fmts(sc);
1580 }
1581
1582 static int trace__validate_ev_qualifier(struct trace *trace)
1583 {
1584         int err = 0, i;
1585         struct str_node *pos;
1586
1587         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1588         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1589                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1590
1591         if (trace->ev_qualifier_ids.entries == NULL) {
1592                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1593                        trace->output);
1594                 err = -EINVAL;
1595                 goto out;
1596         }
1597
1598         i = 0;
1599
1600         strlist__for_each(pos, trace->ev_qualifier) {
1601                 const char *sc = pos->s;
1602                 int id = audit_name_to_syscall(sc, trace->audit.machine);
1603
1604                 if (id < 0) {
1605                         if (err == 0) {
1606                                 fputs("Error:\tInvalid syscall ", trace->output);
1607                                 err = -EINVAL;
1608                         } else {
1609                                 fputs(", ", trace->output);
1610                         }
1611
1612                         fputs(sc, trace->output);
1613                 }
1614
1615                 trace->ev_qualifier_ids.entries[i++] = id;
1616         }
1617
1618         if (err < 0) {
1619                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1620                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1621                 zfree(&trace->ev_qualifier_ids.entries);
1622                 trace->ev_qualifier_ids.nr = 0;
1623         }
1624 out:
1625         return err;
1626 }
1627
1628 /*
1629  * args is to be interpreted as a series of longs but we need to handle
1630  * 8-byte unaligned accesses. args points to raw_data within the event
1631  * and raw_data is guaranteed to be 8-byte unaligned because it is
1632  * preceded by raw_size which is a u32. So we need to copy args to a temp
1633  * variable to read it. Most notably this avoids extended load instructions
1634  * on unaligned addresses
1635  */
1636
1637 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1638                                       unsigned char *args, struct trace *trace,
1639                                       struct thread *thread)
1640 {
1641         size_t printed = 0;
1642         unsigned char *p;
1643         unsigned long val;
1644
1645         if (sc->args != NULL) {
1646                 struct format_field *field;
1647                 u8 bit = 1;
1648                 struct syscall_arg arg = {
1649                         .idx    = 0,
1650                         .mask   = 0,
1651                         .trace  = trace,
1652                         .thread = thread,
1653                 };
1654
1655                 for (field = sc->args; field;
1656                      field = field->next, ++arg.idx, bit <<= 1) {
1657                         if (arg.mask & bit)
1658                                 continue;
1659
1660                         /* special care for unaligned accesses */
1661                         p = args + sizeof(unsigned long) * arg.idx;
1662                         memcpy(&val, p, sizeof(val));
1663
1664                         /*
1665                          * Suppress this argument if its value is zero and
1666                          * and we don't have a string associated in an
1667                          * strarray for it.
1668                          */
1669                         if (val == 0 &&
1670                             !(sc->arg_scnprintf &&
1671                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1672                               sc->arg_parm[arg.idx]))
1673                                 continue;
1674
1675                         printed += scnprintf(bf + printed, size - printed,
1676                                              "%s%s: ", printed ? ", " : "", field->name);
1677                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1678                                 arg.val = val;
1679                                 if (sc->arg_parm)
1680                                         arg.parm = sc->arg_parm[arg.idx];
1681                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1682                                                                       size - printed, &arg);
1683                         } else {
1684                                 printed += scnprintf(bf + printed, size - printed,
1685                                                      "%ld", val);
1686                         }
1687                 }
1688         } else {
1689                 int i = 0;
1690
1691                 while (i < 6) {
1692                         /* special care for unaligned accesses */
1693                         p = args + sizeof(unsigned long) * i;
1694                         memcpy(&val, p, sizeof(val));
1695                         printed += scnprintf(bf + printed, size - printed,
1696                                              "%sarg%d: %ld",
1697                                              printed ? ", " : "", i, val);
1698                         ++i;
1699                 }
1700         }
1701
1702         return printed;
1703 }
1704
/*
 * Signature of the per-event callbacks stored in evsel->handler and invoked
 * for each sample (e.g. trace__sys_enter, trace__sys_exit, trace__pgfault).
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
                                  union perf_event *event,
                                  struct perf_sample *sample);
1708
1709 static struct syscall *trace__syscall_info(struct trace *trace,
1710                                            struct perf_evsel *evsel, int id)
1711 {
1712
1713         if (id < 0) {
1714
1715                 /*
1716                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1717                  * before that, leaving at a higher verbosity level till that is
1718                  * explained. Reproduced with plain ftrace with:
1719                  *
1720                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1721                  * grep "NR -1 " /t/trace_pipe
1722                  *
1723                  * After generating some load on the machine.
1724                  */
1725                 if (verbose > 1) {
1726                         static u64 n;
1727                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1728                                 id, perf_evsel__name(evsel), ++n);
1729                 }
1730                 return NULL;
1731         }
1732
1733         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1734             trace__read_syscall_info(trace, id))
1735                 goto out_cant_read;
1736
1737         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1738                 goto out_cant_read;
1739
1740         return &trace->syscalls.table[id];
1741
1742 out_cant_read:
1743         if (verbose) {
1744                 fprintf(trace->output, "Problems reading syscall %d", id);
1745                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1746                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1747                 fputs(" information\n", trace->output);
1748         }
1749         return NULL;
1750 }
1751
1752 static void thread__update_stats(struct thread_trace *ttrace,
1753                                  int id, struct perf_sample *sample)
1754 {
1755         struct int_node *inode;
1756         struct stats *stats;
1757         u64 duration = 0;
1758
1759         inode = intlist__findnew(ttrace->syscall_stats, id);
1760         if (inode == NULL)
1761                 return;
1762
1763         stats = inode->priv;
1764         if (stats == NULL) {
1765                 stats = malloc(sizeof(struct stats));
1766                 if (stats == NULL)
1767                         return;
1768                 init_stats(stats);
1769                 inode->priv = stats;
1770         }
1771
1772         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1773                 duration = sample->time - ttrace->entry_time;
1774
1775         update_stats(stats, duration);
1776 }
1777
1778 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1779 {
1780         struct thread_trace *ttrace;
1781         u64 duration;
1782         size_t printed;
1783
1784         if (trace->current == NULL)
1785                 return 0;
1786
1787         ttrace = thread__priv(trace->current);
1788
1789         if (!ttrace->entry_pending)
1790                 return 0;
1791
1792         duration = sample->time - ttrace->entry_time;
1793
1794         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1795         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1796         ttrace->entry_pending = false;
1797
1798         return printed;
1799 }
1800
1801 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1802                             union perf_event *event __maybe_unused,
1803                             struct perf_sample *sample)
1804 {
1805         char *msg;
1806         void *args;
1807         size_t printed = 0;
1808         struct thread *thread;
1809         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1810         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1811         struct thread_trace *ttrace;
1812
1813         if (sc == NULL)
1814                 return -1;
1815
1816         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1817         ttrace = thread__trace(thread, trace->output);
1818         if (ttrace == NULL)
1819                 goto out_put;
1820
1821         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1822
1823         if (ttrace->entry_str == NULL) {
1824                 ttrace->entry_str = malloc(1024);
1825                 if (!ttrace->entry_str)
1826                         goto out_put;
1827         }
1828
1829         if (!trace->summary_only)
1830                 trace__printf_interrupted_entry(trace, sample);
1831
1832         ttrace->entry_time = sample->time;
1833         msg = ttrace->entry_str;
1834         printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
1835
1836         printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
1837                                            args, trace, thread);
1838
1839         if (sc->is_exit) {
1840                 if (!trace->duration_filter && !trace->summary_only) {
1841                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1842                         fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1843                 }
1844         } else
1845                 ttrace->entry_pending = true;
1846
1847         if (trace->current != thread) {
1848                 thread__put(trace->current);
1849                 trace->current = thread__get(thread);
1850         }
1851         err = 0;
1852 out_put:
1853         thread__put(thread);
1854         return err;
1855 }
1856
1857 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1858                            union perf_event *event __maybe_unused,
1859                            struct perf_sample *sample)
1860 {
1861         long ret;
1862         u64 duration = 0;
1863         struct thread *thread;
1864         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1865         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1866         struct thread_trace *ttrace;
1867
1868         if (sc == NULL)
1869                 return -1;
1870
1871         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1872         ttrace = thread__trace(thread, trace->output);
1873         if (ttrace == NULL)
1874                 goto out_put;
1875
1876         if (trace->summary)
1877                 thread__update_stats(ttrace, id, sample);
1878
1879         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1880
1881         if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1882                 trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1883                 trace->last_vfs_getname = NULL;
1884                 ++trace->stats.vfs_getname;
1885         }
1886
1887         ttrace->exit_time = sample->time;
1888
1889         if (ttrace->entry_time) {
1890                 duration = sample->time - ttrace->entry_time;
1891                 if (trace__filter_duration(trace, duration))
1892                         goto out;
1893         } else if (trace->duration_filter)
1894                 goto out;
1895
1896         if (trace->summary_only)
1897                 goto out;
1898
1899         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1900
1901         if (ttrace->entry_pending) {
1902                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1903         } else {
1904                 fprintf(trace->output, " ... [");
1905                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1906                 fprintf(trace->output, "]: %s()", sc->name);
1907         }
1908
1909         if (sc->fmt == NULL) {
1910 signed_print:
1911                 fprintf(trace->output, ") = %ld", ret);
1912         } else if (ret < 0 && sc->fmt->errmsg) {
1913                 char bf[STRERR_BUFSIZE];
1914                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1915                            *e = audit_errno_to_name(-ret);
1916
1917                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1918         } else if (ret == 0 && sc->fmt->timeout)
1919                 fprintf(trace->output, ") = 0 Timeout");
1920         else if (sc->fmt->hexret)
1921                 fprintf(trace->output, ") = %#lx", ret);
1922         else
1923                 goto signed_print;
1924
1925         fputc('\n', trace->output);
1926 out:
1927         ttrace->entry_pending = false;
1928         err = 0;
1929 out_put:
1930         thread__put(thread);
1931         return err;
1932 }
1933
1934 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1935                               union perf_event *event __maybe_unused,
1936                               struct perf_sample *sample)
1937 {
1938         trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1939         return 0;
1940 }
1941
1942 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1943                                      union perf_event *event __maybe_unused,
1944                                      struct perf_sample *sample)
1945 {
1946         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1947         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1948         struct thread *thread = machine__findnew_thread(trace->host,
1949                                                         sample->pid,
1950                                                         sample->tid);
1951         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1952
1953         if (ttrace == NULL)
1954                 goto out_dump;
1955
1956         ttrace->runtime_ms += runtime_ms;
1957         trace->runtime_ms += runtime_ms;
1958         thread__put(thread);
1959         return 0;
1960
1961 out_dump:
1962         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1963                evsel->name,
1964                perf_evsel__strval(evsel, sample, "comm"),
1965                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1966                runtime,
1967                perf_evsel__intval(evsel, sample, "vruntime"));
1968         thread__put(thread);
1969         return 0;
1970 }
1971
1972 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1973                                 union perf_event *event __maybe_unused,
1974                                 struct perf_sample *sample)
1975 {
1976         trace__printf_interrupted_entry(trace, sample);
1977         trace__fprintf_tstamp(trace, sample->time, trace->output);
1978
1979         if (trace->trace_syscalls)
1980                 fprintf(trace->output, "(         ): ");
1981
1982         fprintf(trace->output, "%s:", evsel->name);
1983
1984         if (evsel->tp_format) {
1985                 event_format__fprintf(evsel->tp_format, sample->cpu,
1986                                       sample->raw_data, sample->raw_size,
1987                                       trace->output);
1988         }
1989
1990         fprintf(trace->output, ")\n");
1991         return 0;
1992 }
1993
1994 static void print_location(FILE *f, struct perf_sample *sample,
1995                            struct addr_location *al,
1996                            bool print_dso, bool print_sym)
1997 {
1998
1999         if ((verbose || print_dso) && al->map)
2000                 fprintf(f, "%s@", al->map->dso->long_name);
2001
2002         if ((verbose || print_sym) && al->sym)
2003                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2004                         al->addr - al->sym->start);
2005         else if (al->map)
2006                 fprintf(f, "0x%" PRIx64, al->addr);
2007         else
2008                 fprintf(f, "0x%" PRIx64, sample->addr);
2009 }
2010
2011 static int trace__pgfault(struct trace *trace,
2012                           struct perf_evsel *evsel,
2013                           union perf_event *event,
2014                           struct perf_sample *sample)
2015 {
2016         struct thread *thread;
2017         u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
2018         struct addr_location al;
2019         char map_type = 'd';
2020         struct thread_trace *ttrace;
2021         int err = -1;
2022
2023         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2024         ttrace = thread__trace(thread, trace->output);
2025         if (ttrace == NULL)
2026                 goto out_put;
2027
2028         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2029                 ttrace->pfmaj++;
2030         else
2031                 ttrace->pfmin++;
2032
2033         if (trace->summary_only)
2034                 goto out;
2035
2036         thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
2037                               sample->ip, &al);
2038
2039         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2040
2041         fprintf(trace->output, "%sfault [",
2042                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2043                 "maj" : "min");
2044
2045         print_location(trace->output, sample, &al, false, true);
2046
2047         fprintf(trace->output, "] => ");
2048
2049         thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
2050                                    sample->addr, &al);
2051
2052         if (!al.map) {
2053                 thread__find_addr_location(thread, cpumode,
2054                                            MAP__FUNCTION, sample->addr, &al);
2055
2056                 if (al.map)
2057                         map_type = 'x';
2058                 else
2059                         map_type = '?';
2060         }
2061
2062         print_location(trace->output, sample, &al, true, false);
2063
2064         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2065 out:
2066         err = 0;
2067 out_put:
2068         thread__put(thread);
2069         return err;
2070 }
2071
2072 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2073 {
2074         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2075             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2076                 return false;
2077
2078         if (trace->pid_list || trace->tid_list)
2079                 return true;
2080
2081         return false;
2082 }
2083
2084 static int trace__process_sample(struct perf_tool *tool,
2085                                  union perf_event *event,
2086                                  struct perf_sample *sample,
2087                                  struct perf_evsel *evsel,
2088                                  struct machine *machine __maybe_unused)
2089 {
2090         struct trace *trace = container_of(tool, struct trace, tool);
2091         int err = 0;
2092
2093         tracepoint_handler handler = evsel->handler;
2094
2095         if (skip_sample(trace, sample))
2096                 return 0;
2097
2098         if (!trace->full_time && trace->base_time == 0)
2099                 trace->base_time = sample->time;
2100
2101         if (handler) {
2102                 ++trace->nr_events;
2103                 handler(trace, evsel, event, sample);
2104         }
2105
2106         return err;
2107 }
2108
2109 static int parse_target_str(struct trace *trace)
2110 {
2111         if (trace->opts.target.pid) {
2112                 trace->pid_list = intlist__new(trace->opts.target.pid);
2113                 if (trace->pid_list == NULL) {
2114                         pr_err("Error parsing process id string\n");
2115                         return -EINVAL;
2116                 }
2117         }
2118
2119         if (trace->opts.target.tid) {
2120                 trace->tid_list = intlist__new(trace->opts.target.tid);
2121                 if (trace->tid_list == NULL) {
2122                         pr_err("Error parsing thread id string\n");
2123                         return -EINVAL;
2124                 }
2125         }
2126
2127         return 0;
2128 }
2129
2130 static int trace__record(struct trace *trace, int argc, const char **argv)
2131 {
2132         unsigned int rec_argc, i, j;
2133         const char **rec_argv;
2134         const char * const record_args[] = {
2135                 "record",
2136                 "-R",
2137                 "-m", "1024",
2138                 "-c", "1",
2139         };
2140
2141         const char * const sc_args[] = { "-e", };
2142         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2143         const char * const majpf_args[] = { "-e", "major-faults" };
2144         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2145         const char * const minpf_args[] = { "-e", "minor-faults" };
2146         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2147
2148         /* +1 is for the event string below */
2149         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2150                 majpf_args_nr + minpf_args_nr + argc;
2151         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2152
2153         if (rec_argv == NULL)
2154                 return -ENOMEM;
2155
2156         j = 0;
2157         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2158                 rec_argv[j++] = record_args[i];
2159
2160         if (trace->trace_syscalls) {
2161                 for (i = 0; i < sc_args_nr; i++)
2162                         rec_argv[j++] = sc_args[i];
2163
2164                 /* event string may be different for older kernels - e.g., RHEL6 */
2165                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2166                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2167                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2168                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2169                 else {
2170                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2171                         return -1;
2172                 }
2173         }
2174
2175         if (trace->trace_pgfaults & TRACE_PFMAJ)
2176                 for (i = 0; i < majpf_args_nr; i++)
2177                         rec_argv[j++] = majpf_args[i];
2178
2179         if (trace->trace_pgfaults & TRACE_PFMIN)
2180                 for (i = 0; i < minpf_args_nr; i++)
2181                         rec_argv[j++] = minpf_args[i];
2182
2183         for (i = 0; i < (unsigned int)argc; i++)
2184                 rec_argv[j++] = argv[i];
2185
2186         return cmd_record(j, rec_argv, NULL);
2187 }
2188
2189 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2190
2191 static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2192 {
2193         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2194         if (evsel == NULL)
2195                 return;
2196
2197         if (perf_evsel__field(evsel, "pathname") == NULL) {
2198                 perf_evsel__delete(evsel);
2199                 return;
2200         }
2201
2202         evsel->handler = trace__vfs_getname;
2203         perf_evlist__add(evlist, evsel);
2204 }
2205
2206 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2207                                     u64 config)
2208 {
2209         struct perf_evsel *evsel;
2210         struct perf_event_attr attr = {
2211                 .type = PERF_TYPE_SOFTWARE,
2212                 .mmap_data = 1,
2213         };
2214
2215         attr.config = config;
2216         attr.sample_period = 1;
2217
2218         event_attr_init(&attr);
2219
2220         evsel = perf_evsel__new(&attr);
2221         if (!evsel)
2222                 return -ENOMEM;
2223
2224         evsel->handler = trace__pgfault;
2225         perf_evlist__add(evlist, evsel);
2226
2227         return 0;
2228 }
2229
2230 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2231 {
2232         const u32 type = event->header.type;
2233         struct perf_evsel *evsel;
2234
2235         if (!trace->full_time && trace->base_time == 0)
2236                 trace->base_time = sample->time;
2237
2238         if (type != PERF_RECORD_SAMPLE) {
2239                 trace__process_event(trace, trace->host, event, sample);
2240                 return;
2241         }
2242
2243         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2244         if (evsel == NULL) {
2245                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2246                 return;
2247         }
2248
2249         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2250             sample->raw_data == NULL) {
2251                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2252                        perf_evsel__name(evsel), sample->tid,
2253                        sample->cpu, sample->raw_size);
2254         } else {
2255                 tracepoint_handler handler = evsel->handler;
2256                 handler(trace, evsel, event, sample);
2257         }
2258 }
2259
2260 static int trace__add_syscall_newtp(struct trace *trace)
2261 {
2262         int ret = -1;
2263         struct perf_evlist *evlist = trace->evlist;
2264         struct perf_evsel *sys_enter, *sys_exit;
2265
2266         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2267         if (sys_enter == NULL)
2268                 goto out;
2269
2270         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2271                 goto out_delete_sys_enter;
2272
2273         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2274         if (sys_exit == NULL)
2275                 goto out_delete_sys_enter;
2276
2277         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2278                 goto out_delete_sys_exit;
2279
2280         perf_evlist__add(evlist, sys_enter);
2281         perf_evlist__add(evlist, sys_exit);
2282
2283         trace->syscalls.events.sys_enter = sys_enter;
2284         trace->syscalls.events.sys_exit  = sys_exit;
2285
2286         ret = 0;
2287 out:
2288         return ret;
2289
2290 out_delete_sys_exit:
2291         perf_evsel__delete_priv(sys_exit);
2292 out_delete_sys_enter:
2293         perf_evsel__delete_priv(sys_enter);
2294         goto out;
2295 }
2296
2297 static int trace__set_ev_qualifier_filter(struct trace *trace)
2298 {
2299         int err = -1;
2300         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2301                                                 trace->ev_qualifier_ids.nr,
2302                                                 trace->ev_qualifier_ids.entries);
2303
2304         if (filter == NULL)
2305                 goto out_enomem;
2306
2307         if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2308                 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2309
2310         free(filter);
2311 out:
2312         return err;
2313 out_enomem:
2314         errno = ENOMEM;
2315         goto out;
2316 }
2317
2318 static int trace__run(struct trace *trace, int argc, const char **argv)
2319 {
2320         struct perf_evlist *evlist = trace->evlist;
2321         struct perf_evsel *evsel;
2322         int err = -1, i;
2323         unsigned long before;
2324         const bool forks = argc > 0;
2325         bool draining = false;
2326
2327         trace->live = true;
2328
2329         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2330                 goto out_error_raw_syscalls;
2331
2332         if (trace->trace_syscalls)
2333                 perf_evlist__add_vfs_getname(evlist);
2334
2335         if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2336             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2337                 goto out_error_mem;
2338         }
2339
2340         if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2341             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2342                 goto out_error_mem;
2343
2344         if (trace->sched &&
2345             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2346                                    trace__sched_stat_runtime))
2347                 goto out_error_sched_stat_runtime;
2348
2349         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2350         if (err < 0) {
2351                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2352                 goto out_delete_evlist;
2353         }
2354
2355         err = trace__symbols_init(trace, evlist);
2356         if (err < 0) {
2357                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2358                 goto out_delete_evlist;
2359         }
2360
2361         perf_evlist__config(evlist, &trace->opts);
2362
2363         signal(SIGCHLD, sig_handler);
2364         signal(SIGINT, sig_handler);
2365
2366         if (forks) {
2367                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2368                                                     argv, false, NULL);
2369                 if (err < 0) {
2370                         fprintf(trace->output, "Couldn't run the workload!\n");
2371                         goto out_delete_evlist;
2372                 }
2373         }
2374
2375         err = perf_evlist__open(evlist);
2376         if (err < 0)
2377                 goto out_error_open;
2378
2379         /*
2380          * Better not use !target__has_task() here because we need to cover the
2381          * case where no threads were specified in the command line, but a
2382          * workload was, and in that case we will fill in the thread_map when
2383          * we fork the workload in perf_evlist__prepare_workload.
2384          */
2385         if (trace->filter_pids.nr > 0)
2386                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2387         else if (thread_map__pid(evlist->threads, 0) == -1)
2388                 err = perf_evlist__set_filter_pid(evlist, getpid());
2389
2390         if (err < 0)
2391                 goto out_error_mem;
2392
2393         if (trace->ev_qualifier_ids.nr > 0) {
2394                 err = trace__set_ev_qualifier_filter(trace);
2395                 if (err < 0)
2396                         goto out_errno;
2397         }
2398
2399         pr_debug("%s\n", trace->syscalls.events.sys_exit->filter);
2400
2401         err = perf_evlist__apply_filters(evlist, &evsel);
2402         if (err < 0)
2403                 goto out_error_apply_filters;
2404
2405         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2406         if (err < 0)
2407                 goto out_error_mmap;
2408
2409         if (!target__none(&trace->opts.target))
2410                 perf_evlist__enable(evlist);
2411
2412         if (forks)
2413                 perf_evlist__start_workload(evlist);
2414
2415         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2416                                   evlist->threads->nr > 1 ||
2417                                   perf_evlist__first(evlist)->attr.inherit;
2418 again:
2419         before = trace->nr_events;
2420
2421         for (i = 0; i < evlist->nr_mmaps; i++) {
2422                 union perf_event *event;
2423
2424                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2425                         struct perf_sample sample;
2426
2427                         ++trace->nr_events;
2428
2429                         err = perf_evlist__parse_sample(evlist, event, &sample);
2430                         if (err) {
2431                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2432                                 goto next_event;
2433                         }
2434
2435                         trace__handle_event(trace, event, &sample);
2436 next_event:
2437                         perf_evlist__mmap_consume(evlist, i);
2438
2439                         if (interrupted)
2440                                 goto out_disable;
2441
2442                         if (done && !draining) {
2443                                 perf_evlist__disable(evlist);
2444                                 draining = true;
2445                         }
2446                 }
2447         }
2448
2449         if (trace->nr_events == before) {
2450                 int timeout = done ? 100 : -1;
2451
2452                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2453                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2454                                 draining = true;
2455
2456                         goto again;
2457                 }
2458         } else {
2459                 goto again;
2460         }
2461
2462 out_disable:
2463         thread__zput(trace->current);
2464
2465         perf_evlist__disable(evlist);
2466
2467         if (!err) {
2468                 if (trace->summary)
2469                         trace__fprintf_thread_summary(trace, trace->output);
2470
2471                 if (trace->show_tool_stats) {
2472                         fprintf(trace->output, "Stats:\n "
2473                                                " vfs_getname : %" PRIu64 "\n"
2474                                                " proc_getname: %" PRIu64 "\n",
2475                                 trace->stats.vfs_getname,
2476                                 trace->stats.proc_getname);
2477                 }
2478         }
2479
2480 out_delete_evlist:
2481         perf_evlist__delete(evlist);
2482         trace->evlist = NULL;
2483         trace->live = false;
2484         return err;
2485 {
2486         char errbuf[BUFSIZ];
2487
2488 out_error_sched_stat_runtime:
2489         debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2490         goto out_error;
2491
2492 out_error_raw_syscalls:
2493         debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2494         goto out_error;
2495
2496 out_error_mmap:
2497         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2498         goto out_error;
2499
2500 out_error_open:
2501         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2502
2503 out_error:
2504         fprintf(trace->output, "%s\n", errbuf);
2505         goto out_delete_evlist;
2506
2507 out_error_apply_filters:
2508         fprintf(trace->output,
2509                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2510                 evsel->filter, perf_evsel__name(evsel), errno,
2511                 strerror_r(errno, errbuf, sizeof(errbuf)));
2512         goto out_delete_evlist;
2513 }
2514 out_error_mem:
2515         fprintf(trace->output, "Not enough memory to run!\n");
2516         goto out_delete_evlist;
2517
2518 out_errno:
2519         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2520         goto out_delete_evlist;
2521 }
2522
2523 static int trace__replay(struct trace *trace)
2524 {
2525         const struct perf_evsel_str_handler handlers[] = {
2526                 { "probe:vfs_getname",       trace__vfs_getname, },
2527         };
2528         struct perf_data_file file = {
2529                 .path  = input_name,
2530                 .mode  = PERF_DATA_MODE_READ,
2531                 .force = trace->force,
2532         };
2533         struct perf_session *session;
2534         struct perf_evsel *evsel;
2535         int err = -1;
2536
2537         trace->tool.sample        = trace__process_sample;
2538         trace->tool.mmap          = perf_event__process_mmap;
2539         trace->tool.mmap2         = perf_event__process_mmap2;
2540         trace->tool.comm          = perf_event__process_comm;
2541         trace->tool.exit          = perf_event__process_exit;
2542         trace->tool.fork          = perf_event__process_fork;
2543         trace->tool.attr          = perf_event__process_attr;
2544         trace->tool.tracing_data = perf_event__process_tracing_data;
2545         trace->tool.build_id      = perf_event__process_build_id;
2546
2547         trace->tool.ordered_events = true;
2548         trace->tool.ordering_requires_timestamps = true;
2549
2550         /* add tid to output */
2551         trace->multiple_threads = true;
2552
2553         session = perf_session__new(&file, false, &trace->tool);
2554         if (session == NULL)
2555                 return -1;
2556
2557         if (symbol__init(&session->header.env) < 0)
2558                 goto out;
2559
2560         trace->host = &session->machines.host;
2561
2562         err = perf_session__set_tracepoints_handlers(session, handlers);
2563         if (err)
2564                 goto out;
2565
2566         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2567                                                      "raw_syscalls:sys_enter");
2568         /* older kernels have syscalls tp versus raw_syscalls */
2569         if (evsel == NULL)
2570                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2571                                                              "syscalls:sys_enter");
2572
2573         if (evsel &&
2574             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2575             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2576                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2577                 goto out;
2578         }
2579
2580         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2581                                                      "raw_syscalls:sys_exit");
2582         if (evsel == NULL)
2583                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2584                                                              "syscalls:sys_exit");
2585         if (evsel &&
2586             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2587             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2588                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2589                 goto out;
2590         }
2591
2592         evlist__for_each(session->evlist, evsel) {
2593                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2594                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2595                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2596                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2597                         evsel->handler = trace__pgfault;
2598         }
2599
2600         err = parse_target_str(trace);
2601         if (err != 0)
2602                 goto out;
2603
2604         setup_pager();
2605
2606         err = perf_session__process_events(session);
2607         if (err)
2608                 pr_err("Failed to process events, error %d", err);
2609
2610         else if (trace->summary)
2611                 trace__fprintf_thread_summary(trace, trace->output);
2612
2613 out:
2614         perf_session__delete(session);
2615
2616         return err;
2617 }
2618
/*
 * Write the banner that introduces the per-thread summary to @fp.
 *
 * Returns the value reported by fprintf(), converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
        return fprintf(fp, "\n Summary of events:\n\n");
}
2627
2628 static size_t thread__dump_stats(struct thread_trace *ttrace,
2629                                  struct trace *trace, FILE *fp)
2630 {
2631         struct stats *stats;
2632         size_t printed = 0;
2633         struct syscall *sc;
2634         struct int_node *inode = intlist__first(ttrace->syscall_stats);
2635
2636         if (inode == NULL)
2637                 return 0;
2638
2639         printed += fprintf(fp, "\n");
2640
2641         printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2642         printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2643         printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2644
2645         /* each int_node is a syscall */
2646         while (inode) {
2647                 stats = inode->priv;
2648                 if (stats) {
2649                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2650                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2651                         double avg = avg_stats(stats);
2652                         double pct;
2653                         u64 n = (u64) stats->n;
2654
2655                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2656                         avg /= NSEC_PER_MSEC;
2657
2658                         sc = &trace->syscalls.table[inode->i];
2659                         printed += fprintf(fp, "   %-15s", sc->name);
2660                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2661                                            n, min, avg);
2662                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2663                 }
2664
2665                 inode = intlist__next(inode);
2666         }
2667
2668         printed += fprintf(fp, "\n\n");
2669
2670         return printed;
2671 }
2672
/*
 * Context handed to the per-thread summary callback through its
 * opaque 'priv' argument.
 */
struct summary_data {
        FILE *fp;               /* stream the summary is written to */
        struct trace *trace;    /* trace state the summary is built from */
        size_t printed;         /* running sum of fprintf() return values */
};
2679
2680 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2681 {
2682         struct summary_data *data = priv;
2683         FILE *fp = data->fp;
2684         size_t printed = data->printed;
2685         struct trace *trace = data->trace;
2686         struct thread_trace *ttrace = thread__priv(thread);
2687         double ratio;
2688
2689         if (ttrace == NULL)
2690                 return 0;
2691
2692         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2693
2694         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2695         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2696         printed += fprintf(fp, "%.1f%%", ratio);
2697         if (ttrace->pfmaj)
2698                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2699         if (ttrace->pfmin)
2700                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2701         printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2702         printed += thread__dump_stats(ttrace, trace, fp);
2703
2704         data->printed += printed;
2705
2706         return 0;
2707 }
2708
2709 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2710 {
2711         struct summary_data data = {
2712                 .fp = fp,
2713                 .trace = trace
2714         };
2715         data.printed = trace__fprintf_threads_header(fp);
2716
2717         machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2718
2719         return data.printed;
2720 }
2721
2722 static int trace__set_duration(const struct option *opt, const char *str,
2723                                int unset __maybe_unused)
2724 {
2725         struct trace *trace = opt->value;
2726
2727         trace->duration_filter = atof(str);
2728         return 0;
2729 }
2730
2731 static int trace__set_filter_pids(const struct option *opt, const char *str,
2732                                   int unset __maybe_unused)
2733 {
2734         int ret = -1;
2735         size_t i;
2736         struct trace *trace = opt->value;
2737         /*
2738          * FIXME: introduce a intarray class, plain parse csv and create a
2739          * { int nr, int entries[] } struct...
2740          */
2741         struct intlist *list = intlist__new(str);
2742
2743         if (list == NULL)
2744                 return -1;
2745
2746         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2747         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2748
2749         if (trace->filter_pids.entries == NULL)
2750                 goto out;
2751
2752         trace->filter_pids.entries[0] = getpid();
2753
2754         for (i = 1; i < trace->filter_pids.nr; ++i)
2755                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2756
2757         intlist__delete(list);
2758         ret = 0;
2759 out:
2760         return ret;
2761 }
2762
2763 static int trace__open_output(struct trace *trace, const char *filename)
2764 {
2765         struct stat st;
2766
2767         if (!stat(filename, &st) && st.st_size) {
2768                 char oldname[PATH_MAX];
2769
2770                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2771                 unlink(oldname);
2772                 rename(filename, oldname);
2773         }
2774
2775         trace->output = fopen(filename, "w");
2776
2777         return trace->output == NULL ? -errno : 0;
2778 }
2779
2780 static int parse_pagefaults(const struct option *opt, const char *str,
2781                             int unset __maybe_unused)
2782 {
2783         int *trace_pgfaults = opt->value;
2784
2785         if (strcmp(str, "all") == 0)
2786                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2787         else if (strcmp(str, "maj") == 0)
2788                 *trace_pgfaults |= TRACE_PFMAJ;
2789         else if (strcmp(str, "min") == 0)
2790                 *trace_pgfaults |= TRACE_PFMIN;
2791         else
2792                 return -1;
2793
2794         return 0;
2795 }
2796
2797 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2798 {
2799         struct perf_evsel *evsel;
2800
2801         evlist__for_each(evlist, evsel)
2802                 evsel->handler = handler;
2803 }
2804
2805 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2806 {
2807         const char *trace_usage[] = {
2808                 "perf trace [<options>] [<command>]",
2809                 "perf trace [<options>] -- <command> [<options>]",
2810                 "perf trace record [<options>] [<command>]",
2811                 "perf trace record [<options>] -- <command> [<options>]",
2812                 NULL
2813         };
2814         struct trace trace = {
2815                 .audit = {
2816                         .machine = audit_detect_machine(),
2817                         .open_id = audit_name_to_syscall("open", trace.audit.machine),
2818                 },
2819                 .syscalls = {
2820                         . max = -1,
2821                 },
2822                 .opts = {
2823                         .target = {
2824                                 .uid       = UINT_MAX,
2825                                 .uses_mmap = true,
2826                         },
2827                         .user_freq     = UINT_MAX,
2828                         .user_interval = ULLONG_MAX,
2829                         .no_buffering  = true,
2830                         .mmap_pages    = UINT_MAX,
2831                         .proc_map_timeout  = 500,
2832                 },
2833                 .output = stdout,
2834                 .show_comm = true,
2835                 .trace_syscalls = true,
2836         };
2837         const char *output_name = NULL;
2838         const char *ev_qualifier_str = NULL;
2839         const struct option trace_options[] = {
2840         OPT_CALLBACK(0, "event", &trace.evlist, "event",
2841                      "event selector. use 'perf list' to list available events",
2842                      parse_events_option),
2843         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2844                     "show the thread COMM next to its id"),
2845         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2846         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2847         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2848         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2849         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2850                     "trace events on existing process id"),
2851         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2852                     "trace events on existing thread id"),
2853         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2854                      "pids to filter (by the kernel)", trace__set_filter_pids),
2855         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2856                     "system-wide collection from all CPUs"),
2857         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2858                     "list of cpus to monitor"),
2859         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2860                     "child tasks do not inherit counters"),
2861         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2862                      "number of mmap data pages",
2863                      perf_evlist__parse_mmap_pages),
2864         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2865                    "user to profile"),
2866         OPT_CALLBACK(0, "duration", &trace, "float",
2867                      "show only events with duration > N.M ms",
2868                      trace__set_duration),
2869         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2870         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2871         OPT_BOOLEAN('T', "time", &trace.full_time,
2872                     "Show full timestamp, not time relative to first start"),
2873         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2874                     "Show only syscall summary with statistics"),
2875         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2876                     "Show all syscalls and summary with statistics"),
2877         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2878                      "Trace pagefaults", parse_pagefaults, "maj"),
2879         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2880         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2881         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2882                         "per thread proc mmap processing timeout in ms"),
2883         OPT_END()
2884         };
2885         const char * const trace_subcommands[] = { "record", NULL };
2886         int err;
2887         char bf[BUFSIZ];
2888
2889         signal(SIGSEGV, sighandler_dump_stack);
2890         signal(SIGFPE, sighandler_dump_stack);
2891
2892         trace.evlist = perf_evlist__new();
2893
2894         if (trace.evlist == NULL) {
2895                 pr_err("Not enough memory to run!\n");
2896                 err = -ENOMEM;
2897                 goto out;
2898         }
2899
2900         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2901                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2902
2903         if (trace.trace_pgfaults) {
2904                 trace.opts.sample_address = true;
2905                 trace.opts.sample_time = true;
2906         }
2907
2908         if (trace.evlist->nr_entries > 0)
2909                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2910
2911         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2912                 return trace__record(&trace, argc-1, &argv[1]);
2913
2914         /* summary_only implies summary option, but don't overwrite summary if set */
2915         if (trace.summary_only)
2916                 trace.summary = trace.summary_only;
2917
2918         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2919             trace.evlist->nr_entries == 0 /* Was --events used? */) {
2920                 pr_err("Please specify something to trace.\n");
2921                 return -1;
2922         }
2923
2924         if (output_name != NULL) {
2925                 err = trace__open_output(&trace, output_name);
2926                 if (err < 0) {
2927                         perror("failed to create output file");
2928                         goto out;
2929                 }
2930         }
2931
2932         if (ev_qualifier_str != NULL) {
2933                 const char *s = ev_qualifier_str;
2934                 struct strlist_config slist_config = {
2935                         .dirname = system_path(STRACE_GROUPS_DIR),
2936                 };
2937
2938                 trace.not_ev_qualifier = *s == '!';
2939                 if (trace.not_ev_qualifier)
2940                         ++s;
2941                 trace.ev_qualifier = strlist__new(s, &slist_config);
2942                 if (trace.ev_qualifier == NULL) {
2943                         fputs("Not enough memory to parse event qualifier",
2944                               trace.output);
2945                         err = -ENOMEM;
2946                         goto out_close;
2947                 }
2948
2949                 err = trace__validate_ev_qualifier(&trace);
2950                 if (err)
2951                         goto out_close;
2952         }
2953
2954         err = target__validate(&trace.opts.target);
2955         if (err) {
2956                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2957                 fprintf(trace.output, "%s", bf);
2958                 goto out_close;
2959         }
2960
2961         err = target__parse_uid(&trace.opts.target);
2962         if (err) {
2963                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2964                 fprintf(trace.output, "%s", bf);
2965                 goto out_close;
2966         }
2967
2968         if (!argc && target__none(&trace.opts.target))
2969                 trace.opts.target.system_wide = true;
2970
2971         if (input_name)
2972                 err = trace__replay(&trace);
2973         else
2974                 err = trace__run(&trace, argc, argv);
2975
2976 out_close:
2977         if (output_name != NULL)
2978                 fclose(trace.output);
2979 out:
2980         return err;
2981 }