perf trace: Use a constant for the syscall formatting buffer
[firefly-linux-kernel-4.4.55.git] / tools / perf / builtin-trace.c
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/exec_cmd.h"
7 #include "util/machine.h"
8 #include "util/session.h"
9 #include "util/thread.h"
10 #include "util/parse-options.h"
11 #include "util/strlist.h"
12 #include "util/intlist.h"
13 #include "util/thread_map.h"
14 #include "util/stat.h"
15 #include "trace-event.h"
16 #include "util/parse-events.h"
17
18 #include <libaudit.h>
19 #include <stdlib.h>
20 #include <sys/mman.h>
21 #include <linux/futex.h>
22
/*
 * For older distros: fallback values for constants that the system headers
 * may not provide.  Each one is only defined when it is still missing.
 */

/* mmap() flags */
#if !defined(MAP_STACK)
# define MAP_STACK              0x20000
#endif

/* madvise() behaviors */
#if !defined(MADV_HWPOISON)
# define MADV_HWPOISON          100
#endif

#if !defined(MADV_MERGEABLE)
# define MADV_MERGEABLE         12
#endif

#if !defined(MADV_UNMERGEABLE)
# define MADV_UNMERGEABLE       13
#endif

/* eventfd flags */
#if !defined(EFD_SEMAPHORE)
# define EFD_SEMAPHORE          1
#endif

#if !defined(EFD_NONBLOCK)
# define EFD_NONBLOCK           00004000
#endif

#if !defined(EFD_CLOEXEC)
# define EFD_CLOEXEC            02000000
#endif

/* open()/pipe flags */
#if !defined(O_CLOEXEC)
# define O_CLOEXEC              02000000
#endif

/* socket types and flags */
#if !defined(SOCK_DCCP)
# define SOCK_DCCP              6
#endif

#if !defined(SOCK_CLOEXEC)
# define SOCK_CLOEXEC           02000000
#endif

#if !defined(SOCK_NONBLOCK)
# define SOCK_NONBLOCK          00004000
#endif

/* send/recv message flags */
#if !defined(MSG_CMSG_CLOEXEC)
# define MSG_CMSG_CLOEXEC       0x40000000
#endif

/* perf_event_open() flags */
#if !defined(PERF_FLAG_FD_NO_GROUP)
# define PERF_FLAG_FD_NO_GROUP          (1UL << 0)
#endif

#if !defined(PERF_FLAG_FD_OUTPUT)
# define PERF_FLAG_FD_OUTPUT            (1UL << 1)
#endif

#if !defined(PERF_FLAG_PID_CGROUP)
# define PERF_FLAG_PID_CGROUP           (1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#if !defined(PERF_FLAG_FD_CLOEXEC)
# define PERF_FLAG_FD_CLOEXEC           (1UL << 3) /* O_CLOEXEC */
#endif
87
88
89 struct tp_field {
90         int offset;
91         union {
92                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
93                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
94         };
95 };
96
97 #define TP_UINT_FIELD(bits) \
98 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
99 { \
100         u##bits value; \
101         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
102         return value;  \
103 }
104
105 TP_UINT_FIELD(8);
106 TP_UINT_FIELD(16);
107 TP_UINT_FIELD(32);
108 TP_UINT_FIELD(64);
109
110 #define TP_UINT_FIELD__SWAPPED(bits) \
111 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
112 { \
113         u##bits value; \
114         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
115         return bswap_##bits(value);\
116 }
117
118 TP_UINT_FIELD__SWAPPED(16);
119 TP_UINT_FIELD__SWAPPED(32);
120 TP_UINT_FIELD__SWAPPED(64);
121
122 static int tp_field__init_uint(struct tp_field *field,
123                                struct format_field *format_field,
124                                bool needs_swap)
125 {
126         field->offset = format_field->offset;
127
128         switch (format_field->size) {
129         case 1:
130                 field->integer = tp_field__u8;
131                 break;
132         case 2:
133                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
134                 break;
135         case 4:
136                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
137                 break;
138         case 8:
139                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
140                 break;
141         default:
142                 return -1;
143         }
144
145         return 0;
146 }
147
148 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
149 {
150         return sample->raw_data + field->offset;
151 }
152
153 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
154 {
155         field->offset = format_field->offset;
156         field->pointer = tp_field__ptr;
157         return 0;
158 }
159
160 struct syscall_tp {
161         struct tp_field id;
162         union {
163                 struct tp_field args, ret;
164         };
165 };
166
167 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
168                                           struct tp_field *field,
169                                           const char *name)
170 {
171         struct format_field *format_field = perf_evsel__field(evsel, name);
172
173         if (format_field == NULL)
174                 return -1;
175
176         return tp_field__init_uint(field, format_field, evsel->needs_swap);
177 }
178
/*
 * Initialize the integer accessor for member @name of the struct syscall_tp
 * stored in evsel->priv; the tracepoint field looked up has the same name as
 * the member.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ \
                struct syscall_tp *sc_tp = evsel->priv; \
                perf_evsel__init_tp_uint_field(evsel, &sc_tp->name, #name); \
        })
182
/*
 * Look up the tracepoint field called @name in @evsel and initialize @field
 * as a pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise the result of
 * tp_field__init_ptr().
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct format_field *ff = perf_evsel__field(evsel, name);

        return ff != NULL ? tp_field__init_ptr(field, ff) : -1;
}
194
/*
 * Initialize the pointer accessor for member @name of the struct syscall_tp
 * stored in evsel->priv; the tracepoint field looked up has the same name as
 * the member.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ \
                struct syscall_tp *sc_tp = evsel->priv; \
                perf_evsel__init_tp_ptr_field(evsel, &sc_tp->name, #name); \
        })
198
199 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
200 {
201         zfree(&evsel->priv);
202         perf_evsel__delete(evsel);
203 }
204
205 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
206 {
207         evsel->priv = malloc(sizeof(struct syscall_tp));
208         if (evsel->priv != NULL) {
209                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
210                         goto out_delete;
211
212                 evsel->handler = handler;
213                 return 0;
214         }
215
216         return -ENOMEM;
217
218 out_delete:
219         zfree(&evsel->priv);
220         return -ENOENT;
221 }
222
/*
 * Create the evsel for the syscall tracepoint named @direction and set it up
 * with perf_evsel__init_syscall_tp().
 *
 * Returns the new evsel, or NULL when the tracepoint cannot be created or
 * its setup fails (the evsel is deleted in the latter case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

        /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
        if (evsel == NULL)
                evsel = perf_evsel__newtp("syscalls", direction);

        if (evsel == NULL)
                return NULL;

        if (perf_evsel__init_syscall_tp(evsel, handler) != 0) {
                perf_evsel__delete_priv(evsel);
                return NULL;
        }

        return evsel;
}
242
/*
 * Fetch member @name of the struct syscall_tp in evsel->priv from @sample
 * through its integer accessor.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ \
                struct syscall_tp *sc_tp = evsel->priv; \
                sc_tp->name.integer(&sc_tp->name, sample); \
        })
246
/*
 * Fetch member @name of the struct syscall_tp in evsel->priv from @sample
 * through its pointer accessor.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ \
                struct syscall_tp *sc_tp = evsel->priv; \
                sc_tp->name.pointer(&sc_tp->name, sample); \
        })
250
251 struct syscall_arg {
252         unsigned long val;
253         struct thread *thread;
254         struct trace  *trace;
255         void          *parm;
256         u8            idx;
257         u8            mask;
258 };
259
/* Table mapping integer values to names; entries[i] names the value offset + i. */
struct strarray {
        /* value corresponding to entries[0] */
        int         offset;
        /* number of slots in entries[] */
        int         nr_entries;
        const char **entries;
};
265
/* Define strarray__<array> describing the string table <array>, starting at value 0. */
#define DEFINE_STRARRAY(array) \
        struct strarray strarray__##array = { \
                .offset     = 0, \
                .entries    = array, \
                .nr_entries = ARRAY_SIZE(array), \
        }
270
/* Define strarray__<array> describing the string table <array>, whose first entry names the value <off>. */
#define DEFINE_STRARRAY_OFFSET(array, off) \
        struct strarray strarray__##array = { \
                .entries    = array, \
                .nr_entries = ARRAY_SIZE(array), \
                .offset     = off, \
        }
276
277 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
278                                                 const char *intfmt,
279                                                 struct syscall_arg *arg)
280 {
281         struct strarray *sa = arg->parm;
282         int idx = arg->val - sa->offset;
283
284         if (idx < 0 || idx >= sa->nr_entries)
285                 return scnprintf(bf, size, intfmt, arg->val);
286
287         return scnprintf(bf, size, "%s", sa->entries[idx]);
288 }
289
/* strarray formatter that prints values without a name in decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        const char *fallback_fmt = "%d";

        return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
297
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 *
 * strarray formatter that prints values without a name in hexadecimal.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
                                                 struct syscall_arg *arg)
{
        const char *fallback_fmt = "%#x";

        return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
311
312 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
313                                         struct syscall_arg *arg);
314
315 #define SCA_FD syscall_arg__scnprintf_fd
316
317 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
318                                            struct syscall_arg *arg)
319 {
320         int fd = arg->val;
321
322         if (fd == AT_FDCWD)
323                 return scnprintf(bf, size, "CWD");
324
325         return syscall_arg__scnprintf_fd(bf, size, arg);
326 }
327
328 #define SCA_FDAT syscall_arg__scnprintf_fd_at
329
330 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
331                                               struct syscall_arg *arg);
332
333 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
334
335 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
336                                          struct syscall_arg *arg)
337 {
338         return scnprintf(bf, size, "%#lx", arg->val);
339 }
340
341 #define SCA_HEX syscall_arg__scnprintf_hex
342
343 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
344                                          struct syscall_arg *arg)
345 {
346         return scnprintf(bf, size, "%d", arg->val);
347 }
348
349 #define SCA_INT syscall_arg__scnprintf_int
350
351 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
352                                                struct syscall_arg *arg)
353 {
354         int printed = 0, prot = arg->val;
355
356         if (prot == PROT_NONE)
357                 return scnprintf(bf, size, "NONE");
358 #define P_MMAP_PROT(n) \
359         if (prot & PROT_##n) { \
360                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
361                 prot &= ~PROT_##n; \
362         }
363
364         P_MMAP_PROT(EXEC);
365         P_MMAP_PROT(READ);
366         P_MMAP_PROT(WRITE);
367 #ifdef PROT_SEM
368         P_MMAP_PROT(SEM);
369 #endif
370         P_MMAP_PROT(GROWSDOWN);
371         P_MMAP_PROT(GROWSUP);
372 #undef P_MMAP_PROT
373
374         if (prot)
375                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
376
377         return printed;
378 }
379
380 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
381
382 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
383                                                 struct syscall_arg *arg)
384 {
385         int printed = 0, flags = arg->val;
386
387 #define P_MMAP_FLAG(n) \
388         if (flags & MAP_##n) { \
389                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
390                 flags &= ~MAP_##n; \
391         }
392
393         P_MMAP_FLAG(SHARED);
394         P_MMAP_FLAG(PRIVATE);
395 #ifdef MAP_32BIT
396         P_MMAP_FLAG(32BIT);
397 #endif
398         P_MMAP_FLAG(ANONYMOUS);
399         P_MMAP_FLAG(DENYWRITE);
400         P_MMAP_FLAG(EXECUTABLE);
401         P_MMAP_FLAG(FILE);
402         P_MMAP_FLAG(FIXED);
403         P_MMAP_FLAG(GROWSDOWN);
404 #ifdef MAP_HUGETLB
405         P_MMAP_FLAG(HUGETLB);
406 #endif
407         P_MMAP_FLAG(LOCKED);
408         P_MMAP_FLAG(NONBLOCK);
409         P_MMAP_FLAG(NORESERVE);
410         P_MMAP_FLAG(POPULATE);
411         P_MMAP_FLAG(STACK);
412 #ifdef MAP_UNINITIALIZED
413         P_MMAP_FLAG(UNINITIALIZED);
414 #endif
415 #undef P_MMAP_FLAG
416
417         if (flags)
418                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
419
420         return printed;
421 }
422
423 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
424
425 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
426                                                   struct syscall_arg *arg)
427 {
428         int printed = 0, flags = arg->val;
429
430 #define P_MREMAP_FLAG(n) \
431         if (flags & MREMAP_##n) { \
432                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
433                 flags &= ~MREMAP_##n; \
434         }
435
436         P_MREMAP_FLAG(MAYMOVE);
437 #ifdef MREMAP_FIXED
438         P_MREMAP_FLAG(FIXED);
439 #endif
440 #undef P_MREMAP_FLAG
441
442         if (flags)
443                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
444
445         return printed;
446 }
447
448 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
449
450 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
451                                                       struct syscall_arg *arg)
452 {
453         int behavior = arg->val;
454
455         switch (behavior) {
456 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
457         P_MADV_BHV(NORMAL);
458         P_MADV_BHV(RANDOM);
459         P_MADV_BHV(SEQUENTIAL);
460         P_MADV_BHV(WILLNEED);
461         P_MADV_BHV(DONTNEED);
462         P_MADV_BHV(REMOVE);
463         P_MADV_BHV(DONTFORK);
464         P_MADV_BHV(DOFORK);
465         P_MADV_BHV(HWPOISON);
466 #ifdef MADV_SOFT_OFFLINE
467         P_MADV_BHV(SOFT_OFFLINE);
468 #endif
469         P_MADV_BHV(MERGEABLE);
470         P_MADV_BHV(UNMERGEABLE);
471 #ifdef MADV_HUGEPAGE
472         P_MADV_BHV(HUGEPAGE);
473 #endif
474 #ifdef MADV_NOHUGEPAGE
475         P_MADV_BHV(NOHUGEPAGE);
476 #endif
477 #ifdef MADV_DONTDUMP
478         P_MADV_BHV(DONTDUMP);
479 #endif
480 #ifdef MADV_DODUMP
481         P_MADV_BHV(DODUMP);
482 #endif
483 #undef P_MADV_PHV
484         default: break;
485         }
486
487         return scnprintf(bf, size, "%#x", behavior);
488 }
489
490 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
491
492 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
493                                            struct syscall_arg *arg)
494 {
495         int printed = 0, op = arg->val;
496
497         if (op == 0)
498                 return scnprintf(bf, size, "NONE");
499 #define P_CMD(cmd) \
500         if ((op & LOCK_##cmd) == LOCK_##cmd) { \
501                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
502                 op &= ~LOCK_##cmd; \
503         }
504
505         P_CMD(SH);
506         P_CMD(EX);
507         P_CMD(NB);
508         P_CMD(UN);
509         P_CMD(MAND);
510         P_CMD(RW);
511         P_CMD(READ);
512         P_CMD(WRITE);
513 #undef P_OP
514
515         if (op)
516                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
517
518         return printed;
519 }
520
521 #define SCA_FLOCK syscall_arg__scnprintf_flock
522
523 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
524 {
525         enum syscall_futex_args {
526                 SCF_UADDR   = (1 << 0),
527                 SCF_OP      = (1 << 1),
528                 SCF_VAL     = (1 << 2),
529                 SCF_TIMEOUT = (1 << 3),
530                 SCF_UADDR2  = (1 << 4),
531                 SCF_VAL3    = (1 << 5),
532         };
533         int op = arg->val;
534         int cmd = op & FUTEX_CMD_MASK;
535         size_t printed = 0;
536
537         switch (cmd) {
538 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
539         P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
540         P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
541         P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
542         P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
543         P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
544         P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
545         P_FUTEX_OP(WAKE_OP);                                                      break;
546         P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
547         P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
548         P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
549         P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
550         P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
551         P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
552         default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
553         }
554
555         if (op & FUTEX_PRIVATE_FLAG)
556                 printed += scnprintf(bf + printed, size - printed, "|PRIV");
557
558         if (op & FUTEX_CLOCK_REALTIME)
559                 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
560
561         return printed;
562 }
563
564 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
565
566 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
567 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
568
/* Names of the interval timers, indexed from 0. */
static const char *itimers[] = {
        "REAL",     /* 0 */
        "VIRTUAL",  /* 1 */
        "PROF",     /* 2 */
};
static DEFINE_STRARRAY(itimers);
571
/*
 * Names of the seek "whence" values, indexed from 0.  "DATA" and "HOLE" are
 * only present when the headers define SEEK_DATA and SEEK_HOLE.
 */
static const char *whences[] = {
        "SET",
        "CUR",
        "END",
#ifdef SEEK_DATA
        "DATA",
#endif
#ifdef SEEK_HOLE
        "HOLE",
#endif
};
static DEFINE_STRARRAY(whences);
581
/*
 * Names of the fcntl() commands, indexed from 0.  Note that only the later
 * entries carry the "F_" prefix.
 */
static const char *fcntl_cmds[] = {
        /*  0 */ "DUPFD", "GETFD", "SETFD", "GETFL",
        /*  4 */ "SETFL", "GETLK", "SETLK", "SETLKW",
        /*  8 */ "SETOWN", "GETOWN", "SETSIG", "GETSIG",
        /* 12 */ "F_GETLK64", "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX",
        /* 16 */ "F_GETOWN_EX", "F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);
589
/* Names of the resource limits, indexed from 0. */
static const char *rlimit_resources[] = {
        /*  0 */ "CPU", "FSIZE", "DATA", "STACK",
        /*  4 */ "CORE", "RSS", "NPROC", "NOFILE",
        /*  8 */ "MEMLOCK", "AS", "LOCKS", "SIGPENDING",
        /* 12 */ "MSGQUEUE", "NICE", "RTPRIO", "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);
596
/* Names of the signal mask "how" values, indexed from 0. */
static const char *sighow[] = {
        "BLOCK",    /* 0 */
        "UNBLOCK",  /* 1 */
        "SETMASK",  /* 2 */
};
static DEFINE_STRARRAY(sighow);
599
/* Names of the clock ids, indexed from 0. */
static const char *clockid[] = {
        "REALTIME",            /* 0 */
        "MONOTONIC",           /* 1 */
        "PROCESS_CPUTIME_ID",  /* 2 */
        "THREAD_CPUTIME_ID",   /* 3 */
        "MONOTONIC_RAW",       /* 4 */
        "REALTIME_COARSE",     /* 5 */
        "MONOTONIC_COARSE",    /* 6 */
};
static DEFINE_STRARRAY(clockid);
605
/* Names of the socket address families, indexed from 0. */
static const char *socket_families[] = {
        /*  0 */ "UNSPEC", "LOCAL", "INET", "AX25", "IPX",
        /*  5 */ "APPLETALK", "NETROM", "BRIDGE", "ATMPVC", "X25",
        /* 10 */ "INET6", "ROSE", "DECnet", "NETBEUI", "SECURITY",
        /* 15 */ "KEY", "NETLINK", "PACKET", "ASH", "ECONET",
        /* 20 */ "ATMSVC", "RDS", "SNA", "IRDA", "PPPOX",
        /* 25 */ "WANPIPE", "LLC", "IB", "CAN", "TIPC",
        /* 30 */ "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET",
        /* 35 */ "IEEE802154", "CAIF", "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
615
616 #ifndef SOCK_TYPE_MASK
617 #define SOCK_TYPE_MASK 0xf
618 #endif
619
620 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
621                                                       struct syscall_arg *arg)
622 {
623         size_t printed;
624         int type = arg->val,
625             flags = type & ~SOCK_TYPE_MASK;
626
627         type &= SOCK_TYPE_MASK;
628         /*
629          * Can't use a strarray, MIPS may override for ABI reasons.
630          */
631         switch (type) {
632 #define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
633         P_SK_TYPE(STREAM);
634         P_SK_TYPE(DGRAM);
635         P_SK_TYPE(RAW);
636         P_SK_TYPE(RDM);
637         P_SK_TYPE(SEQPACKET);
638         P_SK_TYPE(DCCP);
639         P_SK_TYPE(PACKET);
640 #undef P_SK_TYPE
641         default:
642                 printed = scnprintf(bf, size, "%#x", type);
643         }
644
645 #define P_SK_FLAG(n) \
646         if (flags & SOCK_##n) { \
647                 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
648                 flags &= ~SOCK_##n; \
649         }
650
651         P_SK_FLAG(CLOEXEC);
652         P_SK_FLAG(NONBLOCK);
653 #undef P_SK_FLAG
654
655         if (flags)
656                 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
657
658         return printed;
659 }
660
661 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
662
663 #ifndef MSG_PROBE
664 #define MSG_PROBE            0x10
665 #endif
666 #ifndef MSG_WAITFORONE
667 #define MSG_WAITFORONE  0x10000
668 #endif
669 #ifndef MSG_SENDPAGE_NOTLAST
670 #define MSG_SENDPAGE_NOTLAST 0x20000
671 #endif
672 #ifndef MSG_FASTOPEN
673 #define MSG_FASTOPEN         0x20000000
674 #endif
675
676 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
677                                                struct syscall_arg *arg)
678 {
679         int printed = 0, flags = arg->val;
680
681         if (flags == 0)
682                 return scnprintf(bf, size, "NONE");
683 #define P_MSG_FLAG(n) \
684         if (flags & MSG_##n) { \
685                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
686                 flags &= ~MSG_##n; \
687         }
688
689         P_MSG_FLAG(OOB);
690         P_MSG_FLAG(PEEK);
691         P_MSG_FLAG(DONTROUTE);
692         P_MSG_FLAG(TRYHARD);
693         P_MSG_FLAG(CTRUNC);
694         P_MSG_FLAG(PROBE);
695         P_MSG_FLAG(TRUNC);
696         P_MSG_FLAG(DONTWAIT);
697         P_MSG_FLAG(EOR);
698         P_MSG_FLAG(WAITALL);
699         P_MSG_FLAG(FIN);
700         P_MSG_FLAG(SYN);
701         P_MSG_FLAG(CONFIRM);
702         P_MSG_FLAG(RST);
703         P_MSG_FLAG(ERRQUEUE);
704         P_MSG_FLAG(NOSIGNAL);
705         P_MSG_FLAG(MORE);
706         P_MSG_FLAG(WAITFORONE);
707         P_MSG_FLAG(SENDPAGE_NOTLAST);
708         P_MSG_FLAG(FASTOPEN);
709         P_MSG_FLAG(CMSG_CLOEXEC);
710 #undef P_MSG_FLAG
711
712         if (flags)
713                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
714
715         return printed;
716 }
717
718 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
719
720 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
721                                                  struct syscall_arg *arg)
722 {
723         size_t printed = 0;
724         int mode = arg->val;
725
726         if (mode == F_OK) /* 0 */
727                 return scnprintf(bf, size, "F");
728 #define P_MODE(n) \
729         if (mode & n##_OK) { \
730                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
731                 mode &= ~n##_OK; \
732         }
733
734         P_MODE(R);
735         P_MODE(W);
736         P_MODE(X);
737 #undef P_MODE
738
739         if (mode)
740                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
741
742         return printed;
743 }
744
745 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
746
747 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
748                                                struct syscall_arg *arg)
749 {
750         int printed = 0, flags = arg->val;
751
752         if (!(flags & O_CREAT))
753                 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
754
755         if (flags == 0)
756                 return scnprintf(bf, size, "RDONLY");
757 #define P_FLAG(n) \
758         if (flags & O_##n) { \
759                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
760                 flags &= ~O_##n; \
761         }
762
763         P_FLAG(APPEND);
764         P_FLAG(ASYNC);
765         P_FLAG(CLOEXEC);
766         P_FLAG(CREAT);
767         P_FLAG(DIRECT);
768         P_FLAG(DIRECTORY);
769         P_FLAG(EXCL);
770         P_FLAG(LARGEFILE);
771         P_FLAG(NOATIME);
772         P_FLAG(NOCTTY);
773 #ifdef O_NONBLOCK
774         P_FLAG(NONBLOCK);
775 #elif O_NDELAY
776         P_FLAG(NDELAY);
777 #endif
778 #ifdef O_PATH
779         P_FLAG(PATH);
780 #endif
781         P_FLAG(RDWR);
782 #ifdef O_DSYNC
783         if ((flags & O_SYNC) == O_SYNC)
784                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
785         else {
786                 P_FLAG(DSYNC);
787         }
788 #else
789         P_FLAG(SYNC);
790 #endif
791         P_FLAG(TRUNC);
792         P_FLAG(WRONLY);
793 #undef P_FLAG
794
795         if (flags)
796                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
797
798         return printed;
799 }
800
801 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
802
803 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
804                                                 struct syscall_arg *arg)
805 {
806         int printed = 0, flags = arg->val;
807
808         if (flags == 0)
809                 return 0;
810
811 #define P_FLAG(n) \
812         if (flags & PERF_FLAG_##n) { \
813                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
814                 flags &= ~PERF_FLAG_##n; \
815         }
816
817         P_FLAG(FD_NO_GROUP);
818         P_FLAG(FD_OUTPUT);
819         P_FLAG(PID_CGROUP);
820         P_FLAG(FD_CLOEXEC);
821 #undef P_FLAG
822
823         if (flags)
824                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
825
826         return printed;
827 }
828
829 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
830
831 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
832                                                    struct syscall_arg *arg)
833 {
834         int printed = 0, flags = arg->val;
835
836         if (flags == 0)
837                 return scnprintf(bf, size, "NONE");
838 #define P_FLAG(n) \
839         if (flags & EFD_##n) { \
840                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
841                 flags &= ~EFD_##n; \
842         }
843
844         P_FLAG(SEMAPHORE);
845         P_FLAG(CLOEXEC);
846         P_FLAG(NONBLOCK);
847 #undef P_FLAG
848
849         if (flags)
850                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
851
852         return printed;
853 }
854
855 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
856
857 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
858                                                 struct syscall_arg *arg)
859 {
860         int printed = 0, flags = arg->val;
861
862 #define P_FLAG(n) \
863         if (flags & O_##n) { \
864                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
865                 flags &= ~O_##n; \
866         }
867
868         P_FLAG(CLOEXEC);
869         P_FLAG(NONBLOCK);
870 #undef P_FLAG
871
872         if (flags)
873                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
874
875         return printed;
876 }
877
878 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
879
880 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
881 {
882         int sig = arg->val;
883
884         switch (sig) {
885 #define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
886         P_SIGNUM(HUP);
887         P_SIGNUM(INT);
888         P_SIGNUM(QUIT);
889         P_SIGNUM(ILL);
890         P_SIGNUM(TRAP);
891         P_SIGNUM(ABRT);
892         P_SIGNUM(BUS);
893         P_SIGNUM(FPE);
894         P_SIGNUM(KILL);
895         P_SIGNUM(USR1);
896         P_SIGNUM(SEGV);
897         P_SIGNUM(USR2);
898         P_SIGNUM(PIPE);
899         P_SIGNUM(ALRM);
900         P_SIGNUM(TERM);
901         P_SIGNUM(CHLD);
902         P_SIGNUM(CONT);
903         P_SIGNUM(STOP);
904         P_SIGNUM(TSTP);
905         P_SIGNUM(TTIN);
906         P_SIGNUM(TTOU);
907         P_SIGNUM(URG);
908         P_SIGNUM(XCPU);
909         P_SIGNUM(XFSZ);
910         P_SIGNUM(VTALRM);
911         P_SIGNUM(PROF);
912         P_SIGNUM(WINCH);
913         P_SIGNUM(IO);
914         P_SIGNUM(PWR);
915         P_SIGNUM(SYS);
916 #ifdef SIGEMT
917         P_SIGNUM(EMT);
918 #endif
919 #ifdef SIGSTKFLT
920         P_SIGNUM(STKFLT);
921 #endif
922 #ifdef SIGSWI
923         P_SIGNUM(SWI);
924 #endif
925         default: break;
926         }
927
928         return scnprintf(bf, size, "%#x", sig);
929 }
930
931 #define SCA_SIGNUM syscall_arg__scnprintf_signum
932
933 #if defined(__i386__) || defined(__x86_64__)
934 /*
935  * FIXME: Make this available to all arches.
936  */
937 #define TCGETS          0x5401
938
939 static const char *tioctls[] = {
940         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
941         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
942         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
943         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
944         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
945         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
946         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
947         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
948         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
949         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
950         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
951         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
952         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
953         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
954         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
955 };
956
957 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
958 #endif /* defined(__i386__) || defined(__x86_64__) */
959
/*
 * Initializer fragment for a struct syscall_fmt entry: format argument
 * number <arg> through SCA_STRARRAY using the table strarray__<array>.
 * <name> is not used by the expansion.
 */
#define STRARRAY(arg, name, array) \
          .arg_scnprintf = { \
                  [arg] = SCA_STRARRAY, \
          }, \
          .arg_parm      = { \
                  [arg] = &strarray__##array, \
          }
963
/*
 * Per-syscall formatting hints.
 *
 * The table is looked up with bsearch() using strcmp() on ->name (see
 * syscall_fmt__find()), so the entries MUST stay sorted by name in strcmp()
 * order; an entry inserted out of order makes it, and possibly others,
 * unfindable.
 */
static struct syscall_fmt {
        /* Syscall name; the sort/search key of the table. */
        const char *name;
        /*
         * Alternative name tried for the "sys_enter_<name>" tracepoint when
         * the one built from ->name does not exist.
         */
        const char *alias;
        /* Optional formatter for each argument, indexed by argument position. */
        size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
        /* Optional per-argument parameter, handed to the formatter as arg->parm. */
        void       *arg_parm[6];
        /*
         * NOTE(review): the three flags below are consumed outside this
         * section; presumably they select how the return value is printed
         * (errno decoding, "Timeout" on 0, hexadecimal) - confirm at the
         * place where sys_exit is handled.
         */
        bool       errmsg;
        bool       timeout;
        bool       hexret;
} syscall_fmts[] = {
        { .name     = "access",     .errmsg = true,
          .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
        { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
        { .name     = "brk",        .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
        { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
        { .name     = "close",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
        { .name     = "connect",    .errmsg = true, },
        { .name     = "dup",        .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "dup2",       .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "dup3",       .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
        { .name     = "eventfd2",   .errmsg = true,
          .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
        { .name     = "faccessat",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "fadvise64",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fallocate",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fchdir",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fchmod",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fchmodat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "fchown",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fchownat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "fcntl",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */
                             [1] = SCA_STRARRAY, /* cmd */ },
          .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
        { .name     = "fdatasync",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "flock",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */
                             [1] = SCA_FLOCK, /* cmd */ }, },
        { .name     = "fsetxattr",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "fstatfs",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fsync",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "ftruncate", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "futex",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
        { .name     = "futimesat", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "getdents",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "getdents64", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
        { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
        { .name     = "ioctl",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
                             [1] = SCA_STRHEXARRAY, /* cmd */
                             [2] = SCA_HEX, /* arg */ },
          .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
                             [2] = SCA_HEX, /* arg */ }, },
#endif
        { .name     = "kill",       .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "linkat",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "lseek",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */
                             [2] = SCA_STRARRAY, /* whence */ },
          .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
        { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
        { .name     = "madvise",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX,      /* start */
                             [2] = SCA_MADV_BHV, /* behavior */ }, },
        { .name     = "mkdirat",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "mknodat",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "mlock",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "mlockall",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "mmap",       .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
                             [2] = SCA_MMAP_PROT, /* prot */
                             [3] = SCA_MMAP_FLAGS, /* flags */
                             [4] = SCA_FD,        /* fd */ }, },
        { .name     = "mprotect",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* start */
                             [2] = SCA_MMAP_PROT, /* prot */ }, },
        { .name     = "mremap",     .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */
                             [3] = SCA_MREMAP_FLAGS, /* flags */
                             [4] = SCA_HEX, /* new_addr */ }, },
        { .name     = "munlock",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "munmap",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "name_to_handle_at", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "newfstatat", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "open",       .errmsg = true,
          .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "open_by_handle_at", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "openat",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "perf_event_open", .errmsg = true,
          .arg_scnprintf = { [1] = SCA_INT, /* pid */
                             [2] = SCA_INT, /* cpu */
                             [3] = SCA_FD,  /* group_fd */
                             [4] = SCA_PERF_FLAGS,  /* flags */ }, },
        { .name     = "pipe2",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
        { .name     = "poll",       .errmsg = true, .timeout = true, },
        { .name     = "ppoll",      .errmsg = true, .timeout = true, },
        { .name     = "pread",      .errmsg = true, .alias = "pread64",
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "preadv",     .errmsg = true, .alias = "pread",
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
        { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "pwritev",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "read",       .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "readlinkat", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "readv",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "recvfrom",   .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "recvmmsg",   .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "recvmsg",    .errmsg = true,
          .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "renameat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "rt_sigaction", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
        { .name     = "rt_sigqueueinfo", .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "rt_tgsigqueueinfo", .errmsg = true,
          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "select",     .errmsg = true, .timeout = true, },
        { .name     = "sendmmsg",    .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "sendmsg",    .errmsg = true,
          .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "sendto",     .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
        { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
        { .name     = "shutdown",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "socket",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
                             [1] = SCA_SK_TYPE, /* type */ },
          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
        { .name     = "socketpair", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
                             [1] = SCA_SK_TYPE, /* type */ },
          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
        { .name     = "stat",       .errmsg = true, .alias = "newstat", },
        { .name     = "symlinkat",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "tgkill",     .errmsg = true,
          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "tkill",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "uname",      .errmsg = true, .alias = "newuname", },
        { .name     = "unlinkat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "utimensat",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
        { .name     = "write",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "writev",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
};
1173
1174 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1175 {
1176         const struct syscall_fmt *fmt = fmtp;
1177         return strcmp(name, fmt->name);
1178 }
1179
1180 static struct syscall_fmt *syscall_fmt__find(const char *name)
1181 {
1182         const int nmemb = ARRAY_SIZE(syscall_fmts);
1183         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1184 }
1185
/*
 * What is known about one syscall id; instances live in trace->syscalls.table
 * and are filled in by trace__read_syscall_info().
 */
struct syscall {
        /* Format of the "syscalls:sys_enter_<name>" (or alias) tracepoint. */
        struct event_format *tp_format;
        /* Number of entries in ->args. */
        int                 nr_args;
        /* Tracepoint fields describing the arguments, with a leading "nr" field dropped. */
        struct format_field *args;
        /* Syscall name as returned by audit_syscall_to_name(). */
        const char          *name;
        /* True for "exit" and "exit_group". */
        bool                is_exit;
        /* Matching syscall_fmts[] entry, or NULL when there is none. */
        struct syscall_fmt  *fmt;
        /* calloc()ed array of nr_args per-argument formatters; entries may be NULL. */
        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        /* Points at fmt->arg_parm when ->fmt is set. */
        void                **arg_parm;
};
1196
1197 static size_t fprintf_duration(unsigned long t, FILE *fp)
1198 {
1199         double duration = (double)t / NSEC_PER_MSEC;
1200         size_t printed = fprintf(fp, "(");
1201
1202         if (duration >= 1.0)
1203                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1204         else if (duration >= 0.01)
1205                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1206         else
1207                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1208         return printed + fprintf(fp, "): ");
1209 }
1210
/* Per-thread tracing state, stored as the thread's private data (thread__priv()). */
struct thread_trace {
        u64               entry_time;
        u64               exit_time;
        bool              entry_pending;
        /* Bumped by thread__trace() each time it is called for this thread. */
        unsigned long     nr_events;
        unsigned long     pfmaj, pfmin;
        char              *entry_str;
        double            runtime_ms;
        /* Cache mapping file descriptor numbers to path names. */
        struct {
                /* Highest valid index in ->table, -1 while the table is empty. */
                int       max;
                /* strdup()ed path per fd; NULL for descriptors with no known path. */
                char      **table;
        } paths;

        /* Created by thread_trace__new() with intlist__new(). */
        struct intlist *syscall_stats;
};
1226
1227 static struct thread_trace *thread_trace__new(void)
1228 {
1229         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1230
1231         if (ttrace)
1232                 ttrace->paths.max = -1;
1233
1234         ttrace->syscall_stats = intlist__new(NULL);
1235
1236         return ttrace;
1237 }
1238
1239 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1240 {
1241         struct thread_trace *ttrace;
1242
1243         if (thread == NULL)
1244                 goto fail;
1245
1246         if (thread__priv(thread) == NULL)
1247                 thread__set_priv(thread, thread_trace__new());
1248
1249         if (thread__priv(thread) == NULL)
1250                 goto fail;
1251
1252         ttrace = thread__priv(thread);
1253         ++ttrace->nr_events;
1254
1255         return ttrace;
1256 fail:
1257         color_fprintf(fp, PERF_COLOR_RED,
1258                       "WARNING: not enough memory, dropping samples!\n");
1259         return NULL;
1260 }
1261
/*
 * Bit flags for major / minor page faults.
 * NOTE(review): presumably OR-ed into trace->trace_pgfaults - confirm at the users.
 */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/*
 * Size in bytes of the syscall formatting buffer.
 * NOTE(review): presumably the allocation size of thread_trace->entry_str - confirm at the users.
 */
static const size_t trace__entry_str_size = 2048;
1266
/* Global state of one 'perf trace' session. */
struct trace {
        struct perf_tool        tool;
        struct {
                /* Machine type passed to the libaudit name <-> number helpers. */
                int             machine;
                int             open_id;
        }                       audit;
        struct {
                /* Highest valid index in ->table, -1 while the table is empty. */
                int             max;
                /* Per-syscall information, indexed by syscall id. */
                struct syscall  *table;
                struct {
                        struct perf_evsel *sys_enter,
                                          *sys_exit;
                }               events;
        } syscalls;
        struct record_opts      opts;
        struct perf_evlist      *evlist;
        /* Host machine, created by trace__symbols_init(). */
        struct machine          *host;
        struct thread           *current;
        /* Subtracted from sample timestamps before they are printed. */
        u64                     base_time;
        /* Stream that receives the trace output and error messages. */
        FILE                    *output;
        unsigned long           nr_events;
        /* Syscall names given by the user. */
        struct strlist          *ev_qualifier;
        /* ->ev_qualifier translated to ids by trace__validate_ev_qualifier(). */
        struct {
                size_t          nr;
                int             *entries;
        }                       ev_qualifier_ids;
        const char              *last_vfs_getname;
        struct intlist          *tid_list;
        struct intlist          *pid_list;
        struct {
                size_t          nr;
                pid_t           *entries;
        }                       filter_pids;
        /* Duration threshold in milliseconds, see trace__filter_duration(). */
        double                  duration_filter;
        double                  runtime_ms;
        struct {
                u64             vfs_getname,
                                /* Number of fd paths that had to be looked up via /proc. */
                                proc_getname;
        } stats;
        bool                    not_ev_qualifier;
        /* When false, unknown fd paths are not looked up in /proc. */
        bool                    live;
        bool                    full_time;
        bool                    sched;
        /* Prefix each line with the thread id (and comm, if ->show_comm). */
        bool                    multiple_threads;
        bool                    summary;
        bool                    summary_only;
        bool                    show_comm;
        bool                    show_tool_stats;
        bool                    trace_syscalls;
        bool                    force;
        bool                    vfs_getname;
        int                     trace_pgfaults;
};
1320
1321 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1322 {
1323         struct thread_trace *ttrace = thread__priv(thread);
1324
1325         if (fd > ttrace->paths.max) {
1326                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1327
1328                 if (npath == NULL)
1329                         return -1;
1330
1331                 if (ttrace->paths.max != -1) {
1332                         memset(npath + ttrace->paths.max + 1, 0,
1333                                (fd - ttrace->paths.max) * sizeof(char *));
1334                 } else {
1335                         memset(npath, 0, (fd + 1) * sizeof(char *));
1336                 }
1337
1338                 ttrace->paths.table = npath;
1339                 ttrace->paths.max   = fd;
1340         }
1341
1342         ttrace->paths.table[fd] = strdup(pathname);
1343
1344         return ttrace->paths.table[fd] != NULL ? 0 : -1;
1345 }
1346
1347 static int thread__read_fd_path(struct thread *thread, int fd)
1348 {
1349         char linkname[PATH_MAX], pathname[PATH_MAX];
1350         struct stat st;
1351         int ret;
1352
1353         if (thread->pid_ == thread->tid) {
1354                 scnprintf(linkname, sizeof(linkname),
1355                           "/proc/%d/fd/%d", thread->pid_, fd);
1356         } else {
1357                 scnprintf(linkname, sizeof(linkname),
1358                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1359         }
1360
1361         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1362                 return -1;
1363
1364         ret = readlink(linkname, pathname, sizeof(pathname));
1365
1366         if (ret < 0 || ret > st.st_size)
1367                 return -1;
1368
1369         pathname[ret] = '\0';
1370         return trace__set_fd_pathname(thread, fd, pathname);
1371 }
1372
1373 static const char *thread__fd_path(struct thread *thread, int fd,
1374                                    struct trace *trace)
1375 {
1376         struct thread_trace *ttrace = thread__priv(thread);
1377
1378         if (ttrace == NULL)
1379                 return NULL;
1380
1381         if (fd < 0)
1382                 return NULL;
1383
1384         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1385                 if (!trace->live)
1386                         return NULL;
1387                 ++trace->stats.proc_getname;
1388                 if (thread__read_fd_path(thread, fd))
1389                         return NULL;
1390         }
1391
1392         return ttrace->paths.table[fd];
1393 }
1394
1395 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1396                                         struct syscall_arg *arg)
1397 {
1398         int fd = arg->val;
1399         size_t printed = scnprintf(bf, size, "%d", fd);
1400         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1401
1402         if (path)
1403                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1404
1405         return printed;
1406 }
1407
1408 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1409                                               struct syscall_arg *arg)
1410 {
1411         int fd = arg->val;
1412         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1413         struct thread_trace *ttrace = thread__priv(arg->thread);
1414
1415         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1416                 zfree(&ttrace->paths.table[fd]);
1417
1418         return printed;
1419 }
1420
1421 static bool trace__filter_duration(struct trace *trace, double t)
1422 {
1423         return t < (trace->duration_filter * NSEC_PER_MSEC);
1424 }
1425
1426 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1427 {
1428         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1429
1430         return fprintf(fp, "%10.3f ", ts);
1431 }
1432
/*
 * Flags written from the signal handler and read by the main code.
 *
 * They are volatile sig_atomic_t because that is the only object type the C
 * standard lets a signal handler store to portably, and because volatile
 * keeps the compiler from caching a stale value in a loop that polls them.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination and remember whether the request came
 * from SIGINT.
 */
static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;
}
1441
1442 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1443                                         u64 duration, u64 tstamp, FILE *fp)
1444 {
1445         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1446         printed += fprintf_duration(duration, fp);
1447
1448         if (trace->multiple_threads) {
1449                 if (trace->show_comm)
1450                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1451                 printed += fprintf(fp, "%d ", thread->tid);
1452         }
1453
1454         return printed;
1455 }
1456
1457 static int trace__process_event(struct trace *trace, struct machine *machine,
1458                                 union perf_event *event, struct perf_sample *sample)
1459 {
1460         int ret = 0;
1461
1462         switch (event->header.type) {
1463         case PERF_RECORD_LOST:
1464                 color_fprintf(trace->output, PERF_COLOR_RED,
1465                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1466                 ret = machine__process_lost_event(machine, event, sample);
1467         default:
1468                 ret = machine__process_event(machine, event, sample);
1469                 break;
1470         }
1471
1472         return ret;
1473 }
1474
1475 static int trace__tool_process(struct perf_tool *tool,
1476                                union perf_event *event,
1477                                struct perf_sample *sample,
1478                                struct machine *machine)
1479 {
1480         struct trace *trace = container_of(tool, struct trace, tool);
1481         return trace__process_event(trace, machine, event, sample);
1482 }
1483
1484 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1485 {
1486         int err = symbol__init(NULL);
1487
1488         if (err)
1489                 return err;
1490
1491         trace->host = machine__new_host();
1492         if (trace->host == NULL)
1493                 return -ENOMEM;
1494
1495         if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1496                 return -errno;
1497
1498         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1499                                             evlist->threads, trace__tool_process, false,
1500                                             trace->opts.proc_map_timeout);
1501         if (err)
1502                 symbol__exit();
1503
1504         return err;
1505 }
1506
1507 static int syscall__set_arg_fmts(struct syscall *sc)
1508 {
1509         struct format_field *field;
1510         int idx = 0;
1511
1512         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1513         if (sc->arg_scnprintf == NULL)
1514                 return -1;
1515
1516         if (sc->fmt)
1517                 sc->arg_parm = sc->fmt->arg_parm;
1518
1519         for (field = sc->args; field; field = field->next) {
1520                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1521                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1522                 else if (field->flags & FIELD_IS_POINTER)
1523                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1524                 ++idx;
1525         }
1526
1527         return 0;
1528 }
1529
1530 static int trace__read_syscall_info(struct trace *trace, int id)
1531 {
1532         char tp_name[128];
1533         struct syscall *sc;
1534         const char *name = audit_syscall_to_name(id, trace->audit.machine);
1535
1536         if (name == NULL)
1537                 return -1;
1538
1539         if (id > trace->syscalls.max) {
1540                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1541
1542                 if (nsyscalls == NULL)
1543                         return -1;
1544
1545                 if (trace->syscalls.max != -1) {
1546                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1547                                (id - trace->syscalls.max) * sizeof(*sc));
1548                 } else {
1549                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1550                 }
1551
1552                 trace->syscalls.table = nsyscalls;
1553                 trace->syscalls.max   = id;
1554         }
1555
1556         sc = trace->syscalls.table + id;
1557         sc->name = name;
1558
1559         sc->fmt  = syscall_fmt__find(sc->name);
1560
1561         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1562         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1563
1564         if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1565                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1566                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1567         }
1568
1569         if (sc->tp_format == NULL)
1570                 return -1;
1571
1572         sc->args = sc->tp_format->format.fields;
1573         sc->nr_args = sc->tp_format->format.nr_fields;
1574         /* drop nr field - not relevant here; does not exist on older kernels */
1575         if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1576                 sc->args = sc->args->next;
1577                 --sc->nr_args;
1578         }
1579
1580         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1581
1582         return syscall__set_arg_fmts(sc);
1583 }
1584
1585 static int trace__validate_ev_qualifier(struct trace *trace)
1586 {
1587         int err = 0, i;
1588         struct str_node *pos;
1589
1590         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1591         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1592                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1593
1594         if (trace->ev_qualifier_ids.entries == NULL) {
1595                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1596                        trace->output);
1597                 err = -EINVAL;
1598                 goto out;
1599         }
1600
1601         i = 0;
1602
1603         strlist__for_each(pos, trace->ev_qualifier) {
1604                 const char *sc = pos->s;
1605                 int id = audit_name_to_syscall(sc, trace->audit.machine);
1606
1607                 if (id < 0) {
1608                         if (err == 0) {
1609                                 fputs("Error:\tInvalid syscall ", trace->output);
1610                                 err = -EINVAL;
1611                         } else {
1612                                 fputs(", ", trace->output);
1613                         }
1614
1615                         fputs(sc, trace->output);
1616                 }
1617
1618                 trace->ev_qualifier_ids.entries[i++] = id;
1619         }
1620
1621         if (err < 0) {
1622                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1623                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1624                 zfree(&trace->ev_qualifier_ids.entries);
1625                 trace->ev_qualifier_ids.nr = 0;
1626         }
1627 out:
1628         return err;
1629 }
1630
1631 /*
1632  * args is to be interpreted as a series of longs but we need to handle
1633  * 8-byte unaligned accesses. args points to raw_data within the event
1634  * and raw_data is guaranteed to be 8-byte unaligned because it is
1635  * preceded by raw_size which is a u32. So we need to copy args to a temp
1636  * variable to read it. Most notably this avoids extended load instructions
1637  * on unaligned addresses
1638  */
1639
1640 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1641                                       unsigned char *args, struct trace *trace,
1642                                       struct thread *thread)
1643 {
1644         size_t printed = 0;
1645         unsigned char *p;
1646         unsigned long val;
1647
1648         if (sc->args != NULL) {
1649                 struct format_field *field;
1650                 u8 bit = 1;
1651                 struct syscall_arg arg = {
1652                         .idx    = 0,
1653                         .mask   = 0,
1654                         .trace  = trace,
1655                         .thread = thread,
1656                 };
1657
1658                 for (field = sc->args; field;
1659                      field = field->next, ++arg.idx, bit <<= 1) {
1660                         if (arg.mask & bit)
1661                                 continue;
1662
1663                         /* special care for unaligned accesses */
1664                         p = args + sizeof(unsigned long) * arg.idx;
1665                         memcpy(&val, p, sizeof(val));
1666
1667                         /*
1668                          * Suppress this argument if its value is zero and
1669                          * and we don't have a string associated in an
1670                          * strarray for it.
1671                          */
1672                         if (val == 0 &&
1673                             !(sc->arg_scnprintf &&
1674                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1675                               sc->arg_parm[arg.idx]))
1676                                 continue;
1677
1678                         printed += scnprintf(bf + printed, size - printed,
1679                                              "%s%s: ", printed ? ", " : "", field->name);
1680                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1681                                 arg.val = val;
1682                                 if (sc->arg_parm)
1683                                         arg.parm = sc->arg_parm[arg.idx];
1684                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1685                                                                       size - printed, &arg);
1686                         } else {
1687                                 printed += scnprintf(bf + printed, size - printed,
1688                                                      "%ld", val);
1689                         }
1690                 }
1691         } else {
1692                 int i = 0;
1693
1694                 while (i < 6) {
1695                         /* special care for unaligned accesses */
1696                         p = args + sizeof(unsigned long) * i;
1697                         memcpy(&val, p, sizeof(val));
1698                         printed += scnprintf(bf + printed, size - printed,
1699                                              "%sarg%d: %ld",
1700                                              printed ? ", " : "", i, val);
1701                         ++i;
1702                 }
1703         }
1704
1705         return printed;
1706 }
1707
/*
 * Signature of the per-event callbacks stored in evsel->handler.
 * trace__process_sample() and trace__handle_event() fetch the handler
 * through this type and call it with the trace state, the evsel the sample
 * belongs to, the raw event and the parsed sample.
 */
1708 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1709                                   union perf_event *event,
1710                                   struct perf_sample *sample);
1711
1712 static struct syscall *trace__syscall_info(struct trace *trace,
1713                                            struct perf_evsel *evsel, int id)
1714 {
1715
1716         if (id < 0) {
1717
1718                 /*
1719                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1720                  * before that, leaving at a higher verbosity level till that is
1721                  * explained. Reproduced with plain ftrace with:
1722                  *
1723                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1724                  * grep "NR -1 " /t/trace_pipe
1725                  *
1726                  * After generating some load on the machine.
1727                  */
1728                 if (verbose > 1) {
1729                         static u64 n;
1730                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1731                                 id, perf_evsel__name(evsel), ++n);
1732                 }
1733                 return NULL;
1734         }
1735
1736         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1737             trace__read_syscall_info(trace, id))
1738                 goto out_cant_read;
1739
1740         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1741                 goto out_cant_read;
1742
1743         return &trace->syscalls.table[id];
1744
1745 out_cant_read:
1746         if (verbose) {
1747                 fprintf(trace->output, "Problems reading syscall %d", id);
1748                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1749                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1750                 fputs(" information\n", trace->output);
1751         }
1752         return NULL;
1753 }
1754
1755 static void thread__update_stats(struct thread_trace *ttrace,
1756                                  int id, struct perf_sample *sample)
1757 {
1758         struct int_node *inode;
1759         struct stats *stats;
1760         u64 duration = 0;
1761
1762         inode = intlist__findnew(ttrace->syscall_stats, id);
1763         if (inode == NULL)
1764                 return;
1765
1766         stats = inode->priv;
1767         if (stats == NULL) {
1768                 stats = malloc(sizeof(struct stats));
1769                 if (stats == NULL)
1770                         return;
1771                 init_stats(stats);
1772                 inode->priv = stats;
1773         }
1774
1775         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1776                 duration = sample->time - ttrace->entry_time;
1777
1778         update_stats(stats, duration);
1779 }
1780
1781 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1782 {
1783         struct thread_trace *ttrace;
1784         u64 duration;
1785         size_t printed;
1786
1787         if (trace->current == NULL)
1788                 return 0;
1789
1790         ttrace = thread__priv(trace->current);
1791
1792         if (!ttrace->entry_pending)
1793                 return 0;
1794
1795         duration = sample->time - ttrace->entry_time;
1796
1797         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1798         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1799         ttrace->entry_pending = false;
1800
1801         return printed;
1802 }
1803
1804 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1805                             union perf_event *event __maybe_unused,
1806                             struct perf_sample *sample)
1807 {
1808         char *msg;
1809         void *args;
1810         size_t printed = 0;
1811         struct thread *thread;
1812         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1813         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1814         struct thread_trace *ttrace;
1815
1816         if (sc == NULL)
1817                 return -1;
1818
1819         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1820         ttrace = thread__trace(thread, trace->output);
1821         if (ttrace == NULL)
1822                 goto out_put;
1823
1824         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1825
1826         if (ttrace->entry_str == NULL) {
1827                 ttrace->entry_str = malloc(trace__entry_str_size);
1828                 if (!ttrace->entry_str)
1829                         goto out_put;
1830         }
1831
1832         if (!trace->summary_only)
1833                 trace__printf_interrupted_entry(trace, sample);
1834
1835         ttrace->entry_time = sample->time;
1836         msg = ttrace->entry_str;
1837         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1838
1839         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1840                                            args, trace, thread);
1841
1842         if (sc->is_exit) {
1843                 if (!trace->duration_filter && !trace->summary_only) {
1844                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1845                         fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1846                 }
1847         } else
1848                 ttrace->entry_pending = true;
1849
1850         if (trace->current != thread) {
1851                 thread__put(trace->current);
1852                 trace->current = thread__get(thread);
1853         }
1854         err = 0;
1855 out_put:
1856         thread__put(thread);
1857         return err;
1858 }
1859
1860 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1861                            union perf_event *event __maybe_unused,
1862                            struct perf_sample *sample)
1863 {
1864         long ret;
1865         u64 duration = 0;
1866         struct thread *thread;
1867         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1868         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1869         struct thread_trace *ttrace;
1870
1871         if (sc == NULL)
1872                 return -1;
1873
1874         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1875         ttrace = thread__trace(thread, trace->output);
1876         if (ttrace == NULL)
1877                 goto out_put;
1878
1879         if (trace->summary)
1880                 thread__update_stats(ttrace, id, sample);
1881
1882         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1883
1884         if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1885                 trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1886                 trace->last_vfs_getname = NULL;
1887                 ++trace->stats.vfs_getname;
1888         }
1889
1890         ttrace->exit_time = sample->time;
1891
1892         if (ttrace->entry_time) {
1893                 duration = sample->time - ttrace->entry_time;
1894                 if (trace__filter_duration(trace, duration))
1895                         goto out;
1896         } else if (trace->duration_filter)
1897                 goto out;
1898
1899         if (trace->summary_only)
1900                 goto out;
1901
1902         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1903
1904         if (ttrace->entry_pending) {
1905                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1906         } else {
1907                 fprintf(trace->output, " ... [");
1908                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1909                 fprintf(trace->output, "]: %s()", sc->name);
1910         }
1911
1912         if (sc->fmt == NULL) {
1913 signed_print:
1914                 fprintf(trace->output, ") = %ld", ret);
1915         } else if (ret < 0 && sc->fmt->errmsg) {
1916                 char bf[STRERR_BUFSIZE];
1917                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1918                            *e = audit_errno_to_name(-ret);
1919
1920                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1921         } else if (ret == 0 && sc->fmt->timeout)
1922                 fprintf(trace->output, ") = 0 Timeout");
1923         else if (sc->fmt->hexret)
1924                 fprintf(trace->output, ") = %#lx", ret);
1925         else
1926                 goto signed_print;
1927
1928         fputc('\n', trace->output);
1929 out:
1930         ttrace->entry_pending = false;
1931         err = 0;
1932 out_put:
1933         thread__put(thread);
1934         return err;
1935 }
1936
1937 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1938                               union perf_event *event __maybe_unused,
1939                               struct perf_sample *sample)
1940 {
1941         trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1942         return 0;
1943 }
1944
1945 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1946                                      union perf_event *event __maybe_unused,
1947                                      struct perf_sample *sample)
1948 {
1949         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1950         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1951         struct thread *thread = machine__findnew_thread(trace->host,
1952                                                         sample->pid,
1953                                                         sample->tid);
1954         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1955
1956         if (ttrace == NULL)
1957                 goto out_dump;
1958
1959         ttrace->runtime_ms += runtime_ms;
1960         trace->runtime_ms += runtime_ms;
1961         thread__put(thread);
1962         return 0;
1963
1964 out_dump:
1965         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1966                evsel->name,
1967                perf_evsel__strval(evsel, sample, "comm"),
1968                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1969                runtime,
1970                perf_evsel__intval(evsel, sample, "vruntime"));
1971         thread__put(thread);
1972         return 0;
1973 }
1974
1975 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1976                                 union perf_event *event __maybe_unused,
1977                                 struct perf_sample *sample)
1978 {
1979         trace__printf_interrupted_entry(trace, sample);
1980         trace__fprintf_tstamp(trace, sample->time, trace->output);
1981
1982         if (trace->trace_syscalls)
1983                 fprintf(trace->output, "(         ): ");
1984
1985         fprintf(trace->output, "%s:", evsel->name);
1986
1987         if (evsel->tp_format) {
1988                 event_format__fprintf(evsel->tp_format, sample->cpu,
1989                                       sample->raw_data, sample->raw_size,
1990                                       trace->output);
1991         }
1992
1993         fprintf(trace->output, ")\n");
1994         return 0;
1995 }
1996
1997 static void print_location(FILE *f, struct perf_sample *sample,
1998                            struct addr_location *al,
1999                            bool print_dso, bool print_sym)
2000 {
2001
2002         if ((verbose || print_dso) && al->map)
2003                 fprintf(f, "%s@", al->map->dso->long_name);
2004
2005         if ((verbose || print_sym) && al->sym)
2006                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2007                         al->addr - al->sym->start);
2008         else if (al->map)
2009                 fprintf(f, "0x%" PRIx64, al->addr);
2010         else
2011                 fprintf(f, "0x%" PRIx64, sample->addr);
2012 }
2013
2014 static int trace__pgfault(struct trace *trace,
2015                           struct perf_evsel *evsel,
2016                           union perf_event *event,
2017                           struct perf_sample *sample)
2018 {
2019         struct thread *thread;
2020         u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
2021         struct addr_location al;
2022         char map_type = 'd';
2023         struct thread_trace *ttrace;
2024         int err = -1;
2025
2026         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2027         ttrace = thread__trace(thread, trace->output);
2028         if (ttrace == NULL)
2029                 goto out_put;
2030
2031         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2032                 ttrace->pfmaj++;
2033         else
2034                 ttrace->pfmin++;
2035
2036         if (trace->summary_only)
2037                 goto out;
2038
2039         thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
2040                               sample->ip, &al);
2041
2042         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2043
2044         fprintf(trace->output, "%sfault [",
2045                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2046                 "maj" : "min");
2047
2048         print_location(trace->output, sample, &al, false, true);
2049
2050         fprintf(trace->output, "] => ");
2051
2052         thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
2053                                    sample->addr, &al);
2054
2055         if (!al.map) {
2056                 thread__find_addr_location(thread, cpumode,
2057                                            MAP__FUNCTION, sample->addr, &al);
2058
2059                 if (al.map)
2060                         map_type = 'x';
2061                 else
2062                         map_type = '?';
2063         }
2064
2065         print_location(trace->output, sample, &al, true, false);
2066
2067         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2068 out:
2069         err = 0;
2070 out_put:
2071         thread__put(thread);
2072         return err;
2073 }
2074
2075 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2076 {
2077         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2078             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2079                 return false;
2080
2081         if (trace->pid_list || trace->tid_list)
2082                 return true;
2083
2084         return false;
2085 }
2086
2087 static int trace__process_sample(struct perf_tool *tool,
2088                                  union perf_event *event,
2089                                  struct perf_sample *sample,
2090                                  struct perf_evsel *evsel,
2091                                  struct machine *machine __maybe_unused)
2092 {
2093         struct trace *trace = container_of(tool, struct trace, tool);
2094         int err = 0;
2095
2096         tracepoint_handler handler = evsel->handler;
2097
2098         if (skip_sample(trace, sample))
2099                 return 0;
2100
2101         if (!trace->full_time && trace->base_time == 0)
2102                 trace->base_time = sample->time;
2103
2104         if (handler) {
2105                 ++trace->nr_events;
2106                 handler(trace, evsel, event, sample);
2107         }
2108
2109         return err;
2110 }
2111
2112 static int parse_target_str(struct trace *trace)
2113 {
2114         if (trace->opts.target.pid) {
2115                 trace->pid_list = intlist__new(trace->opts.target.pid);
2116                 if (trace->pid_list == NULL) {
2117                         pr_err("Error parsing process id string\n");
2118                         return -EINVAL;
2119                 }
2120         }
2121
2122         if (trace->opts.target.tid) {
2123                 trace->tid_list = intlist__new(trace->opts.target.tid);
2124                 if (trace->tid_list == NULL) {
2125                         pr_err("Error parsing thread id string\n");
2126                         return -EINVAL;
2127                 }
2128         }
2129
2130         return 0;
2131 }
2132
2133 static int trace__record(struct trace *trace, int argc, const char **argv)
2134 {
2135         unsigned int rec_argc, i, j;
2136         const char **rec_argv;
2137         const char * const record_args[] = {
2138                 "record",
2139                 "-R",
2140                 "-m", "1024",
2141                 "-c", "1",
2142         };
2143
2144         const char * const sc_args[] = { "-e", };
2145         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2146         const char * const majpf_args[] = { "-e", "major-faults" };
2147         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2148         const char * const minpf_args[] = { "-e", "minor-faults" };
2149         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2150
2151         /* +1 is for the event string below */
2152         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2153                 majpf_args_nr + minpf_args_nr + argc;
2154         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2155
2156         if (rec_argv == NULL)
2157                 return -ENOMEM;
2158
2159         j = 0;
2160         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2161                 rec_argv[j++] = record_args[i];
2162
2163         if (trace->trace_syscalls) {
2164                 for (i = 0; i < sc_args_nr; i++)
2165                         rec_argv[j++] = sc_args[i];
2166
2167                 /* event string may be different for older kernels - e.g., RHEL6 */
2168                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2169                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2170                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2171                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2172                 else {
2173                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2174                         return -1;
2175                 }
2176         }
2177
2178         if (trace->trace_pgfaults & TRACE_PFMAJ)
2179                 for (i = 0; i < majpf_args_nr; i++)
2180                         rec_argv[j++] = majpf_args[i];
2181
2182         if (trace->trace_pgfaults & TRACE_PFMIN)
2183                 for (i = 0; i < minpf_args_nr; i++)
2184                         rec_argv[j++] = minpf_args[i];
2185
2186         for (i = 0; i < (unsigned int)argc; i++)
2187                 rec_argv[j++] = argv[i];
2188
2189         return cmd_record(j, rec_argv, NULL);
2190 }
2191
/* Declared ahead of trace__run(), which calls it when trace->summary is set. */
2192 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2193
2194 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2195 {
2196         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2197         if (evsel == NULL)
2198                 return false;
2199
2200         if (perf_evsel__field(evsel, "pathname") == NULL) {
2201                 perf_evsel__delete(evsel);
2202                 return false;
2203         }
2204
2205         evsel->handler = trace__vfs_getname;
2206         perf_evlist__add(evlist, evsel);
2207         return true;
2208 }
2209
2210 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2211                                     u64 config)
2212 {
2213         struct perf_evsel *evsel;
2214         struct perf_event_attr attr = {
2215                 .type = PERF_TYPE_SOFTWARE,
2216                 .mmap_data = 1,
2217         };
2218
2219         attr.config = config;
2220         attr.sample_period = 1;
2221
2222         event_attr_init(&attr);
2223
2224         evsel = perf_evsel__new(&attr);
2225         if (!evsel)
2226                 return -ENOMEM;
2227
2228         evsel->handler = trace__pgfault;
2229         perf_evlist__add(evlist, evsel);
2230
2231         return 0;
2232 }
2233
2234 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2235 {
2236         const u32 type = event->header.type;
2237         struct perf_evsel *evsel;
2238
2239         if (!trace->full_time && trace->base_time == 0)
2240                 trace->base_time = sample->time;
2241
2242         if (type != PERF_RECORD_SAMPLE) {
2243                 trace__process_event(trace, trace->host, event, sample);
2244                 return;
2245         }
2246
2247         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2248         if (evsel == NULL) {
2249                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2250                 return;
2251         }
2252
2253         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2254             sample->raw_data == NULL) {
2255                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2256                        perf_evsel__name(evsel), sample->tid,
2257                        sample->cpu, sample->raw_size);
2258         } else {
2259                 tracepoint_handler handler = evsel->handler;
2260                 handler(trace, evsel, event, sample);
2261         }
2262 }
2263
2264 static int trace__add_syscall_newtp(struct trace *trace)
2265 {
2266         int ret = -1;
2267         struct perf_evlist *evlist = trace->evlist;
2268         struct perf_evsel *sys_enter, *sys_exit;
2269
2270         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2271         if (sys_enter == NULL)
2272                 goto out;
2273
2274         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2275                 goto out_delete_sys_enter;
2276
2277         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2278         if (sys_exit == NULL)
2279                 goto out_delete_sys_enter;
2280
2281         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2282                 goto out_delete_sys_exit;
2283
2284         perf_evlist__add(evlist, sys_enter);
2285         perf_evlist__add(evlist, sys_exit);
2286
2287         trace->syscalls.events.sys_enter = sys_enter;
2288         trace->syscalls.events.sys_exit  = sys_exit;
2289
2290         ret = 0;
2291 out:
2292         return ret;
2293
2294 out_delete_sys_exit:
2295         perf_evsel__delete_priv(sys_exit);
2296 out_delete_sys_enter:
2297         perf_evsel__delete_priv(sys_enter);
2298         goto out;
2299 }
2300
2301 static int trace__set_ev_qualifier_filter(struct trace *trace)
2302 {
2303         int err = -1;
2304         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2305                                                 trace->ev_qualifier_ids.nr,
2306                                                 trace->ev_qualifier_ids.entries);
2307
2308         if (filter == NULL)
2309                 goto out_enomem;
2310
2311         if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2312                 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2313
2314         free(filter);
2315 out:
2316         return err;
2317 out_enomem:
2318         errno = ENOMEM;
2319         goto out;
2320 }
2321
/*
 * Live tracing: set up the events selected in *trace, optionally start a
 * workload (when argc > 0, argv is the command to run), then read events
 * from the ring buffers and hand them to trace__handle_event() until the
 * global 'done'/'interrupted' flags end the loop or polling reports
 * nothing more to read.
 *
 * Before returning, the per-thread summary and the tool statistics are
 * printed when requested, the evlist is deleted and trace->evlist is
 * reset to NULL.
 *
 * Returns err as left by the last step that assigned it; the setup
 * failures handled below leave it negative and print a diagnostic to
 * trace->output.
 */
2322 static int trace__run(struct trace *trace, int argc, const char **argv)
2323 {
2324         struct perf_evlist *evlist = trace->evlist;
2325         struct perf_evsel *evsel;
2326         int err = -1, i;
2327         unsigned long before;
2328         const bool forks = argc > 0;
2329         bool draining = false;
2330
2331         trace->live = true;
2332
             /* Add the events selected by the trace options to the evlist. */
2333         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2334                 goto out_error_raw_syscalls;
2335
2336         if (trace->trace_syscalls)
2337                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2338
2339         if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2340             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2341                 goto out_error_mem;
2342         }
2343
2344         if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2345             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2346                 goto out_error_mem;
2347
2348         if (trace->sched &&
2349             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2350                                    trace__sched_stat_runtime))
2351                 goto out_error_sched_stat_runtime;
2352
2353         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2354         if (err < 0) {
2355                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2356                 goto out_delete_evlist;
2357         }
2358
2359         err = trace__symbols_init(trace, evlist);
2360         if (err < 0) {
2361                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2362                 goto out_delete_evlist;
2363         }
2364
2365         perf_evlist__config(evlist, &trace->opts);
2366
2367         signal(SIGCHLD, sig_handler);
2368         signal(SIGINT, sig_handler);
2369
2370         if (forks) {
2371                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2372                                                     argv, false, NULL);
2373                 if (err < 0) {
2374                         fprintf(trace->output, "Couldn't run the workload!\n");
2375                         goto out_delete_evlist;
2376                 }
2377         }
2378
2379         err = perf_evlist__open(evlist);
2380         if (err < 0)
2381                 goto out_error_open;
2382
2383         /*
2384          * Better not use !target__has_task() here because we need to cover the
2385          * case where no threads were specified in the command line, but a
2386          * workload was, and in that case we will fill in the thread_map when
2387          * we fork the workload in perf_evlist__prepare_workload.
2388          */
2389         if (trace->filter_pids.nr > 0)
2390                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2391         else if (thread_map__pid(evlist->threads, 0) == -1)
2392                 err = perf_evlist__set_filter_pid(evlist, getpid());
2393
2394         if (err < 0)
2395                 goto out_error_mem;
2396
2397         if (trace->ev_qualifier_ids.nr > 0) {
2398                 err = trace__set_ev_qualifier_filter(trace);
2399                 if (err < 0)
2400                         goto out_errno;
2401
2402                 pr_debug("event qualifier tracepoint filter: %s\n",
2403                          trace->syscalls.events.sys_exit->filter);
2404         }
2405
             /* On failure evsel is what out_error_apply_filters reports on. */
2406         err = perf_evlist__apply_filters(evlist, &evsel);
2407         if (err < 0)
2408                 goto out_error_apply_filters;
2409
2410         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2411         if (err < 0)
2412                 goto out_error_mmap;
2413
2414         if (!target__none(&trace->opts.target))
2415                 perf_evlist__enable(evlist);
2416
2417         if (forks)
2418                 perf_evlist__start_workload(evlist);
2419
2420         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2421                                   evlist->threads->nr > 1 ||
2422                                   perf_evlist__first(evlist)->attr.inherit;
2423 again:
             /* Snapshot, to tell below whether this pass consumed any event. */
2424         before = trace->nr_events;
2425
2426         for (i = 0; i < evlist->nr_mmaps; i++) {
2427                 union perf_event *event;
2428
2429                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2430                         struct perf_sample sample;
2431
2432                         ++trace->nr_events;
2433
2434                         err = perf_evlist__parse_sample(evlist, event, &sample);
2435                         if (err) {
2436                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2437                                 goto next_event;
2438                         }
2439
2440                         trace__handle_event(trace, event, &sample);
2441 next_event:
2442                         perf_evlist__mmap_consume(evlist, i);
2443
2444                         if (interrupted)
2445                                 goto out_disable;
2446
                             /*
                              * Once 'done' is set, disable the events a single
                              * time and keep reading what is already buffered.
                              */
2447                         if (done && !draining) {
2448                                 perf_evlist__disable(evlist);
2449                                 draining = true;
2450                         }
2451                 }
2452         }
2453
2454         if (trace->nr_events == before) {
                     /* Nothing was read: poll, unless already draining. */
2455                 int timeout = done ? 100 : -1;
2456
2457                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2458                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2459                                 draining = true;
2460
2461                         goto again;
2462                 }
2463         } else {
2464                 goto again;
2465         }
2466
2467 out_disable:
2468         thread__zput(trace->current);
2469
2470         perf_evlist__disable(evlist);
2471
             /*
              * NOTE(review): err here is whatever the most recent assignment
              * left - the last perf_evlist__parse_sample() result if any event
              * was read, otherwise the perf_evlist__mmap() result - so a parse
              * failure on the final event suppresses the summary. Confirm this
              * is intended.
              */
2472         if (!err) {
2473                 if (trace->summary)
2474                         trace__fprintf_thread_summary(trace, trace->output);
2475
2476                 if (trace->show_tool_stats) {
2477                         fprintf(trace->output, "Stats:\n "
2478                                                " vfs_getname : %" PRIu64 "\n"
2479                                                " proc_getname: %" PRIu64 "\n",
2480                                 trace->stats.vfs_getname,
2481                                 trace->stats.proc_getname);
2482                 }
2483         }
2484
2485 out_delete_evlist:
2486         perf_evlist__delete(evlist);
2487         trace->evlist = NULL;
2488         trace->live = false;
2489         return err;
             /*
              * Not reachable by falling through (the return above precedes it):
              * this block only exists to scope errbuf for the error labels
              * inside it, which are entered by goto.
              */
2490 {
2491         char errbuf[BUFSIZ];
2492
2493 out_error_sched_stat_runtime:
2494         debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2495         goto out_error;
2496
2497 out_error_raw_syscalls:
2498         debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2499         goto out_error;
2500
2501 out_error_mmap:
2502         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2503         goto out_error;
2504
2505 out_error_open:
2506         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
             /* falls through to out_error */
2507
2508 out_error:
2509         fprintf(trace->output, "%s\n", errbuf);
2510         goto out_delete_evlist;
2511
2512 out_error_apply_filters:
2513         fprintf(trace->output,
2514                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2515                 evsel->filter, perf_evsel__name(evsel), errno,
2516                 strerror_r(errno, errbuf, sizeof(errbuf)));
2517         goto out_delete_evlist;
2518 }
2519 out_error_mem:
2520         fprintf(trace->output, "Not enough memory to run!\n");
2521         goto out_delete_evlist;
2522
2523 out_errno:
2524         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2525         goto out_delete_evlist;
2526 }
2527
2528 static int trace__replay(struct trace *trace)
2529 {
2530         const struct perf_evsel_str_handler handlers[] = {
2531                 { "probe:vfs_getname",       trace__vfs_getname, },
2532         };
2533         struct perf_data_file file = {
2534                 .path  = input_name,
2535                 .mode  = PERF_DATA_MODE_READ,
2536                 .force = trace->force,
2537         };
2538         struct perf_session *session;
2539         struct perf_evsel *evsel;
2540         int err = -1;
2541
2542         trace->tool.sample        = trace__process_sample;
2543         trace->tool.mmap          = perf_event__process_mmap;
2544         trace->tool.mmap2         = perf_event__process_mmap2;
2545         trace->tool.comm          = perf_event__process_comm;
2546         trace->tool.exit          = perf_event__process_exit;
2547         trace->tool.fork          = perf_event__process_fork;
2548         trace->tool.attr          = perf_event__process_attr;
2549         trace->tool.tracing_data = perf_event__process_tracing_data;
2550         trace->tool.build_id      = perf_event__process_build_id;
2551
2552         trace->tool.ordered_events = true;
2553         trace->tool.ordering_requires_timestamps = true;
2554
2555         /* add tid to output */
2556         trace->multiple_threads = true;
2557
2558         session = perf_session__new(&file, false, &trace->tool);
2559         if (session == NULL)
2560                 return -1;
2561
2562         if (symbol__init(&session->header.env) < 0)
2563                 goto out;
2564
2565         trace->host = &session->machines.host;
2566
2567         err = perf_session__set_tracepoints_handlers(session, handlers);
2568         if (err)
2569                 goto out;
2570
2571         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2572                                                      "raw_syscalls:sys_enter");
2573         /* older kernels have syscalls tp versus raw_syscalls */
2574         if (evsel == NULL)
2575                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2576                                                              "syscalls:sys_enter");
2577
2578         if (evsel &&
2579             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2580             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2581                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2582                 goto out;
2583         }
2584
2585         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2586                                                      "raw_syscalls:sys_exit");
2587         if (evsel == NULL)
2588                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2589                                                              "syscalls:sys_exit");
2590         if (evsel &&
2591             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2592             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2593                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2594                 goto out;
2595         }
2596
2597         evlist__for_each(session->evlist, evsel) {
2598                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2599                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2600                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2601                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2602                         evsel->handler = trace__pgfault;
2603         }
2604
2605         err = parse_target_str(trace);
2606         if (err != 0)
2607                 goto out;
2608
2609         setup_pager();
2610
2611         err = perf_session__process_events(session);
2612         if (err)
2613                 pr_err("Failed to process events, error %d", err);
2614
2615         else if (trace->summary)
2616                 trace__fprintf_thread_summary(trace, trace->output);
2617
2618 out:
2619         perf_session__delete(session);
2620
2621         return err;
2622 }
2623
/*
 * Write the banner that introduces the per-thread summary to @fp.
 *
 * Returns the value reported by fprintf(), converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2632
2633 static size_t thread__dump_stats(struct thread_trace *ttrace,
2634                                  struct trace *trace, FILE *fp)
2635 {
2636         struct stats *stats;
2637         size_t printed = 0;
2638         struct syscall *sc;
2639         struct int_node *inode = intlist__first(ttrace->syscall_stats);
2640
2641         if (inode == NULL)
2642                 return 0;
2643
2644         printed += fprintf(fp, "\n");
2645
2646         printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2647         printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2648         printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2649
2650         /* each int_node is a syscall */
2651         while (inode) {
2652                 stats = inode->priv;
2653                 if (stats) {
2654                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2655                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2656                         double avg = avg_stats(stats);
2657                         double pct;
2658                         u64 n = (u64) stats->n;
2659
2660                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2661                         avg /= NSEC_PER_MSEC;
2662
2663                         sc = &trace->syscalls.table[inode->i];
2664                         printed += fprintf(fp, "   %-15s", sc->name);
2665                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2666                                            n, min, avg);
2667                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2668                 }
2669
2670                 inode = intlist__next(inode);
2671         }
2672
2673         printed += fprintf(fp, "\n\n");
2674
2675         return printed;
2676 }
2677
/*
 * Context handed to the per-thread summary callback:
 *
 * @fp:      stream the summary is written to
 * @trace:   trace session the summarised threads belong to
 * @printed: running total of the fprintf() return values
 */
struct summary_data {
	FILE		*fp;
	struct trace	*trace;
	size_t		printed;
};
2684
2685 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2686 {
2687         struct summary_data *data = priv;
2688         FILE *fp = data->fp;
2689         size_t printed = data->printed;
2690         struct trace *trace = data->trace;
2691         struct thread_trace *ttrace = thread__priv(thread);
2692         double ratio;
2693
2694         if (ttrace == NULL)
2695                 return 0;
2696
2697         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2698
2699         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2700         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2701         printed += fprintf(fp, "%.1f%%", ratio);
2702         if (ttrace->pfmaj)
2703                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2704         if (ttrace->pfmin)
2705                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2706         printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2707         printed += thread__dump_stats(ttrace, trace, fp);
2708
2709         data->printed += printed;
2710
2711         return 0;
2712 }
2713
2714 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2715 {
2716         struct summary_data data = {
2717                 .fp = fp,
2718                 .trace = trace
2719         };
2720         data.printed = trace__fprintf_threads_header(fp);
2721
2722         machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2723
2724         return data.printed;
2725 }
2726
2727 static int trace__set_duration(const struct option *opt, const char *str,
2728                                int unset __maybe_unused)
2729 {
2730         struct trace *trace = opt->value;
2731
2732         trace->duration_filter = atof(str);
2733         return 0;
2734 }
2735
2736 static int trace__set_filter_pids(const struct option *opt, const char *str,
2737                                   int unset __maybe_unused)
2738 {
2739         int ret = -1;
2740         size_t i;
2741         struct trace *trace = opt->value;
2742         /*
2743          * FIXME: introduce a intarray class, plain parse csv and create a
2744          * { int nr, int entries[] } struct...
2745          */
2746         struct intlist *list = intlist__new(str);
2747
2748         if (list == NULL)
2749                 return -1;
2750
2751         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2752         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2753
2754         if (trace->filter_pids.entries == NULL)
2755                 goto out;
2756
2757         trace->filter_pids.entries[0] = getpid();
2758
2759         for (i = 1; i < trace->filter_pids.nr; ++i)
2760                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2761
2762         intlist__delete(list);
2763         ret = 0;
2764 out:
2765         return ret;
2766 }
2767
2768 static int trace__open_output(struct trace *trace, const char *filename)
2769 {
2770         struct stat st;
2771
2772         if (!stat(filename, &st) && st.st_size) {
2773                 char oldname[PATH_MAX];
2774
2775                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2776                 unlink(oldname);
2777                 rename(filename, oldname);
2778         }
2779
2780         trace->output = fopen(filename, "w");
2781
2782         return trace->output == NULL ? -errno : 0;
2783 }
2784
2785 static int parse_pagefaults(const struct option *opt, const char *str,
2786                             int unset __maybe_unused)
2787 {
2788         int *trace_pgfaults = opt->value;
2789
2790         if (strcmp(str, "all") == 0)
2791                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2792         else if (strcmp(str, "maj") == 0)
2793                 *trace_pgfaults |= TRACE_PFMAJ;
2794         else if (strcmp(str, "min") == 0)
2795                 *trace_pgfaults |= TRACE_PFMIN;
2796         else
2797                 return -1;
2798
2799         return 0;
2800 }
2801
2802 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2803 {
2804         struct perf_evsel *evsel;
2805
2806         evlist__for_each(evlist, evsel)
2807                 evsel->handler = handler;
2808 }
2809
2810 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2811 {
2812         const char *trace_usage[] = {
2813                 "perf trace [<options>] [<command>]",
2814                 "perf trace [<options>] -- <command> [<options>]",
2815                 "perf trace record [<options>] [<command>]",
2816                 "perf trace record [<options>] -- <command> [<options>]",
2817                 NULL
2818         };
2819         struct trace trace = {
2820                 .audit = {
2821                         .machine = audit_detect_machine(),
2822                         .open_id = audit_name_to_syscall("open", trace.audit.machine),
2823                 },
2824                 .syscalls = {
2825                         . max = -1,
2826                 },
2827                 .opts = {
2828                         .target = {
2829                                 .uid       = UINT_MAX,
2830                                 .uses_mmap = true,
2831                         },
2832                         .user_freq     = UINT_MAX,
2833                         .user_interval = ULLONG_MAX,
2834                         .no_buffering  = true,
2835                         .mmap_pages    = UINT_MAX,
2836                         .proc_map_timeout  = 500,
2837                 },
2838                 .output = stdout,
2839                 .show_comm = true,
2840                 .trace_syscalls = true,
2841         };
2842         const char *output_name = NULL;
2843         const char *ev_qualifier_str = NULL;
2844         const struct option trace_options[] = {
2845         OPT_CALLBACK(0, "event", &trace.evlist, "event",
2846                      "event selector. use 'perf list' to list available events",
2847                      parse_events_option),
2848         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2849                     "show the thread COMM next to its id"),
2850         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2851         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2852         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2853         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2854         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2855                     "trace events on existing process id"),
2856         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2857                     "trace events on existing thread id"),
2858         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2859                      "pids to filter (by the kernel)", trace__set_filter_pids),
2860         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2861                     "system-wide collection from all CPUs"),
2862         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2863                     "list of cpus to monitor"),
2864         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2865                     "child tasks do not inherit counters"),
2866         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2867                      "number of mmap data pages",
2868                      perf_evlist__parse_mmap_pages),
2869         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2870                    "user to profile"),
2871         OPT_CALLBACK(0, "duration", &trace, "float",
2872                      "show only events with duration > N.M ms",
2873                      trace__set_duration),
2874         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2875         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2876         OPT_BOOLEAN('T', "time", &trace.full_time,
2877                     "Show full timestamp, not time relative to first start"),
2878         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2879                     "Show only syscall summary with statistics"),
2880         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2881                     "Show all syscalls and summary with statistics"),
2882         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2883                      "Trace pagefaults", parse_pagefaults, "maj"),
2884         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2885         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2886         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2887                         "per thread proc mmap processing timeout in ms"),
2888         OPT_END()
2889         };
2890         const char * const trace_subcommands[] = { "record", NULL };
2891         int err;
2892         char bf[BUFSIZ];
2893
2894         signal(SIGSEGV, sighandler_dump_stack);
2895         signal(SIGFPE, sighandler_dump_stack);
2896
2897         trace.evlist = perf_evlist__new();
2898
2899         if (trace.evlist == NULL) {
2900                 pr_err("Not enough memory to run!\n");
2901                 err = -ENOMEM;
2902                 goto out;
2903         }
2904
2905         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2906                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2907
2908         if (trace.trace_pgfaults) {
2909                 trace.opts.sample_address = true;
2910                 trace.opts.sample_time = true;
2911         }
2912
2913         if (trace.evlist->nr_entries > 0)
2914                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2915
2916         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2917                 return trace__record(&trace, argc-1, &argv[1]);
2918
2919         /* summary_only implies summary option, but don't overwrite summary if set */
2920         if (trace.summary_only)
2921                 trace.summary = trace.summary_only;
2922
2923         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2924             trace.evlist->nr_entries == 0 /* Was --events used? */) {
2925                 pr_err("Please specify something to trace.\n");
2926                 return -1;
2927         }
2928
2929         if (output_name != NULL) {
2930                 err = trace__open_output(&trace, output_name);
2931                 if (err < 0) {
2932                         perror("failed to create output file");
2933                         goto out;
2934                 }
2935         }
2936
2937         if (ev_qualifier_str != NULL) {
2938                 const char *s = ev_qualifier_str;
2939                 struct strlist_config slist_config = {
2940                         .dirname = system_path(STRACE_GROUPS_DIR),
2941                 };
2942
2943                 trace.not_ev_qualifier = *s == '!';
2944                 if (trace.not_ev_qualifier)
2945                         ++s;
2946                 trace.ev_qualifier = strlist__new(s, &slist_config);
2947                 if (trace.ev_qualifier == NULL) {
2948                         fputs("Not enough memory to parse event qualifier",
2949                               trace.output);
2950                         err = -ENOMEM;
2951                         goto out_close;
2952                 }
2953
2954                 err = trace__validate_ev_qualifier(&trace);
2955                 if (err)
2956                         goto out_close;
2957         }
2958
2959         err = target__validate(&trace.opts.target);
2960         if (err) {
2961                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2962                 fprintf(trace.output, "%s", bf);
2963                 goto out_close;
2964         }
2965
2966         err = target__parse_uid(&trace.opts.target);
2967         if (err) {
2968                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2969                 fprintf(trace.output, "%s", bf);
2970                 goto out_close;
2971         }
2972
2973         if (!argc && target__none(&trace.opts.target))
2974                 trace.opts.target.system_wide = true;
2975
2976         if (input_name)
2977                 err = trace__replay(&trace);
2978         else
2979                 err = trace__run(&trace, argc, argv);
2980
2981 out_close:
2982         if (output_name != NULL)
2983                 fclose(trace.output);
2984 out:
2985         return err;
2986 }