perf trace: Deref sys_enter pointer args with contents from probe:vfs_getname
[firefly-linux-kernel-4.4.55.git] / tools / perf / builtin-trace.c
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/exec_cmd.h"
7 #include "util/machine.h"
8 #include "util/session.h"
9 #include "util/thread.h"
10 #include "util/parse-options.h"
11 #include "util/strlist.h"
12 #include "util/intlist.h"
13 #include "util/thread_map.h"
14 #include "util/stat.h"
15 #include "trace-event.h"
16 #include "util/parse-events.h"
17
18 #include <libaudit.h>
19 #include <stdlib.h>
20 #include <sys/mman.h>
21 #include <linux/futex.h>
22
/*
 * For older distros: fallback values for constants that the system headers
 * included above may not provide.  Every definition is guarded by #ifndef,
 * so a value coming from the headers always takes precedence.
 */
#ifndef MAP_STACK
# define MAP_STACK              0x20000
#endif

#ifndef MADV_HWPOISON
# define MADV_HWPOISON          100
#endif

#ifndef MADV_MERGEABLE
# define MADV_MERGEABLE         12
#endif

#ifndef MADV_UNMERGEABLE
# define MADV_UNMERGEABLE       13
#endif

#ifndef EFD_SEMAPHORE
# define EFD_SEMAPHORE          1
#endif

#ifndef EFD_NONBLOCK
# define EFD_NONBLOCK           00004000
#endif

#ifndef EFD_CLOEXEC
# define EFD_CLOEXEC            02000000
#endif

#ifndef O_CLOEXEC
# define O_CLOEXEC              02000000
#endif

#ifndef SOCK_DCCP
# define SOCK_DCCP              6
#endif

#ifndef SOCK_CLOEXEC
# define SOCK_CLOEXEC           02000000
#endif

#ifndef SOCK_NONBLOCK
# define SOCK_NONBLOCK          00004000
#endif

#ifndef MSG_CMSG_CLOEXEC
# define MSG_CMSG_CLOEXEC       0x40000000
#endif

/* Flags decoded by syscall_arg__scnprintf_perf_flags(). */
#ifndef PERF_FLAG_FD_NO_GROUP
# define PERF_FLAG_FD_NO_GROUP          (1UL << 0)
#endif

#ifndef PERF_FLAG_FD_OUTPUT
# define PERF_FLAG_FD_OUTPUT            (1UL << 1)
#endif

#ifndef PERF_FLAG_PID_CGROUP
# define PERF_FLAG_PID_CGROUP           (1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#ifndef PERF_FLAG_FD_CLOEXEC
# define PERF_FLAG_FD_CLOEXEC           (1UL << 3) /* O_CLOEXEC */
#endif
87
88
/*
 * Accessor for one field of a tracepoint's raw sample payload: the field's
 * byte offset into sample->raw_data plus the function used to read it.
 * Only one of the two callbacks is meaningful for a given field, since
 * they share storage.
 */
struct tp_field {
        int offset;     /* byte offset of the field inside sample->raw_data */
        union {
                /* reads the field as an unsigned integer, widened to u64 */
                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
                /* returns the address of the field inside the raw data */
                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
        };
};
96
/*
 * Generates tp_field__u<bits>(): copies a <bits>-wide unsigned integer out
 * of the raw sample data at field->offset and returns it widened to u64.
 * The value is fetched with memcpy() rather than through a casted pointer -
 * presumably because the field is not guaranteed to be aligned (TODO confirm).
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return value;  \
}

/* Native-endian readers for 1, 2, 4 and 8 byte fields. */
TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
109
/*
 * Generates tp_field__swapped_u<bits>(): same as tp_field__u<bits>() but the
 * value is passed through bswap_<bits>() before being returned.  These are
 * the readers tp_field__init_uint() selects when its needs_swap is true.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return bswap_##bits(value);\
}

/* No 8-bit variant: a single byte has no byte order. */
TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
121
122 static int tp_field__init_uint(struct tp_field *field,
123                                struct format_field *format_field,
124                                bool needs_swap)
125 {
126         field->offset = format_field->offset;
127
128         switch (format_field->size) {
129         case 1:
130                 field->integer = tp_field__u8;
131                 break;
132         case 2:
133                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
134                 break;
135         case 4:
136                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
137                 break;
138         case 8:
139                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
140                 break;
141         default:
142                 return -1;
143         }
144
145         return 0;
146 }
147
148 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
149 {
150         return sample->raw_data + field->offset;
151 }
152
153 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
154 {
155         field->offset = format_field->offset;
156         field->pointer = tp_field__ptr;
157         return 0;
158 }
159
/*
 * Per-evsel private data (stored in evsel->priv) for the syscall
 * tracepoints: an accessor for the syscall id plus one for either the
 * arguments or the return value.  'args' and 'ret' share storage, so a
 * given evsel uses only one of them - presumably 'args' on the enter
 * tracepoint and 'ret' on the exit one (verify against the setup code).
 */
struct syscall_tp {
        struct tp_field id;
        union {
                struct tp_field args, ret;
        };
};
166
167 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
168                                           struct tp_field *field,
169                                           const char *name)
170 {
171         struct format_field *format_field = perf_evsel__field(evsel, name);
172
173         if (format_field == NULL)
174                 return -1;
175
176         return tp_field__init_uint(field, format_field, evsel->needs_swap);
177 }
178
/*
 * Initialise the struct syscall_tp member @name (found via evsel->priv) as an
 * unsigned integer field.  @name is used both as the member name and,
 * stringified, as the tracepoint field name to look up.  Evaluates to the
 * result of perf_evsel__init_tp_uint_field().  Relies on the GNU statement
 * expression extension; evsel->priv must already point to a syscall_tp.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
182
/*
 * Look up the tracepoint field called @name in @evsel and set up @field for
 * pointer-style access to it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct format_field *ff = perf_evsel__field(evsel, name);

        return ff != NULL ? tp_field__init_ptr(field, ff) : -1;
}
194
/*
 * Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field():
 * initialises the struct syscall_tp member @name (found via evsel->priv) for
 * pointer access, using the stringified @name as the tracepoint field name.
 * Evaluates to the result of perf_evsel__init_tp_ptr_field().
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
198
/*
 * Destroy an evsel whose ->priv was allocated by this file: release the
 * private data first, then delete the evsel itself.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
        /* must happen before the evsel (which holds ->priv) goes away */
        zfree(&evsel->priv);
        perf_evsel__delete(evsel);
}
204
205 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
206 {
207         evsel->priv = malloc(sizeof(struct syscall_tp));
208         if (evsel->priv != NULL) {
209                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
210                         goto out_delete;
211
212                 evsel->handler = handler;
213                 return 0;
214         }
215
216         return -ENOMEM;
217
218 out_delete:
219         zfree(&evsel->priv);
220         return -ENOENT;
221 }
222
/*
 * Create the evsel for the syscall tracepoint named @direction and attach
 * @handler to it.
 *
 * The "raw_syscalls" subsystem is tried first; when that fails the
 * "syscalls" subsystem is used instead.
 *
 * Returns the new evsel, or NULL if no tracepoint could be created or its
 * syscall state could not be initialised.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

        /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
        if (evsel == NULL)
                evsel = perf_evsel__newtp("syscalls", direction);

        if (evsel == NULL)
                return NULL;

        if (perf_evsel__init_syscall_tp(evsel, handler)) {
                perf_evsel__delete_priv(evsel);
                return NULL;
        }

        return evsel;
}
242
/*
 * Read the struct syscall_tp member @name from @sample as an unsigned
 * integer, through the reader installed in that member.  evsel->priv must
 * point to an initialised struct syscall_tp.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.integer(&fields->name, sample); })

/*
 * Same as perf_evsel__sc_tp_uint() but for members set up for pointer
 * access: evaluates to the pointer returned by the member's reader.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })
250
/*
 * Context handed to every syscall argument formatter
 * (the syscall_arg__scnprintf_*() functions).
 */
struct syscall_arg {
        unsigned long val;      /* raw value of the argument being formatted */
        struct thread *thread;
        struct trace  *trace;
        void          *parm;    /* formatter specific data, e.g. a struct strarray */
        u8            idx;      /* position of this argument in the syscall */
        u8            mask;     /* bit N set: formatters use this to mask argument N */
};
259
/*
 * Table mapping consecutive integer values to names: the value 'offset'
 * corresponds to entries[0], 'offset + 1' to entries[1] and so on.
 */
struct strarray {
        int         offset;
        int         nr_entries;
        const char **entries;
};

/*
 * Defines 'strarray__<array>' wrapping the string table @array, with the
 * first entry standing for value 0 (offset is left zero-initialised).
 * @array must be a real array, since ARRAY_SIZE() is applied to it.
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
        .nr_entries = ARRAY_SIZE(array), \
        .entries = array, \
}

/* Like DEFINE_STRARRAY(), but the first entry stands for the value @off. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
        .offset     = off, \
        .nr_entries = ARRAY_SIZE(array), \
        .entries = array, \
}
276
277 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
278                                                 const char *intfmt,
279                                                 struct syscall_arg *arg)
280 {
281         struct strarray *sa = arg->parm;
282         int idx = arg->val - sa->offset;
283
284         if (idx < 0 || idx >= sa->nr_entries)
285                 return scnprintf(bf, size, intfmt, arg->val);
286
287         return scnprintf(bf, size, "%s", sa->entries[idx]);
288 }
289
/*
 * Formatter for arguments described by a struct strarray (in arg->parm):
 * prints the entry name, or the value in decimal when it has no name.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        static const char decimal_fmt[] = "%d";

        return __syscall_arg__scnprintf_strarray(bf, size, decimal_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
297
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 *
 * Same as syscall_arg__scnprintf_strarray(), except that values without a
 * name are printed in hexadecimal.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
                                                 struct syscall_arg *arg)
{
        static const char hex_fmt[] = "%#x";

        return __syscall_arg__scnprintf_strarray(bf, size, hex_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
311
/*
 * File descriptor formatter.  Only declared here; the definition is not in
 * this part of the file.
 */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
                                        struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd
316
317 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
318                                            struct syscall_arg *arg)
319 {
320         int fd = arg->val;
321
322         if (fd == AT_FDCWD)
323                 return scnprintf(bf, size, "CWD");
324
325         return syscall_arg__scnprintf_fd(bf, size, arg);
326 }
327
328 #define SCA_FDAT syscall_arg__scnprintf_fd_at
329
/*
 * Formatter used for the fd argument of close().  Only declared here; the
 * definition is not in this part of the file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
334
335 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
336                                          struct syscall_arg *arg)
337 {
338         return scnprintf(bf, size, "%#lx", arg->val);
339 }
340
341 #define SCA_HEX syscall_arg__scnprintf_hex
342
343 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
344                                          struct syscall_arg *arg)
345 {
346         return scnprintf(bf, size, "%d", arg->val);
347 }
348
349 #define SCA_INT syscall_arg__scnprintf_int
350
351 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
352                                                struct syscall_arg *arg)
353 {
354         int printed = 0, prot = arg->val;
355
356         if (prot == PROT_NONE)
357                 return scnprintf(bf, size, "NONE");
358 #define P_MMAP_PROT(n) \
359         if (prot & PROT_##n) { \
360                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
361                 prot &= ~PROT_##n; \
362         }
363
364         P_MMAP_PROT(EXEC);
365         P_MMAP_PROT(READ);
366         P_MMAP_PROT(WRITE);
367 #ifdef PROT_SEM
368         P_MMAP_PROT(SEM);
369 #endif
370         P_MMAP_PROT(GROWSDOWN);
371         P_MMAP_PROT(GROWSUP);
372 #undef P_MMAP_PROT
373
374         if (prot)
375                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
376
377         return printed;
378 }
379
380 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
381
382 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
383                                                 struct syscall_arg *arg)
384 {
385         int printed = 0, flags = arg->val;
386
387 #define P_MMAP_FLAG(n) \
388         if (flags & MAP_##n) { \
389                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
390                 flags &= ~MAP_##n; \
391         }
392
393         P_MMAP_FLAG(SHARED);
394         P_MMAP_FLAG(PRIVATE);
395 #ifdef MAP_32BIT
396         P_MMAP_FLAG(32BIT);
397 #endif
398         P_MMAP_FLAG(ANONYMOUS);
399         P_MMAP_FLAG(DENYWRITE);
400         P_MMAP_FLAG(EXECUTABLE);
401         P_MMAP_FLAG(FILE);
402         P_MMAP_FLAG(FIXED);
403         P_MMAP_FLAG(GROWSDOWN);
404 #ifdef MAP_HUGETLB
405         P_MMAP_FLAG(HUGETLB);
406 #endif
407         P_MMAP_FLAG(LOCKED);
408         P_MMAP_FLAG(NONBLOCK);
409         P_MMAP_FLAG(NORESERVE);
410         P_MMAP_FLAG(POPULATE);
411         P_MMAP_FLAG(STACK);
412 #ifdef MAP_UNINITIALIZED
413         P_MMAP_FLAG(UNINITIALIZED);
414 #endif
415 #undef P_MMAP_FLAG
416
417         if (flags)
418                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
419
420         return printed;
421 }
422
423 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
424
425 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
426                                                   struct syscall_arg *arg)
427 {
428         int printed = 0, flags = arg->val;
429
430 #define P_MREMAP_FLAG(n) \
431         if (flags & MREMAP_##n) { \
432                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
433                 flags &= ~MREMAP_##n; \
434         }
435
436         P_MREMAP_FLAG(MAYMOVE);
437 #ifdef MREMAP_FIXED
438         P_MREMAP_FLAG(FIXED);
439 #endif
440 #undef P_MREMAP_FLAG
441
442         if (flags)
443                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
444
445         return printed;
446 }
447
448 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
449
450 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
451                                                       struct syscall_arg *arg)
452 {
453         int behavior = arg->val;
454
455         switch (behavior) {
456 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
457         P_MADV_BHV(NORMAL);
458         P_MADV_BHV(RANDOM);
459         P_MADV_BHV(SEQUENTIAL);
460         P_MADV_BHV(WILLNEED);
461         P_MADV_BHV(DONTNEED);
462         P_MADV_BHV(REMOVE);
463         P_MADV_BHV(DONTFORK);
464         P_MADV_BHV(DOFORK);
465         P_MADV_BHV(HWPOISON);
466 #ifdef MADV_SOFT_OFFLINE
467         P_MADV_BHV(SOFT_OFFLINE);
468 #endif
469         P_MADV_BHV(MERGEABLE);
470         P_MADV_BHV(UNMERGEABLE);
471 #ifdef MADV_HUGEPAGE
472         P_MADV_BHV(HUGEPAGE);
473 #endif
474 #ifdef MADV_NOHUGEPAGE
475         P_MADV_BHV(NOHUGEPAGE);
476 #endif
477 #ifdef MADV_DONTDUMP
478         P_MADV_BHV(DONTDUMP);
479 #endif
480 #ifdef MADV_DODUMP
481         P_MADV_BHV(DODUMP);
482 #endif
483 #undef P_MADV_PHV
484         default: break;
485         }
486
487         return scnprintf(bf, size, "%#x", behavior);
488 }
489
490 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
491
492 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
493                                            struct syscall_arg *arg)
494 {
495         int printed = 0, op = arg->val;
496
497         if (op == 0)
498                 return scnprintf(bf, size, "NONE");
499 #define P_CMD(cmd) \
500         if ((op & LOCK_##cmd) == LOCK_##cmd) { \
501                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
502                 op &= ~LOCK_##cmd; \
503         }
504
505         P_CMD(SH);
506         P_CMD(EX);
507         P_CMD(NB);
508         P_CMD(UN);
509         P_CMD(MAND);
510         P_CMD(RW);
511         P_CMD(READ);
512         P_CMD(WRITE);
513 #undef P_OP
514
515         if (op)
516                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
517
518         return printed;
519 }
520
521 #define SCA_FLOCK syscall_arg__scnprintf_flock
522
523 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
524 {
525         enum syscall_futex_args {
526                 SCF_UADDR   = (1 << 0),
527                 SCF_OP      = (1 << 1),
528                 SCF_VAL     = (1 << 2),
529                 SCF_TIMEOUT = (1 << 3),
530                 SCF_UADDR2  = (1 << 4),
531                 SCF_VAL3    = (1 << 5),
532         };
533         int op = arg->val;
534         int cmd = op & FUTEX_CMD_MASK;
535         size_t printed = 0;
536
537         switch (cmd) {
538 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
539         P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
540         P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
541         P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
542         P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
543         P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
544         P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
545         P_FUTEX_OP(WAKE_OP);                                                      break;
546         P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
547         P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
548         P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
549         P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
550         P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
551         P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
552         default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
553         }
554
555         if (op & FUTEX_PRIVATE_FLAG)
556                 printed += scnprintf(bf + printed, size - printed, "|PRIV");
557
558         if (op & FUTEX_CLOCK_REALTIME)
559                 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
560
561         return printed;
562 }
563
564 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
565
/*
 * Name tables used through struct strarray: the position of a string,
 * plus the table's offset, is the numeric value it stands for.
 */

/* epoll_ctl() 'op': offset 1, so the value 1 maps to "ADD". */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* Interval timer names, starting at value 0. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* Seek 'whence' names; DATA and HOLE only when the headers define them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl() command names, starting at value 0. */
static const char *fcntl_cmds[] = {
        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
        "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
        "F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* Resource limit names, starting at value 0. */
static const char *rlimit_resources[] = {
        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
        "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* Signal mask 'how' names, starting at value 0. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* Clock id names, starting at value 0 (used for clock_gettime's clk_id). */
static const char *clockid[] = {
        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
};
static DEFINE_STRARRAY(clockid);

/* Socket address/protocol family names, starting at value 0. */
static const char *socket_families[] = {
        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
        "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
        "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
        "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
615
616 #ifndef SOCK_TYPE_MASK
617 #define SOCK_TYPE_MASK 0xf
618 #endif
619
620 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
621                                                       struct syscall_arg *arg)
622 {
623         size_t printed;
624         int type = arg->val,
625             flags = type & ~SOCK_TYPE_MASK;
626
627         type &= SOCK_TYPE_MASK;
628         /*
629          * Can't use a strarray, MIPS may override for ABI reasons.
630          */
631         switch (type) {
632 #define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
633         P_SK_TYPE(STREAM);
634         P_SK_TYPE(DGRAM);
635         P_SK_TYPE(RAW);
636         P_SK_TYPE(RDM);
637         P_SK_TYPE(SEQPACKET);
638         P_SK_TYPE(DCCP);
639         P_SK_TYPE(PACKET);
640 #undef P_SK_TYPE
641         default:
642                 printed = scnprintf(bf, size, "%#x", type);
643         }
644
645 #define P_SK_FLAG(n) \
646         if (flags & SOCK_##n) { \
647                 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
648                 flags &= ~SOCK_##n; \
649         }
650
651         P_SK_FLAG(CLOEXEC);
652         P_SK_FLAG(NONBLOCK);
653 #undef P_SK_FLAG
654
655         if (flags)
656                 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
657
658         return printed;
659 }
660
661 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
662
663 #ifndef MSG_PROBE
664 #define MSG_PROBE            0x10
665 #endif
666 #ifndef MSG_WAITFORONE
667 #define MSG_WAITFORONE  0x10000
668 #endif
669 #ifndef MSG_SENDPAGE_NOTLAST
670 #define MSG_SENDPAGE_NOTLAST 0x20000
671 #endif
672 #ifndef MSG_FASTOPEN
673 #define MSG_FASTOPEN         0x20000000
674 #endif
675
676 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
677                                                struct syscall_arg *arg)
678 {
679         int printed = 0, flags = arg->val;
680
681         if (flags == 0)
682                 return scnprintf(bf, size, "NONE");
683 #define P_MSG_FLAG(n) \
684         if (flags & MSG_##n) { \
685                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
686                 flags &= ~MSG_##n; \
687         }
688
689         P_MSG_FLAG(OOB);
690         P_MSG_FLAG(PEEK);
691         P_MSG_FLAG(DONTROUTE);
692         P_MSG_FLAG(TRYHARD);
693         P_MSG_FLAG(CTRUNC);
694         P_MSG_FLAG(PROBE);
695         P_MSG_FLAG(TRUNC);
696         P_MSG_FLAG(DONTWAIT);
697         P_MSG_FLAG(EOR);
698         P_MSG_FLAG(WAITALL);
699         P_MSG_FLAG(FIN);
700         P_MSG_FLAG(SYN);
701         P_MSG_FLAG(CONFIRM);
702         P_MSG_FLAG(RST);
703         P_MSG_FLAG(ERRQUEUE);
704         P_MSG_FLAG(NOSIGNAL);
705         P_MSG_FLAG(MORE);
706         P_MSG_FLAG(WAITFORONE);
707         P_MSG_FLAG(SENDPAGE_NOTLAST);
708         P_MSG_FLAG(FASTOPEN);
709         P_MSG_FLAG(CMSG_CLOEXEC);
710 #undef P_MSG_FLAG
711
712         if (flags)
713                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
714
715         return printed;
716 }
717
718 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
719
720 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
721                                                  struct syscall_arg *arg)
722 {
723         size_t printed = 0;
724         int mode = arg->val;
725
726         if (mode == F_OK) /* 0 */
727                 return scnprintf(bf, size, "F");
728 #define P_MODE(n) \
729         if (mode & n##_OK) { \
730                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
731                 mode &= ~n##_OK; \
732         }
733
734         P_MODE(R);
735         P_MODE(W);
736         P_MODE(X);
737 #undef P_MODE
738
739         if (mode)
740                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
741
742         return printed;
743 }
744
745 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
746
/*
 * Filename formatter.  Only declared here; the definition is not in this
 * part of the file.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
751
752 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
753                                                struct syscall_arg *arg)
754 {
755         int printed = 0, flags = arg->val;
756
757         if (!(flags & O_CREAT))
758                 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
759
760         if (flags == 0)
761                 return scnprintf(bf, size, "RDONLY");
762 #define P_FLAG(n) \
763         if (flags & O_##n) { \
764                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
765                 flags &= ~O_##n; \
766         }
767
768         P_FLAG(APPEND);
769         P_FLAG(ASYNC);
770         P_FLAG(CLOEXEC);
771         P_FLAG(CREAT);
772         P_FLAG(DIRECT);
773         P_FLAG(DIRECTORY);
774         P_FLAG(EXCL);
775         P_FLAG(LARGEFILE);
776         P_FLAG(NOATIME);
777         P_FLAG(NOCTTY);
778 #ifdef O_NONBLOCK
779         P_FLAG(NONBLOCK);
780 #elif O_NDELAY
781         P_FLAG(NDELAY);
782 #endif
783 #ifdef O_PATH
784         P_FLAG(PATH);
785 #endif
786         P_FLAG(RDWR);
787 #ifdef O_DSYNC
788         if ((flags & O_SYNC) == O_SYNC)
789                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
790         else {
791                 P_FLAG(DSYNC);
792         }
793 #else
794         P_FLAG(SYNC);
795 #endif
796         P_FLAG(TRUNC);
797         P_FLAG(WRONLY);
798 #undef P_FLAG
799
800         if (flags)
801                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
802
803         return printed;
804 }
805
806 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
807
808 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
809                                                 struct syscall_arg *arg)
810 {
811         int printed = 0, flags = arg->val;
812
813         if (flags == 0)
814                 return 0;
815
816 #define P_FLAG(n) \
817         if (flags & PERF_FLAG_##n) { \
818                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
819                 flags &= ~PERF_FLAG_##n; \
820         }
821
822         P_FLAG(FD_NO_GROUP);
823         P_FLAG(FD_OUTPUT);
824         P_FLAG(PID_CGROUP);
825         P_FLAG(FD_CLOEXEC);
826 #undef P_FLAG
827
828         if (flags)
829                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
830
831         return printed;
832 }
833
834 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
835
836 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
837                                                    struct syscall_arg *arg)
838 {
839         int printed = 0, flags = arg->val;
840
841         if (flags == 0)
842                 return scnprintf(bf, size, "NONE");
843 #define P_FLAG(n) \
844         if (flags & EFD_##n) { \
845                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
846                 flags &= ~EFD_##n; \
847         }
848
849         P_FLAG(SEMAPHORE);
850         P_FLAG(CLOEXEC);
851         P_FLAG(NONBLOCK);
852 #undef P_FLAG
853
854         if (flags)
855                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
856
857         return printed;
858 }
859
860 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
861
862 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
863                                                 struct syscall_arg *arg)
864 {
865         int printed = 0, flags = arg->val;
866
867 #define P_FLAG(n) \
868         if (flags & O_##n) { \
869                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
870                 flags &= ~O_##n; \
871         }
872
873         P_FLAG(CLOEXEC);
874         P_FLAG(NONBLOCK);
875 #undef P_FLAG
876
877         if (flags)
878                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
879
880         return printed;
881 }
882
883 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
884
885 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
886 {
887         int sig = arg->val;
888
889         switch (sig) {
890 #define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
891         P_SIGNUM(HUP);
892         P_SIGNUM(INT);
893         P_SIGNUM(QUIT);
894         P_SIGNUM(ILL);
895         P_SIGNUM(TRAP);
896         P_SIGNUM(ABRT);
897         P_SIGNUM(BUS);
898         P_SIGNUM(FPE);
899         P_SIGNUM(KILL);
900         P_SIGNUM(USR1);
901         P_SIGNUM(SEGV);
902         P_SIGNUM(USR2);
903         P_SIGNUM(PIPE);
904         P_SIGNUM(ALRM);
905         P_SIGNUM(TERM);
906         P_SIGNUM(CHLD);
907         P_SIGNUM(CONT);
908         P_SIGNUM(STOP);
909         P_SIGNUM(TSTP);
910         P_SIGNUM(TTIN);
911         P_SIGNUM(TTOU);
912         P_SIGNUM(URG);
913         P_SIGNUM(XCPU);
914         P_SIGNUM(XFSZ);
915         P_SIGNUM(VTALRM);
916         P_SIGNUM(PROF);
917         P_SIGNUM(WINCH);
918         P_SIGNUM(IO);
919         P_SIGNUM(PWR);
920         P_SIGNUM(SYS);
921 #ifdef SIGEMT
922         P_SIGNUM(EMT);
923 #endif
924 #ifdef SIGSTKFLT
925         P_SIGNUM(STKFLT);
926 #endif
927 #ifdef SIGSWI
928         P_SIGNUM(SWI);
929 #endif
930         default: break;
931         }
932
933         return scnprintf(bf, size, "%#x", sig);
934 }
935
936 #define SCA_SIGNUM syscall_arg__scnprintf_signum
937
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS          0x5401

/*
 * Names of the terminal ioctl requests, indexed by (request - 0x5401).
 * The table is sparse: the [0x27], [0x50] and [0x60] designators skip
 * ahead, and the slots they jump over are left NULL.
 */
static const char *tioctls[] = {
        "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
        "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
        "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
        "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
        "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
        "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
        "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
        "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
        "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
        "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
        "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
        [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
        "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
        "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
        "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

/* 0x5401 is the value TCGETS is defined to above. */
static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
964
/*
 * Initializer fragment for a syscall_fmts entry: formats argument number
 * @arg with SCA_STRARRAY, using strarray__<array> as its parameter.
 * @name is not used by the expansion; callers pass the argument's name there.
 */
#define STRARRAY(arg, name, array) \
          .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
          .arg_parm      = { [arg] = &strarray__##array, }
968
/*
 * Per-syscall formatting hints.
 *
 * name:          syscall name, the key used by syscall_fmt__find().
 * alias:         alternative name tried for the "sys_enter_<alias>" tracepoint
 *                when "sys_enter_<name>" cannot be found.
 * arg_scnprintf: optional formatter for each of the (up to 6) arguments.
 * arg_parm:      optional per-argument parameter handed to the formatter.
 * errmsg, timeout, hexret:
 *                NOTE(review): presumably control how the return value is
 *                printed; their users are outside this part of the file.
 *
 * syscall_fmts[] is searched with bsearch() using strcmp() on ->name, so the
 * entries MUST be kept sorted by name.
 */
static struct syscall_fmt {
        const char *name;
        const char *alias;
        size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
        void       *arg_parm[6];
        bool       errmsg;
        bool       timeout;
        bool       hexret;
} syscall_fmts[] = {
        { .name     = "access",     .errmsg = true,
          .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
        { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
        { .name     = "brk",        .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
        { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
        { .name     = "close",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
        { .name     = "connect",    .errmsg = true, },
        { .name     = "dup",        .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "dup2",       .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "dup3",       .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
        { .name     = "eventfd2",   .errmsg = true,
          .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
        { .name     = "faccessat",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "fadvise64",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fallocate",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fchdir",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fchmod",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fchmodat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "fchown",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fchownat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "fcntl",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */
                             [1] = SCA_STRARRAY, /* cmd */ },
          .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
        { .name     = "fdatasync",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "flock",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */
                             [1] = SCA_FLOCK, /* cmd */ }, },
        { .name     = "fsetxattr",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "fstatfs",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "fsync",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "ftruncate", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "futex",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
        { .name     = "futimesat", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "getdents",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "getdents64", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
        { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
        { .name     = "ioctl",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
                             [1] = SCA_STRHEXARRAY, /* cmd */
                             [2] = SCA_HEX, /* arg */ },
          .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
                             [2] = SCA_HEX, /* arg */ }, },
#endif
        { .name     = "kill",       .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "linkat",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "lseek",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */
                             [2] = SCA_STRARRAY, /* whence */ },
          .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
        { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
        { .name     = "madvise",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX,      /* start */
                             [2] = SCA_MADV_BHV, /* behavior */ }, },
        { .name     = "mkdirat",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "mknodat",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "mlock",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "mlockall",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "mmap",       .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
                             [2] = SCA_MMAP_PROT, /* prot */
                             [3] = SCA_MMAP_FLAGS, /* flags */
                             [4] = SCA_FD,        /* fd */ }, },
        { .name     = "mprotect",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* start */
                             [2] = SCA_MMAP_PROT, /* prot */ }, },
        { .name     = "mremap",     .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */
                             [3] = SCA_MREMAP_FLAGS, /* flags */
                             [4] = SCA_HEX, /* new_addr */ }, },
        { .name     = "munlock",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "munmap",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "name_to_handle_at", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "newfstatat", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "open",       .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
                             [1] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "open_by_handle_at", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "openat",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "perf_event_open", .errmsg = true,
          .arg_scnprintf = { [1] = SCA_INT, /* pid */
                             [2] = SCA_INT, /* cpu */
                             [3] = SCA_FD,  /* group_fd */
                             [4] = SCA_PERF_FLAGS,  /* flags */ }, },
        { .name     = "pipe2",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
        { .name     = "poll",       .errmsg = true, .timeout = true, },
        { .name     = "ppoll",      .errmsg = true, .timeout = true, },
        { .name     = "pread",      .errmsg = true, .alias = "pread64",
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "preadv",     .errmsg = true, .alias = "pread",
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
        { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "pwritev",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "read",       .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "readlinkat", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "readv",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "recvfrom",   .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "recvmmsg",   .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "recvmsg",    .errmsg = true,
          .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "renameat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "rt_sigaction", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
        { .name     = "rt_sigqueueinfo", .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "rt_tgsigqueueinfo", .errmsg = true,
          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "select",     .errmsg = true, .timeout = true, },
        { .name     = "sendmmsg",    .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "sendmsg",    .errmsg = true,
          .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "sendto",     .errmsg = true,
          .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
        { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
        { .name     = "shutdown",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "socket",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
                             [1] = SCA_SK_TYPE, /* type */ },
          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
        { .name     = "socketpair", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
                             [1] = SCA_SK_TYPE, /* type */ },
          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
        { .name     = "stat",       .errmsg = true, .alias = "newstat", },
        { .name     = "symlinkat",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "tgkill",     .errmsg = true,
          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "tkill",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "uname",      .errmsg = true, .alias = "newuname", },
        { .name     = "unlinkat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "utimensat",  .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
        { .name     = "write",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
        { .name     = "writev",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
};
1179
1180 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1181 {
1182         const struct syscall_fmt *fmt = fmtp;
1183         return strcmp(name, fmt->name);
1184 }
1185
1186 static struct syscall_fmt *syscall_fmt__find(const char *name)
1187 {
1188         const int nmemb = ARRAY_SIZE(syscall_fmts);
1189         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1190 }
1191
/*
 * Per-syscall information, one slot per syscall id in trace->syscalls.table.
 * Filled in by trace__read_syscall_info().
 */
struct syscall {
        struct event_format *tp_format;     /* format of the "sys_enter_<name>" (or alias) tracepoint */
        int                 nr_args;        /* number of fields in ->args, not counting a leading "nr" field */
        struct format_field *args;          /* first argument field of ->tp_format */
        const char          *name;          /* name returned by audit_syscall_to_name() */
        bool                is_exit;        /* true for "exit" and "exit_group" */
        struct syscall_fmt  *fmt;           /* entry from syscall_fmts[], NULL if there is none */
        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); /* per-argument formatters, allocated by syscall__set_arg_fmts() */
        void                **arg_parm;     /* points at ->fmt->arg_parm when ->fmt is set */
};
1202
1203 static size_t fprintf_duration(unsigned long t, FILE *fp)
1204 {
1205         double duration = (double)t / NSEC_PER_MSEC;
1206         size_t printed = fprintf(fp, "(");
1207
1208         if (duration >= 1.0)
1209                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1210         else if (duration >= 0.01)
1211                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1212         else
1213                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1214         return printed + fprintf(fp, "): ");
1215 }
1216
/**
 * Per-thread tracing state, stored as the thread's private data
 * (see thread__trace()).
 *
 * nr_events: Incremented on every thread__trace() call for this thread.
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * paths.max: Highest fd that has a slot in paths.table, -1 while the table
 *            is empty.
 * paths.table: Pathname for each fd, indexed by fd; NULL when not known.
 */
struct thread_trace {
        u64               entry_time;
        u64               exit_time;
        bool              entry_pending;
        unsigned long     nr_events;
        unsigned long     pfmaj, pfmin;
        char              *entry_str;
        double            runtime_ms;
        struct {
                unsigned long ptr;
                int           entry_str_pos;
        } filename;
        struct {
                int       max;
                char      **table;
        } paths;

        struct intlist *syscall_stats;
};
1241
1242 static struct thread_trace *thread_trace__new(void)
1243 {
1244         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1245
1246         if (ttrace)
1247                 ttrace->paths.max = -1;
1248
1249         ttrace->syscall_stats = intlist__new(NULL);
1250
1251         return ttrace;
1252 }
1253
1254 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1255 {
1256         struct thread_trace *ttrace;
1257
1258         if (thread == NULL)
1259                 goto fail;
1260
1261         if (thread__priv(thread) == NULL)
1262                 thread__set_priv(thread, thread_trace__new());
1263
1264         if (thread__priv(thread) == NULL)
1265                 goto fail;
1266
1267         ttrace = thread__priv(thread);
1268         ++ttrace->nr_events;
1269
1270         return ttrace;
1271 fail:
1272         color_fprintf(fp, PERF_COLOR_RED,
1273                       "WARNING: not enough memory, dropping samples!\n");
1274         return NULL;
1275 }
1276
/*
 * Page fault selection bits.
 * NOTE(review): presumably OR'ed into struct trace's trace_pgfaults field;
 * confirm against the option parsing code.
 */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/*
 * NOTE(review): looks like the size of the thread_trace->entry_str buffer;
 * confirm at its allocation site.
 */
static const size_t trace__entry_str_size = 2048;
1281
/*
 * State of one 'perf trace' session.  Only members whose use is visible in
 * the surrounding helpers are annotated.
 */
struct trace {
        struct perf_tool        tool;
        struct {
                int             machine;        /* passed to audit_syscall_to_name()/audit_name_to_syscall() */
                int             open_id;
        }                       audit;
        struct {
                int             max;            /* highest syscall id with a slot in ->table, -1 while empty */
                struct syscall  *table;         /* indexed by syscall id, grown by trace__read_syscall_info() */
                struct {
                        struct perf_evsel *sys_enter,
                                          *sys_exit;
                }               events;
        } syscalls;
        struct record_opts      opts;
        struct perf_evlist      *evlist;
        struct machine          *host;          /* created by trace__symbols_init() */
        struct thread           *current;
        u64                     base_time;      /* subtracted from timestamps before they are printed */
        FILE                    *output;        /* where trace output and error messages go */
        unsigned long           nr_events;
        struct strlist          *ev_qualifier;  /* syscall names, see trace__validate_ev_qualifier() */
        struct {
                size_t          nr;
                int             *entries;       /* syscall ids resolved from ->ev_qualifier */
        }                       ev_qualifier_ids;
        const char              *last_vfs_getname;
        struct intlist          *tid_list;
        struct intlist          *pid_list;
        struct {
                size_t          nr;
                pid_t           *entries;
        }                       filter_pids;
        double                  duration_filter; /* in ms, see trace__filter_duration() */
        double                  runtime_ms;
        struct {
                u64             vfs_getname,
                                proc_getname;   /* bumped each time thread__fd_path() goes to /proc */
        } stats;
        bool                    not_ev_qualifier;
        bool                    live;           /* when false, fd paths are never read from /proc */
        bool                    full_time;
        bool                    sched;
        bool                    multiple_threads; /* prefix each line with the tid */
        bool                    summary;
        bool                    summary_only;
        bool                    show_comm;      /* also print the comm in front of the tid */
        bool                    show_tool_stats;
        bool                    trace_syscalls;
        bool                    force;
        bool                    vfs_getname;    /* filename args are filled in later instead of printed as hex */
        int                     trace_pgfaults;
};
1335
1336 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1337 {
1338         struct thread_trace *ttrace = thread__priv(thread);
1339
1340         if (fd > ttrace->paths.max) {
1341                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1342
1343                 if (npath == NULL)
1344                         return -1;
1345
1346                 if (ttrace->paths.max != -1) {
1347                         memset(npath + ttrace->paths.max + 1, 0,
1348                                (fd - ttrace->paths.max) * sizeof(char *));
1349                 } else {
1350                         memset(npath, 0, (fd + 1) * sizeof(char *));
1351                 }
1352
1353                 ttrace->paths.table = npath;
1354                 ttrace->paths.max   = fd;
1355         }
1356
1357         ttrace->paths.table[fd] = strdup(pathname);
1358
1359         return ttrace->paths.table[fd] != NULL ? 0 : -1;
1360 }
1361
1362 static int thread__read_fd_path(struct thread *thread, int fd)
1363 {
1364         char linkname[PATH_MAX], pathname[PATH_MAX];
1365         struct stat st;
1366         int ret;
1367
1368         if (thread->pid_ == thread->tid) {
1369                 scnprintf(linkname, sizeof(linkname),
1370                           "/proc/%d/fd/%d", thread->pid_, fd);
1371         } else {
1372                 scnprintf(linkname, sizeof(linkname),
1373                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1374         }
1375
1376         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1377                 return -1;
1378
1379         ret = readlink(linkname, pathname, sizeof(pathname));
1380
1381         if (ret < 0 || ret > st.st_size)
1382                 return -1;
1383
1384         pathname[ret] = '\0';
1385         return trace__set_fd_pathname(thread, fd, pathname);
1386 }
1387
1388 static const char *thread__fd_path(struct thread *thread, int fd,
1389                                    struct trace *trace)
1390 {
1391         struct thread_trace *ttrace = thread__priv(thread);
1392
1393         if (ttrace == NULL)
1394                 return NULL;
1395
1396         if (fd < 0)
1397                 return NULL;
1398
1399         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1400                 if (!trace->live)
1401                         return NULL;
1402                 ++trace->stats.proc_getname;
1403                 if (thread__read_fd_path(thread, fd))
1404                         return NULL;
1405         }
1406
1407         return ttrace->paths.table[fd];
1408 }
1409
1410 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1411                                         struct syscall_arg *arg)
1412 {
1413         int fd = arg->val;
1414         size_t printed = scnprintf(bf, size, "%d", fd);
1415         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1416
1417         if (path)
1418                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1419
1420         return printed;
1421 }
1422
1423 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1424                                               struct syscall_arg *arg)
1425 {
1426         int fd = arg->val;
1427         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1428         struct thread_trace *ttrace = thread__priv(arg->thread);
1429
1430         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1431                 zfree(&ttrace->paths.table[fd]);
1432
1433         return printed;
1434 }
1435
1436 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1437                                      unsigned long ptr)
1438 {
1439         struct thread_trace *ttrace = thread__priv(thread);
1440
1441         ttrace->filename.ptr = ptr;
1442         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1443 }
1444
1445 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1446                                               struct syscall_arg *arg)
1447 {
1448         unsigned long ptr = arg->val;
1449
1450         if (!arg->trace->vfs_getname)
1451                 return scnprintf(bf, size, "%#x", ptr);
1452
1453         thread__set_filename_pos(arg->thread, bf, ptr);
1454         return 0;
1455 }
1456
1457 static bool trace__filter_duration(struct trace *trace, double t)
1458 {
1459         return t < (trace->duration_filter * NSEC_PER_MSEC);
1460 }
1461
1462 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1463 {
1464         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1465
1466         return fprintf(fp, "%10.3f ", ts);
1467 }
1468
/*
 * Flags written from sig_handler() and read by the rest of the tool.
 *
 * They are volatile sig_atomic_t because that is the only object type a
 * signal handler may portably store to; with plain bool the compiler is also
 * free to cache the value in a loop that polls it.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination, and remember whether the request came
 * from SIGINT.
 */
static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;
}
1477
1478 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1479                                         u64 duration, u64 tstamp, FILE *fp)
1480 {
1481         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1482         printed += fprintf_duration(duration, fp);
1483
1484         if (trace->multiple_threads) {
1485                 if (trace->show_comm)
1486                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1487                 printed += fprintf(fp, "%d ", thread->tid);
1488         }
1489
1490         return printed;
1491 }
1492
1493 static int trace__process_event(struct trace *trace, struct machine *machine,
1494                                 union perf_event *event, struct perf_sample *sample)
1495 {
1496         int ret = 0;
1497
1498         switch (event->header.type) {
1499         case PERF_RECORD_LOST:
1500                 color_fprintf(trace->output, PERF_COLOR_RED,
1501                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1502                 ret = machine__process_lost_event(machine, event, sample);
1503         default:
1504                 ret = machine__process_event(machine, event, sample);
1505                 break;
1506         }
1507
1508         return ret;
1509 }
1510
1511 static int trace__tool_process(struct perf_tool *tool,
1512                                union perf_event *event,
1513                                struct perf_sample *sample,
1514                                struct machine *machine)
1515 {
1516         struct trace *trace = container_of(tool, struct trace, tool);
1517         return trace__process_event(trace, machine, event, sample);
1518 }
1519
1520 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1521 {
1522         int err = symbol__init(NULL);
1523
1524         if (err)
1525                 return err;
1526
1527         trace->host = machine__new_host();
1528         if (trace->host == NULL)
1529                 return -ENOMEM;
1530
1531         if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1532                 return -errno;
1533
1534         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1535                                             evlist->threads, trace__tool_process, false,
1536                                             trace->opts.proc_map_timeout);
1537         if (err)
1538                 symbol__exit();
1539
1540         return err;
1541 }
1542
1543 static int syscall__set_arg_fmts(struct syscall *sc)
1544 {
1545         struct format_field *field;
1546         int idx = 0;
1547
1548         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1549         if (sc->arg_scnprintf == NULL)
1550                 return -1;
1551
1552         if (sc->fmt)
1553                 sc->arg_parm = sc->fmt->arg_parm;
1554
1555         for (field = sc->args; field; field = field->next) {
1556                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1557                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1558                 else if (field->flags & FIELD_IS_POINTER)
1559                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1560                 ++idx;
1561         }
1562
1563         return 0;
1564 }
1565
1566 static int trace__read_syscall_info(struct trace *trace, int id)
1567 {
1568         char tp_name[128];
1569         struct syscall *sc;
1570         const char *name = audit_syscall_to_name(id, trace->audit.machine);
1571
1572         if (name == NULL)
1573                 return -1;
1574
1575         if (id > trace->syscalls.max) {
1576                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1577
1578                 if (nsyscalls == NULL)
1579                         return -1;
1580
1581                 if (trace->syscalls.max != -1) {
1582                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1583                                (id - trace->syscalls.max) * sizeof(*sc));
1584                 } else {
1585                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1586                 }
1587
1588                 trace->syscalls.table = nsyscalls;
1589                 trace->syscalls.max   = id;
1590         }
1591
1592         sc = trace->syscalls.table + id;
1593         sc->name = name;
1594
1595         sc->fmt  = syscall_fmt__find(sc->name);
1596
1597         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1598         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1599
1600         if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1601                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1602                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1603         }
1604
1605         if (sc->tp_format == NULL)
1606                 return -1;
1607
1608         sc->args = sc->tp_format->format.fields;
1609         sc->nr_args = sc->tp_format->format.nr_fields;
1610         /* drop nr field - not relevant here; does not exist on older kernels */
1611         if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1612                 sc->args = sc->args->next;
1613                 --sc->nr_args;
1614         }
1615
1616         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1617
1618         return syscall__set_arg_fmts(sc);
1619 }
1620
1621 static int trace__validate_ev_qualifier(struct trace *trace)
1622 {
1623         int err = 0, i;
1624         struct str_node *pos;
1625
1626         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1627         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1628                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1629
1630         if (trace->ev_qualifier_ids.entries == NULL) {
1631                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1632                        trace->output);
1633                 err = -EINVAL;
1634                 goto out;
1635         }
1636
1637         i = 0;
1638
1639         strlist__for_each(pos, trace->ev_qualifier) {
1640                 const char *sc = pos->s;
1641                 int id = audit_name_to_syscall(sc, trace->audit.machine);
1642
1643                 if (id < 0) {
1644                         if (err == 0) {
1645                                 fputs("Error:\tInvalid syscall ", trace->output);
1646                                 err = -EINVAL;
1647                         } else {
1648                                 fputs(", ", trace->output);
1649                         }
1650
1651                         fputs(sc, trace->output);
1652                 }
1653
1654                 trace->ev_qualifier_ids.entries[i++] = id;
1655         }
1656
1657         if (err < 0) {
1658                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1659                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1660                 zfree(&trace->ev_qualifier_ids.entries);
1661                 trace->ev_qualifier_ids.nr = 0;
1662         }
1663 out:
1664         return err;
1665 }
1666
1667 /*
1668  * args is to be interpreted as a series of longs but we need to handle
1669  * 8-byte unaligned accesses. args points to raw_data within the event
1670  * and raw_data is guaranteed to be 8-byte unaligned because it is
1671  * preceded by raw_size which is a u32. So we need to copy args to a temp
1672  * variable to read it. Most notably this avoids extended load instructions
1673  * on unaligned addresses
1674  */
1675
1676 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1677                                       unsigned char *args, struct trace *trace,
1678                                       struct thread *thread)
1679 {
1680         size_t printed = 0;
1681         unsigned char *p;
1682         unsigned long val;
1683
1684         if (sc->args != NULL) {
1685                 struct format_field *field;
1686                 u8 bit = 1;
1687                 struct syscall_arg arg = {
1688                         .idx    = 0,
1689                         .mask   = 0,
1690                         .trace  = trace,
1691                         .thread = thread,
1692                 };
1693
1694                 for (field = sc->args; field;
1695                      field = field->next, ++arg.idx, bit <<= 1) {
1696                         if (arg.mask & bit)
1697                                 continue;
1698
1699                         /* special care for unaligned accesses */
1700                         p = args + sizeof(unsigned long) * arg.idx;
1701                         memcpy(&val, p, sizeof(val));
1702
1703                         /*
1704                          * Suppress this argument if its value is zero and
1705                          * and we don't have a string associated in an
1706                          * strarray for it.
1707                          */
1708                         if (val == 0 &&
1709                             !(sc->arg_scnprintf &&
1710                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1711                               sc->arg_parm[arg.idx]))
1712                                 continue;
1713
1714                         printed += scnprintf(bf + printed, size - printed,
1715                                              "%s%s: ", printed ? ", " : "", field->name);
1716                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1717                                 arg.val = val;
1718                                 if (sc->arg_parm)
1719                                         arg.parm = sc->arg_parm[arg.idx];
1720                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1721                                                                       size - printed, &arg);
1722                         } else {
1723                                 printed += scnprintf(bf + printed, size - printed,
1724                                                      "%ld", val);
1725                         }
1726                 }
1727         } else {
1728                 int i = 0;
1729
1730                 while (i < 6) {
1731                         /* special care for unaligned accesses */
1732                         p = args + sizeof(unsigned long) * i;
1733                         memcpy(&val, p, sizeof(val));
1734                         printed += scnprintf(bf + printed, size - printed,
1735                                              "%sarg%d: %ld",
1736                                              printed ? ", " : "", i, val);
1737                         ++i;
1738                 }
1739         }
1740
1741         return printed;
1742 }
1743
/*
 * Signature shared by the per-event handlers in this file (sys_enter,
 * sys_exit, vfs_getname, page faults, ...). A handler is stored in
 * evsel->handler and invoked with the trace state, the evsel the sample
 * belongs to, the raw event and the parsed sample.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
                                  union perf_event *event,
                                  struct perf_sample *sample);
1747
1748 static struct syscall *trace__syscall_info(struct trace *trace,
1749                                            struct perf_evsel *evsel, int id)
1750 {
1751
1752         if (id < 0) {
1753
1754                 /*
1755                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1756                  * before that, leaving at a higher verbosity level till that is
1757                  * explained. Reproduced with plain ftrace with:
1758                  *
1759                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1760                  * grep "NR -1 " /t/trace_pipe
1761                  *
1762                  * After generating some load on the machine.
1763                  */
1764                 if (verbose > 1) {
1765                         static u64 n;
1766                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1767                                 id, perf_evsel__name(evsel), ++n);
1768                 }
1769                 return NULL;
1770         }
1771
1772         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1773             trace__read_syscall_info(trace, id))
1774                 goto out_cant_read;
1775
1776         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1777                 goto out_cant_read;
1778
1779         return &trace->syscalls.table[id];
1780
1781 out_cant_read:
1782         if (verbose) {
1783                 fprintf(trace->output, "Problems reading syscall %d", id);
1784                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1785                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1786                 fputs(" information\n", trace->output);
1787         }
1788         return NULL;
1789 }
1790
1791 static void thread__update_stats(struct thread_trace *ttrace,
1792                                  int id, struct perf_sample *sample)
1793 {
1794         struct int_node *inode;
1795         struct stats *stats;
1796         u64 duration = 0;
1797
1798         inode = intlist__findnew(ttrace->syscall_stats, id);
1799         if (inode == NULL)
1800                 return;
1801
1802         stats = inode->priv;
1803         if (stats == NULL) {
1804                 stats = malloc(sizeof(struct stats));
1805                 if (stats == NULL)
1806                         return;
1807                 init_stats(stats);
1808                 inode->priv = stats;
1809         }
1810
1811         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1812                 duration = sample->time - ttrace->entry_time;
1813
1814         update_stats(stats, duration);
1815 }
1816
1817 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1818 {
1819         struct thread_trace *ttrace;
1820         u64 duration;
1821         size_t printed;
1822
1823         if (trace->current == NULL)
1824                 return 0;
1825
1826         ttrace = thread__priv(trace->current);
1827
1828         if (!ttrace->entry_pending)
1829                 return 0;
1830
1831         duration = sample->time - ttrace->entry_time;
1832
1833         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1834         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1835         ttrace->entry_pending = false;
1836
1837         return printed;
1838 }
1839
1840 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1841                             union perf_event *event __maybe_unused,
1842                             struct perf_sample *sample)
1843 {
1844         char *msg;
1845         void *args;
1846         size_t printed = 0;
1847         struct thread *thread;
1848         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1849         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1850         struct thread_trace *ttrace;
1851
1852         if (sc == NULL)
1853                 return -1;
1854
1855         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1856         ttrace = thread__trace(thread, trace->output);
1857         if (ttrace == NULL)
1858                 goto out_put;
1859
1860         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1861
1862         if (ttrace->entry_str == NULL) {
1863                 ttrace->entry_str = malloc(trace__entry_str_size);
1864                 if (!ttrace->entry_str)
1865                         goto out_put;
1866         }
1867
1868         if (!trace->summary_only)
1869                 trace__printf_interrupted_entry(trace, sample);
1870
1871         ttrace->entry_time = sample->time;
1872         msg = ttrace->entry_str;
1873         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1874
1875         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1876                                            args, trace, thread);
1877
1878         if (sc->is_exit) {
1879                 if (!trace->duration_filter && !trace->summary_only) {
1880                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1881                         fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1882                 }
1883         } else
1884                 ttrace->entry_pending = true;
1885
1886         if (trace->current != thread) {
1887                 thread__put(trace->current);
1888                 trace->current = thread__get(thread);
1889         }
1890         err = 0;
1891 out_put:
1892         thread__put(thread);
1893         return err;
1894 }
1895
1896 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1897                            union perf_event *event __maybe_unused,
1898                            struct perf_sample *sample)
1899 {
1900         long ret;
1901         u64 duration = 0;
1902         struct thread *thread;
1903         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1904         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1905         struct thread_trace *ttrace;
1906
1907         if (sc == NULL)
1908                 return -1;
1909
1910         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1911         ttrace = thread__trace(thread, trace->output);
1912         if (ttrace == NULL)
1913                 goto out_put;
1914
1915         if (trace->summary)
1916                 thread__update_stats(ttrace, id, sample);
1917
1918         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1919
1920         if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1921                 trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1922                 trace->last_vfs_getname = NULL;
1923                 ++trace->stats.vfs_getname;
1924         }
1925
1926         ttrace->exit_time = sample->time;
1927
1928         if (ttrace->entry_time) {
1929                 duration = sample->time - ttrace->entry_time;
1930                 if (trace__filter_duration(trace, duration))
1931                         goto out;
1932         } else if (trace->duration_filter)
1933                 goto out;
1934
1935         if (trace->summary_only)
1936                 goto out;
1937
1938         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1939
1940         if (ttrace->entry_pending) {
1941                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1942         } else {
1943                 fprintf(trace->output, " ... [");
1944                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1945                 fprintf(trace->output, "]: %s()", sc->name);
1946         }
1947
1948         if (sc->fmt == NULL) {
1949 signed_print:
1950                 fprintf(trace->output, ") = %ld", ret);
1951         } else if (ret < 0 && sc->fmt->errmsg) {
1952                 char bf[STRERR_BUFSIZE];
1953                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1954                            *e = audit_errno_to_name(-ret);
1955
1956                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1957         } else if (ret == 0 && sc->fmt->timeout)
1958                 fprintf(trace->output, ") = 0 Timeout");
1959         else if (sc->fmt->hexret)
1960                 fprintf(trace->output, ") = %#lx", ret);
1961         else
1962                 goto signed_print;
1963
1964         fputc('\n', trace->output);
1965 out:
1966         ttrace->entry_pending = false;
1967         err = 0;
1968 out_put:
1969         thread__put(thread);
1970         return err;
1971 }
1972
1973 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1974                               union perf_event *event __maybe_unused,
1975                               struct perf_sample *sample)
1976 {
1977         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1978         struct thread_trace *ttrace;
1979         size_t filename_len, entry_str_len, to_move;
1980         ssize_t remaining_space;
1981         char *pos;
1982         const char *filename;
1983
1984         trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1985
1986         if (!thread)
1987                 goto out;
1988
1989         ttrace = thread__priv(thread);
1990         if (!ttrace)
1991                 goto out;
1992
1993         if (!ttrace->filename.ptr)
1994                 goto out;
1995
1996         entry_str_len = strlen(ttrace->entry_str);
1997         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1998         if (remaining_space <= 0)
1999                 goto out;
2000
2001         filename = trace->last_vfs_getname;
2002         filename_len = strlen(filename);
2003         if (filename_len > (size_t)remaining_space) {
2004                 filename += filename_len - remaining_space;
2005                 filename_len = remaining_space;
2006         }
2007
2008         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2009         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2010         memmove(pos + filename_len, pos, to_move);
2011         memcpy(pos, filename, filename_len);
2012
2013         ttrace->filename.ptr = 0;
2014         ttrace->filename.entry_str_pos = 0;
2015 out:
2016         return 0;
2017 }
2018
2019 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2020                                      union perf_event *event __maybe_unused,
2021                                      struct perf_sample *sample)
2022 {
2023         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2024         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2025         struct thread *thread = machine__findnew_thread(trace->host,
2026                                                         sample->pid,
2027                                                         sample->tid);
2028         struct thread_trace *ttrace = thread__trace(thread, trace->output);
2029
2030         if (ttrace == NULL)
2031                 goto out_dump;
2032
2033         ttrace->runtime_ms += runtime_ms;
2034         trace->runtime_ms += runtime_ms;
2035         thread__put(thread);
2036         return 0;
2037
2038 out_dump:
2039         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2040                evsel->name,
2041                perf_evsel__strval(evsel, sample, "comm"),
2042                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2043                runtime,
2044                perf_evsel__intval(evsel, sample, "vruntime"));
2045         thread__put(thread);
2046         return 0;
2047 }
2048
2049 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2050                                 union perf_event *event __maybe_unused,
2051                                 struct perf_sample *sample)
2052 {
2053         trace__printf_interrupted_entry(trace, sample);
2054         trace__fprintf_tstamp(trace, sample->time, trace->output);
2055
2056         if (trace->trace_syscalls)
2057                 fprintf(trace->output, "(         ): ");
2058
2059         fprintf(trace->output, "%s:", evsel->name);
2060
2061         if (evsel->tp_format) {
2062                 event_format__fprintf(evsel->tp_format, sample->cpu,
2063                                       sample->raw_data, sample->raw_size,
2064                                       trace->output);
2065         }
2066
2067         fprintf(trace->output, ")\n");
2068         return 0;
2069 }
2070
2071 static void print_location(FILE *f, struct perf_sample *sample,
2072                            struct addr_location *al,
2073                            bool print_dso, bool print_sym)
2074 {
2075
2076         if ((verbose || print_dso) && al->map)
2077                 fprintf(f, "%s@", al->map->dso->long_name);
2078
2079         if ((verbose || print_sym) && al->sym)
2080                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2081                         al->addr - al->sym->start);
2082         else if (al->map)
2083                 fprintf(f, "0x%" PRIx64, al->addr);
2084         else
2085                 fprintf(f, "0x%" PRIx64, sample->addr);
2086 }
2087
2088 static int trace__pgfault(struct trace *trace,
2089                           struct perf_evsel *evsel,
2090                           union perf_event *event,
2091                           struct perf_sample *sample)
2092 {
2093         struct thread *thread;
2094         u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
2095         struct addr_location al;
2096         char map_type = 'd';
2097         struct thread_trace *ttrace;
2098         int err = -1;
2099
2100         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2101         ttrace = thread__trace(thread, trace->output);
2102         if (ttrace == NULL)
2103                 goto out_put;
2104
2105         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2106                 ttrace->pfmaj++;
2107         else
2108                 ttrace->pfmin++;
2109
2110         if (trace->summary_only)
2111                 goto out;
2112
2113         thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
2114                               sample->ip, &al);
2115
2116         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2117
2118         fprintf(trace->output, "%sfault [",
2119                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2120                 "maj" : "min");
2121
2122         print_location(trace->output, sample, &al, false, true);
2123
2124         fprintf(trace->output, "] => ");
2125
2126         thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
2127                                    sample->addr, &al);
2128
2129         if (!al.map) {
2130                 thread__find_addr_location(thread, cpumode,
2131                                            MAP__FUNCTION, sample->addr, &al);
2132
2133                 if (al.map)
2134                         map_type = 'x';
2135                 else
2136                         map_type = '?';
2137         }
2138
2139         print_location(trace->output, sample, &al, true, false);
2140
2141         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2142 out:
2143         err = 0;
2144 out_put:
2145         thread__put(thread);
2146         return err;
2147 }
2148
2149 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2150 {
2151         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2152             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2153                 return false;
2154
2155         if (trace->pid_list || trace->tid_list)
2156                 return true;
2157
2158         return false;
2159 }
2160
2161 static int trace__process_sample(struct perf_tool *tool,
2162                                  union perf_event *event,
2163                                  struct perf_sample *sample,
2164                                  struct perf_evsel *evsel,
2165                                  struct machine *machine __maybe_unused)
2166 {
2167         struct trace *trace = container_of(tool, struct trace, tool);
2168         int err = 0;
2169
2170         tracepoint_handler handler = evsel->handler;
2171
2172         if (skip_sample(trace, sample))
2173                 return 0;
2174
2175         if (!trace->full_time && trace->base_time == 0)
2176                 trace->base_time = sample->time;
2177
2178         if (handler) {
2179                 ++trace->nr_events;
2180                 handler(trace, evsel, event, sample);
2181         }
2182
2183         return err;
2184 }
2185
2186 static int parse_target_str(struct trace *trace)
2187 {
2188         if (trace->opts.target.pid) {
2189                 trace->pid_list = intlist__new(trace->opts.target.pid);
2190                 if (trace->pid_list == NULL) {
2191                         pr_err("Error parsing process id string\n");
2192                         return -EINVAL;
2193                 }
2194         }
2195
2196         if (trace->opts.target.tid) {
2197                 trace->tid_list = intlist__new(trace->opts.target.tid);
2198                 if (trace->tid_list == NULL) {
2199                         pr_err("Error parsing thread id string\n");
2200                         return -EINVAL;
2201                 }
2202         }
2203
2204         return 0;
2205 }
2206
2207 static int trace__record(struct trace *trace, int argc, const char **argv)
2208 {
2209         unsigned int rec_argc, i, j;
2210         const char **rec_argv;
2211         const char * const record_args[] = {
2212                 "record",
2213                 "-R",
2214                 "-m", "1024",
2215                 "-c", "1",
2216         };
2217
2218         const char * const sc_args[] = { "-e", };
2219         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2220         const char * const majpf_args[] = { "-e", "major-faults" };
2221         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2222         const char * const minpf_args[] = { "-e", "minor-faults" };
2223         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2224
2225         /* +1 is for the event string below */
2226         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2227                 majpf_args_nr + minpf_args_nr + argc;
2228         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2229
2230         if (rec_argv == NULL)
2231                 return -ENOMEM;
2232
2233         j = 0;
2234         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2235                 rec_argv[j++] = record_args[i];
2236
2237         if (trace->trace_syscalls) {
2238                 for (i = 0; i < sc_args_nr; i++)
2239                         rec_argv[j++] = sc_args[i];
2240
2241                 /* event string may be different for older kernels - e.g., RHEL6 */
2242                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2243                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2244                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2245                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2246                 else {
2247                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2248                         return -1;
2249                 }
2250         }
2251
2252         if (trace->trace_pgfaults & TRACE_PFMAJ)
2253                 for (i = 0; i < majpf_args_nr; i++)
2254                         rec_argv[j++] = majpf_args[i];
2255
2256         if (trace->trace_pgfaults & TRACE_PFMIN)
2257                 for (i = 0; i < minpf_args_nr; i++)
2258                         rec_argv[j++] = minpf_args[i];
2259
2260         for (i = 0; i < (unsigned int)argc; i++)
2261                 rec_argv[j++] = argv[i];
2262
2263         return cmd_record(j, rec_argv, NULL);
2264 }
2265
2266 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2267
2268 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2269 {
2270         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2271         if (evsel == NULL)
2272                 return false;
2273
2274         if (perf_evsel__field(evsel, "pathname") == NULL) {
2275                 perf_evsel__delete(evsel);
2276                 return false;
2277         }
2278
2279         evsel->handler = trace__vfs_getname;
2280         perf_evlist__add(evlist, evsel);
2281         return true;
2282 }
2283
2284 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2285                                     u64 config)
2286 {
2287         struct perf_evsel *evsel;
2288         struct perf_event_attr attr = {
2289                 .type = PERF_TYPE_SOFTWARE,
2290                 .mmap_data = 1,
2291         };
2292
2293         attr.config = config;
2294         attr.sample_period = 1;
2295
2296         event_attr_init(&attr);
2297
2298         evsel = perf_evsel__new(&attr);
2299         if (!evsel)
2300                 return -ENOMEM;
2301
2302         evsel->handler = trace__pgfault;
2303         perf_evlist__add(evlist, evsel);
2304
2305         return 0;
2306 }
2307
2308 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2309 {
2310         const u32 type = event->header.type;
2311         struct perf_evsel *evsel;
2312
2313         if (!trace->full_time && trace->base_time == 0)
2314                 trace->base_time = sample->time;
2315
2316         if (type != PERF_RECORD_SAMPLE) {
2317                 trace__process_event(trace, trace->host, event, sample);
2318                 return;
2319         }
2320
2321         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2322         if (evsel == NULL) {
2323                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2324                 return;
2325         }
2326
2327         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2328             sample->raw_data == NULL) {
2329                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2330                        perf_evsel__name(evsel), sample->tid,
2331                        sample->cpu, sample->raw_size);
2332         } else {
2333                 tracepoint_handler handler = evsel->handler;
2334                 handler(trace, evsel, event, sample);
2335         }
2336 }
2337
2338 static int trace__add_syscall_newtp(struct trace *trace)
2339 {
2340         int ret = -1;
2341         struct perf_evlist *evlist = trace->evlist;
2342         struct perf_evsel *sys_enter, *sys_exit;
2343
2344         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2345         if (sys_enter == NULL)
2346                 goto out;
2347
2348         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2349                 goto out_delete_sys_enter;
2350
2351         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2352         if (sys_exit == NULL)
2353                 goto out_delete_sys_enter;
2354
2355         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2356                 goto out_delete_sys_exit;
2357
2358         perf_evlist__add(evlist, sys_enter);
2359         perf_evlist__add(evlist, sys_exit);
2360
2361         trace->syscalls.events.sys_enter = sys_enter;
2362         trace->syscalls.events.sys_exit  = sys_exit;
2363
2364         ret = 0;
2365 out:
2366         return ret;
2367
2368 out_delete_sys_exit:
2369         perf_evsel__delete_priv(sys_exit);
2370 out_delete_sys_enter:
2371         perf_evsel__delete_priv(sys_enter);
2372         goto out;
2373 }
2374
2375 static int trace__set_ev_qualifier_filter(struct trace *trace)
2376 {
2377         int err = -1;
2378         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2379                                                 trace->ev_qualifier_ids.nr,
2380                                                 trace->ev_qualifier_ids.entries);
2381
2382         if (filter == NULL)
2383                 goto out_enomem;
2384
2385         if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2386                 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2387
2388         free(filter);
2389 out:
2390         return err;
2391 out_enomem:
2392         errno = ENOMEM;
2393         goto out;
2394 }
2395
2396 static int trace__run(struct trace *trace, int argc, const char **argv)
2397 {
2398         struct perf_evlist *evlist = trace->evlist;
2399         struct perf_evsel *evsel;
2400         int err = -1, i;
2401         unsigned long before;
2402         const bool forks = argc > 0;
2403         bool draining = false;
2404
2405         trace->live = true;
2406
2407         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2408                 goto out_error_raw_syscalls;
2409
2410         if (trace->trace_syscalls)
2411                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2412
2413         if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2414             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2415                 goto out_error_mem;
2416         }
2417
2418         if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2419             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2420                 goto out_error_mem;
2421
2422         if (trace->sched &&
2423             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2424                                    trace__sched_stat_runtime))
2425                 goto out_error_sched_stat_runtime;
2426
2427         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2428         if (err < 0) {
2429                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2430                 goto out_delete_evlist;
2431         }
2432
2433         err = trace__symbols_init(trace, evlist);
2434         if (err < 0) {
2435                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2436                 goto out_delete_evlist;
2437         }
2438
2439         perf_evlist__config(evlist, &trace->opts);
2440
2441         signal(SIGCHLD, sig_handler);
2442         signal(SIGINT, sig_handler);
2443
2444         if (forks) {
2445                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2446                                                     argv, false, NULL);
2447                 if (err < 0) {
2448                         fprintf(trace->output, "Couldn't run the workload!\n");
2449                         goto out_delete_evlist;
2450                 }
2451         }
2452
2453         err = perf_evlist__open(evlist);
2454         if (err < 0)
2455                 goto out_error_open;
2456
2457         /*
2458          * Better not use !target__has_task() here because we need to cover the
2459          * case where no threads were specified in the command line, but a
2460          * workload was, and in that case we will fill in the thread_map when
2461          * we fork the workload in perf_evlist__prepare_workload.
2462          */
2463         if (trace->filter_pids.nr > 0)
2464                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2465         else if (thread_map__pid(evlist->threads, 0) == -1)
2466                 err = perf_evlist__set_filter_pid(evlist, getpid());
2467
2468         if (err < 0)
2469                 goto out_error_mem;
2470
2471         if (trace->ev_qualifier_ids.nr > 0) {
2472                 err = trace__set_ev_qualifier_filter(trace);
2473                 if (err < 0)
2474                         goto out_errno;
2475
2476                 pr_debug("event qualifier tracepoint filter: %s\n",
2477                          trace->syscalls.events.sys_exit->filter);
2478         }
2479
2480         err = perf_evlist__apply_filters(evlist, &evsel);
2481         if (err < 0)
2482                 goto out_error_apply_filters;
2483
2484         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2485         if (err < 0)
2486                 goto out_error_mmap;
2487
2488         if (!target__none(&trace->opts.target))
2489                 perf_evlist__enable(evlist);
2490
2491         if (forks)
2492                 perf_evlist__start_workload(evlist);
2493
2494         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2495                                   evlist->threads->nr > 1 ||
2496                                   perf_evlist__first(evlist)->attr.inherit;
2497 again:
2498         before = trace->nr_events;
2499
2500         for (i = 0; i < evlist->nr_mmaps; i++) {
2501                 union perf_event *event;
2502
2503                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2504                         struct perf_sample sample;
2505
2506                         ++trace->nr_events;
2507
2508                         err = perf_evlist__parse_sample(evlist, event, &sample);
2509                         if (err) {
2510                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2511                                 goto next_event;
2512                         }
2513
2514                         trace__handle_event(trace, event, &sample);
2515 next_event:
2516                         perf_evlist__mmap_consume(evlist, i);
2517
2518                         if (interrupted)
2519                                 goto out_disable;
2520
2521                         if (done && !draining) {
2522                                 perf_evlist__disable(evlist);
2523                                 draining = true;
2524                         }
2525                 }
2526         }
2527
2528         if (trace->nr_events == before) {
2529                 int timeout = done ? 100 : -1;
2530
2531                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2532                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2533                                 draining = true;
2534
2535                         goto again;
2536                 }
2537         } else {
2538                 goto again;
2539         }
2540
2541 out_disable:
2542         thread__zput(trace->current);
2543
2544         perf_evlist__disable(evlist);
2545
2546         if (!err) {
2547                 if (trace->summary)
2548                         trace__fprintf_thread_summary(trace, trace->output);
2549
2550                 if (trace->show_tool_stats) {
2551                         fprintf(trace->output, "Stats:\n "
2552                                                " vfs_getname : %" PRIu64 "\n"
2553                                                " proc_getname: %" PRIu64 "\n",
2554                                 trace->stats.vfs_getname,
2555                                 trace->stats.proc_getname);
2556                 }
2557         }
2558
2559 out_delete_evlist:
2560         perf_evlist__delete(evlist);
2561         trace->evlist = NULL;
2562         trace->live = false;
2563         return err;
2564 {
2565         char errbuf[BUFSIZ];
2566
2567 out_error_sched_stat_runtime:
2568         debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2569         goto out_error;
2570
2571 out_error_raw_syscalls:
2572         debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2573         goto out_error;
2574
2575 out_error_mmap:
2576         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2577         goto out_error;
2578
2579 out_error_open:
2580         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2581
2582 out_error:
2583         fprintf(trace->output, "%s\n", errbuf);
2584         goto out_delete_evlist;
2585
2586 out_error_apply_filters:
2587         fprintf(trace->output,
2588                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2589                 evsel->filter, perf_evsel__name(evsel), errno,
2590                 strerror_r(errno, errbuf, sizeof(errbuf)));
2591         goto out_delete_evlist;
2592 }
2593 out_error_mem:
2594         fprintf(trace->output, "Not enough memory to run!\n");
2595         goto out_delete_evlist;
2596
2597 out_errno:
2598         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2599         goto out_delete_evlist;
2600 }
2601
2602 static int trace__replay(struct trace *trace)
2603 {
2604         const struct perf_evsel_str_handler handlers[] = {
2605                 { "probe:vfs_getname",       trace__vfs_getname, },
2606         };
2607         struct perf_data_file file = {
2608                 .path  = input_name,
2609                 .mode  = PERF_DATA_MODE_READ,
2610                 .force = trace->force,
2611         };
2612         struct perf_session *session;
2613         struct perf_evsel *evsel;
2614         int err = -1;
2615
2616         trace->tool.sample        = trace__process_sample;
2617         trace->tool.mmap          = perf_event__process_mmap;
2618         trace->tool.mmap2         = perf_event__process_mmap2;
2619         trace->tool.comm          = perf_event__process_comm;
2620         trace->tool.exit          = perf_event__process_exit;
2621         trace->tool.fork          = perf_event__process_fork;
2622         trace->tool.attr          = perf_event__process_attr;
2623         trace->tool.tracing_data = perf_event__process_tracing_data;
2624         trace->tool.build_id      = perf_event__process_build_id;
2625
2626         trace->tool.ordered_events = true;
2627         trace->tool.ordering_requires_timestamps = true;
2628
2629         /* add tid to output */
2630         trace->multiple_threads = true;
2631
2632         session = perf_session__new(&file, false, &trace->tool);
2633         if (session == NULL)
2634                 return -1;
2635
2636         if (symbol__init(&session->header.env) < 0)
2637                 goto out;
2638
2639         trace->host = &session->machines.host;
2640
2641         err = perf_session__set_tracepoints_handlers(session, handlers);
2642         if (err)
2643                 goto out;
2644
2645         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2646                                                      "raw_syscalls:sys_enter");
2647         /* older kernels have syscalls tp versus raw_syscalls */
2648         if (evsel == NULL)
2649                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2650                                                              "syscalls:sys_enter");
2651
2652         if (evsel &&
2653             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2654             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2655                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2656                 goto out;
2657         }
2658
2659         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2660                                                      "raw_syscalls:sys_exit");
2661         if (evsel == NULL)
2662                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2663                                                              "syscalls:sys_exit");
2664         if (evsel &&
2665             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2666             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2667                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2668                 goto out;
2669         }
2670
2671         evlist__for_each(session->evlist, evsel) {
2672                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2673                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2674                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2675                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2676                         evsel->handler = trace__pgfault;
2677         }
2678
2679         err = parse_target_str(trace);
2680         if (err != 0)
2681                 goto out;
2682
2683         setup_pager();
2684
2685         err = perf_session__process_events(session);
2686         if (err)
2687                 pr_err("Failed to process events, error %d", err);
2688
2689         else if (trace->summary)
2690                 trace__fprintf_thread_summary(trace, trace->output);
2691
2692 out:
2693         perf_session__delete(session);
2694
2695         return err;
2696 }
2697
/*
 * Write the banner that introduces the per thread summary to @fp.
 *
 * Returns the value fprintf() reported, converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
        return fprintf(fp, "\n Summary of events:\n\n");
}
2706
2707 static size_t thread__dump_stats(struct thread_trace *ttrace,
2708                                  struct trace *trace, FILE *fp)
2709 {
2710         struct stats *stats;
2711         size_t printed = 0;
2712         struct syscall *sc;
2713         struct int_node *inode = intlist__first(ttrace->syscall_stats);
2714
2715         if (inode == NULL)
2716                 return 0;
2717
2718         printed += fprintf(fp, "\n");
2719
2720         printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2721         printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2722         printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2723
2724         /* each int_node is a syscall */
2725         while (inode) {
2726                 stats = inode->priv;
2727                 if (stats) {
2728                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2729                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2730                         double avg = avg_stats(stats);
2731                         double pct;
2732                         u64 n = (u64) stats->n;
2733
2734                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2735                         avg /= NSEC_PER_MSEC;
2736
2737                         sc = &trace->syscalls.table[inode->i];
2738                         printed += fprintf(fp, "   %-15s", sc->name);
2739                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2740                                            n, min, avg);
2741                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2742                 }
2743
2744                 inode = intlist__next(inode);
2745         }
2746
2747         printed += fprintf(fp, "\n\n");
2748
2749         return printed;
2750 }
2751
/*
 * Context handed, through the void *priv argument, to the callback that
 * prints the summary of a single thread.
 */
struct summary_data {
        FILE *fp;               /* stream the summary is written to */
        struct trace *trace;    /* trace session the threads belong to */
        size_t printed;         /* running total of the fprintf() results */
};
2758
2759 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2760 {
2761         struct summary_data *data = priv;
2762         FILE *fp = data->fp;
2763         size_t printed = data->printed;
2764         struct trace *trace = data->trace;
2765         struct thread_trace *ttrace = thread__priv(thread);
2766         double ratio;
2767
2768         if (ttrace == NULL)
2769                 return 0;
2770
2771         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2772
2773         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2774         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2775         printed += fprintf(fp, "%.1f%%", ratio);
2776         if (ttrace->pfmaj)
2777                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2778         if (ttrace->pfmin)
2779                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2780         printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2781         printed += thread__dump_stats(ttrace, trace, fp);
2782
2783         data->printed += printed;
2784
2785         return 0;
2786 }
2787
2788 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2789 {
2790         struct summary_data data = {
2791                 .fp = fp,
2792                 .trace = trace
2793         };
2794         data.printed = trace__fprintf_threads_header(fp);
2795
2796         machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2797
2798         return data.printed;
2799 }
2800
2801 static int trace__set_duration(const struct option *opt, const char *str,
2802                                int unset __maybe_unused)
2803 {
2804         struct trace *trace = opt->value;
2805
2806         trace->duration_filter = atof(str);
2807         return 0;
2808 }
2809
2810 static int trace__set_filter_pids(const struct option *opt, const char *str,
2811                                   int unset __maybe_unused)
2812 {
2813         int ret = -1;
2814         size_t i;
2815         struct trace *trace = opt->value;
2816         /*
2817          * FIXME: introduce a intarray class, plain parse csv and create a
2818          * { int nr, int entries[] } struct...
2819          */
2820         struct intlist *list = intlist__new(str);
2821
2822         if (list == NULL)
2823                 return -1;
2824
2825         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2826         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2827
2828         if (trace->filter_pids.entries == NULL)
2829                 goto out;
2830
2831         trace->filter_pids.entries[0] = getpid();
2832
2833         for (i = 1; i < trace->filter_pids.nr; ++i)
2834                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2835
2836         intlist__delete(list);
2837         ret = 0;
2838 out:
2839         return ret;
2840 }
2841
2842 static int trace__open_output(struct trace *trace, const char *filename)
2843 {
2844         struct stat st;
2845
2846         if (!stat(filename, &st) && st.st_size) {
2847                 char oldname[PATH_MAX];
2848
2849                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2850                 unlink(oldname);
2851                 rename(filename, oldname);
2852         }
2853
2854         trace->output = fopen(filename, "w");
2855
2856         return trace->output == NULL ? -errno : 0;
2857 }
2858
2859 static int parse_pagefaults(const struct option *opt, const char *str,
2860                             int unset __maybe_unused)
2861 {
2862         int *trace_pgfaults = opt->value;
2863
2864         if (strcmp(str, "all") == 0)
2865                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2866         else if (strcmp(str, "maj") == 0)
2867                 *trace_pgfaults |= TRACE_PFMAJ;
2868         else if (strcmp(str, "min") == 0)
2869                 *trace_pgfaults |= TRACE_PFMIN;
2870         else
2871                 return -1;
2872
2873         return 0;
2874 }
2875
2876 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2877 {
2878         struct perf_evsel *evsel;
2879
2880         evlist__for_each(evlist, evsel)
2881                 evsel->handler = handler;
2882 }
2883
2884 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2885 {
2886         const char *trace_usage[] = {
2887                 "perf trace [<options>] [<command>]",
2888                 "perf trace [<options>] -- <command> [<options>]",
2889                 "perf trace record [<options>] [<command>]",
2890                 "perf trace record [<options>] -- <command> [<options>]",
2891                 NULL
2892         };
2893         struct trace trace = {
2894                 .audit = {
2895                         .machine = audit_detect_machine(),
2896                         .open_id = audit_name_to_syscall("open", trace.audit.machine),
2897                 },
2898                 .syscalls = {
2899                         . max = -1,
2900                 },
2901                 .opts = {
2902                         .target = {
2903                                 .uid       = UINT_MAX,
2904                                 .uses_mmap = true,
2905                         },
2906                         .user_freq     = UINT_MAX,
2907                         .user_interval = ULLONG_MAX,
2908                         .no_buffering  = true,
2909                         .mmap_pages    = UINT_MAX,
2910                         .proc_map_timeout  = 500,
2911                 },
2912                 .output = stdout,
2913                 .show_comm = true,
2914                 .trace_syscalls = true,
2915         };
2916         const char *output_name = NULL;
2917         const char *ev_qualifier_str = NULL;
2918         const struct option trace_options[] = {
2919         OPT_CALLBACK(0, "event", &trace.evlist, "event",
2920                      "event selector. use 'perf list' to list available events",
2921                      parse_events_option),
2922         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2923                     "show the thread COMM next to its id"),
2924         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2925         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2926         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2927         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2928         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2929                     "trace events on existing process id"),
2930         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2931                     "trace events on existing thread id"),
2932         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2933                      "pids to filter (by the kernel)", trace__set_filter_pids),
2934         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2935                     "system-wide collection from all CPUs"),
2936         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2937                     "list of cpus to monitor"),
2938         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2939                     "child tasks do not inherit counters"),
2940         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2941                      "number of mmap data pages",
2942                      perf_evlist__parse_mmap_pages),
2943         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2944                    "user to profile"),
2945         OPT_CALLBACK(0, "duration", &trace, "float",
2946                      "show only events with duration > N.M ms",
2947                      trace__set_duration),
2948         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2949         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2950         OPT_BOOLEAN('T', "time", &trace.full_time,
2951                     "Show full timestamp, not time relative to first start"),
2952         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2953                     "Show only syscall summary with statistics"),
2954         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2955                     "Show all syscalls and summary with statistics"),
2956         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2957                      "Trace pagefaults", parse_pagefaults, "maj"),
2958         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2959         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2960         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2961                         "per thread proc mmap processing timeout in ms"),
2962         OPT_END()
2963         };
2964         const char * const trace_subcommands[] = { "record", NULL };
2965         int err;
2966         char bf[BUFSIZ];
2967
2968         signal(SIGSEGV, sighandler_dump_stack);
2969         signal(SIGFPE, sighandler_dump_stack);
2970
2971         trace.evlist = perf_evlist__new();
2972
2973         if (trace.evlist == NULL) {
2974                 pr_err("Not enough memory to run!\n");
2975                 err = -ENOMEM;
2976                 goto out;
2977         }
2978
2979         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2980                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2981
2982         if (trace.trace_pgfaults) {
2983                 trace.opts.sample_address = true;
2984                 trace.opts.sample_time = true;
2985         }
2986
2987         if (trace.evlist->nr_entries > 0)
2988                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2989
2990         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2991                 return trace__record(&trace, argc-1, &argv[1]);
2992
2993         /* summary_only implies summary option, but don't overwrite summary if set */
2994         if (trace.summary_only)
2995                 trace.summary = trace.summary_only;
2996
2997         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2998             trace.evlist->nr_entries == 0 /* Was --events used? */) {
2999                 pr_err("Please specify something to trace.\n");
3000                 return -1;
3001         }
3002
3003         if (output_name != NULL) {
3004                 err = trace__open_output(&trace, output_name);
3005                 if (err < 0) {
3006                         perror("failed to create output file");
3007                         goto out;
3008                 }
3009         }
3010
3011         if (ev_qualifier_str != NULL) {
3012                 const char *s = ev_qualifier_str;
3013                 struct strlist_config slist_config = {
3014                         .dirname = system_path(STRACE_GROUPS_DIR),
3015                 };
3016
3017                 trace.not_ev_qualifier = *s == '!';
3018                 if (trace.not_ev_qualifier)
3019                         ++s;
3020                 trace.ev_qualifier = strlist__new(s, &slist_config);
3021                 if (trace.ev_qualifier == NULL) {
3022                         fputs("Not enough memory to parse event qualifier",
3023                               trace.output);
3024                         err = -ENOMEM;
3025                         goto out_close;
3026                 }
3027
3028                 err = trace__validate_ev_qualifier(&trace);
3029                 if (err)
3030                         goto out_close;
3031         }
3032
3033         err = target__validate(&trace.opts.target);
3034         if (err) {
3035                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3036                 fprintf(trace.output, "%s", bf);
3037                 goto out_close;
3038         }
3039
3040         err = target__parse_uid(&trace.opts.target);
3041         if (err) {
3042                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3043                 fprintf(trace.output, "%s", bf);
3044                 goto out_close;
3045         }
3046
3047         if (!argc && target__none(&trace.opts.target))
3048                 trace.opts.target.system_wide = true;
3049
3050         if (input_name)
3051                 err = trace__replay(&trace);
3052         else
3053                 err = trace__run(&trace, argc, argv);
3054
3055 out_close:
3056         if (output_name != NULL)
3057                 fclose(trace.output);
3058 out:
3059         return err;
3060 }