4 * Builtin record command: Record the profile of a workload
5 * (or a CPU, or a PID) into the perf.data output file - for
6 * later analysis via perf report.
8 #define _FILE_OFFSET_BITS 64
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
	/*
	 * NOTE(review): these are members of a struct (presumably
	 * `struct perf_record`) whose opening line is not visible in this
	 * excerpt — confirm against the full file.
	 */
	struct perf_tool tool;			/* event-processing callbacks; container_of() anchor */
	struct perf_record_opts opts;		/* recording options filled in by option parsing */
	const char *output_name;		/* output file name; defaults to "perf.data" in __cmd_record() */
	struct perf_evlist *evlist;		/* events being recorded */
	struct perf_session *session;		/* session backing the output file */
	unsigned int page_size;			/* sysconf(_SC_PAGE_SIZE); used to locate mmap data area */
	enum write_mode_t write_mode;		/* WRITE_FORCE or WRITE_APPEND (see cmd_record()) */
	bool no_buildid_cache;			/* -N: do not update the buildid cache */
	off_t post_processing_offset;		/* file offset where sampled data starts; buildid pass begins here */
/*
 * Account for 'size' bytes that something else already wrote directly to
 * rec->output (e.g. synthesized tracing data), so the byte counter used
 * for the final header/summary stays accurate.
 * NOTE(review): the function's braces are not visible in this excerpt.
 */
static void advance_output(struct perf_record *rec, size_t size)
	rec->bytes_written += size;
/*
 * Write 'size' bytes from 'buf' to the output file, dying on write
 * failure, and account the bytes in rec->bytes_written.
 * NOTE(review): this excerpt elides lines — upstream loops until the
 * whole buffer is consumed (short writes); confirm against full file.
 */
static void write_output(struct perf_record *rec, void *buf, size_t size)
	int ret = write(rec->output, buf, size);
	/* write(2) returned < 0: fatal for a recording session */
	die("failed to write");
	rec->bytes_written += ret;
/*
 * perf_tool callback: forward a synthesized event straight to the output
 * file. 'sample' and 'machine' are unused here.
 * NOTE(review): the `return 0;` expected for this int-returning callback
 * is not visible in this excerpt.
 */
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __used,
				     struct machine *machine __used)
	/* recover the enclosing perf_record from the embedded tool member */
	struct perf_record *rec = container_of(tool, struct perf_record, tool);
	write_output(rec, event, event->header.size);
/*
 * Drain one mmap'ed ring buffer: copy everything between the previously
 * consumed position (md->prev) and the kernel's write head to the output
 * file, then publish the new tail so the kernel can reuse the space.
 * NOTE(review): several lines (size computation, second-chunk size) are
 * elided in this excerpt.
 */
static void perf_record__mmap_read(struct perf_record *rec,
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	/* data area starts one page past the base (first page is the control page) */
	unsigned char *data = md->base + rec->page_size;
	/*
	 * If the region wrapped around the end of the ring buffer, write the
	 * tail-end chunk first...
	 */
	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		write_output(rec, buf, size);
	/* ...then the chunk from the start of the buffer up to head. */
	buf = &data[old & md->mask];
	write_output(rec, buf, size);
	/* tell the kernel we consumed up to 'old' (advanced above) */
	perf_mmap__write_tail(md, old);
/* Flags shared with the signal handler; volatile, polled from the main loop. */
static volatile int done = 0;		/* set to stop the record loop — presumably in sig_handler (body not visible) */
static volatile int signr = -1;		/* which fatal signal arrived, or -1 for a normal exit */
static volatile int child_finished = 0;	/* set when SIGCHLD reports the workload exited — TODO confirm */

/* Handler for SIGCHLD/SIGINT/SIGUSR1 (installed in __cmd_record); body elided here. */
static void sig_handler(int sig)
/*
 * on_exit() handler (registered in __cmd_record): terminate the forked
 * workload if it is still running, then, if we are exiting because of a
 * fatal signal, re-raise it with the default disposition so our exit
 * status correctly reflects death-by-signal.
 */
static void perf_record__sig_exit(int exit_status __used, void *arg)
	struct perf_record *rec = arg;
	if (rec->evlist->workload.pid > 0) {
		kill(rec->evlist->workload.pid, SIGTERM);
	/* report how the workload died, if it was killed by a signal */
	if (WIFSIGNALED(status))
		psignal(WTERMSIG(status), rec->progname);
	/* SIGUSR1 is treated as a normal "stop recording" request, not fatal */
	if (signr == -1 || signr == SIGUSR1)
	signal(signr, SIG_DFL);
	kill(getpid(), signr);
159 static bool perf_evlist__equal(struct perf_evlist *evlist,
160 struct perf_evlist *other)
162 struct perf_evsel *pos, *pair;
164 if (evlist->nr_entries != other->nr_entries)
167 pair = list_entry(other->entries.next, struct perf_evsel, node);
169 list_for_each_entry(pos, &evlist->entries, node) {
170 if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
172 pair = list_entry(pair->node.next, struct perf_evsel, node);
/*
 * Configure and open all counters in rec->evlist, with several fallback
 * paths for older kernels / missing hardware, then mmap the ring buffers.
 * NOTE(review): many lines (braces, the `retry_sample_id:` label that the
 * goto below targets, err assignment) are elided in this excerpt.
 */
static void perf_record__open(struct perf_record *rec)
	struct perf_evsel *pos, *first;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
	first = list_entry(evlist->entries.next, struct perf_evsel, node);
	perf_evlist__config_attrs(evlist, opts);
	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		struct xyarray *group_fd = NULL;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
		/* non-leader group members share the leader's fd array */
		if (opts->group && pos != first)
			group_fd = first->fd;
		attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0;
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads,
				     opts->group, group_fd) < 0) {
			if (err == EPERM || err == EACCES) {
				/* perf_event_paranoid refused us */
				ui__error_paranoid();
			} else if (err == ENODEV && opts->cpu_list) {
				die("No such device - did you specify"
				    " an out-of-range profile CPU?\n");
			} else if (err == EINVAL && opts->sample_id_all_avail) {
				/*
				 * Old kernel, no attr->sample_id_type_all field:
				 * retry without sample_id_all (and without
				 * PERF_SAMPLE_TIME unless explicitly wanted).
				 */
				opts->sample_id_all_avail = false;
				if (!opts->sample_time && !opts->raw_samples && !time_needed)
					attr->sample_type &= ~PERF_SAMPLE_TIME;
				goto retry_sample_id;
			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support:
			 */
			if (attr->type == PERF_TYPE_HARDWARE
			    && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
				ui__warning("The cycles event is not supported, "
					    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
			/* no fallback available for other events */
			ui__warning("The %s event is not supported.\n",
			error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
#if defined(__i386__) || defined(__x86_64__)
			/* x86 without a working APIC cannot do HW sampling */
			if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
				die("No hardware sampling interrupt available."
				    " No APIC? If so then you can boot the kernel"
				    " with the \"lapic\" boot parameter to"
				    " force-enable it.\n");
			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
	if (perf_evlist__set_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		/* EPERM here usually means the mlock limit was hit */
		die("Permission error mapping pages.\n"
		    "Consider increasing "
		    "/proc/sys/kernel/perf_event_mlock_kb,\n"
		    "or try again with a smaller value of -m/--mmap_pages.\n"
		    "(current value: %d)\n", opts->mmap_pages);
		else if (!is_power_of_2(opts->mmap_pages))
			die("--mmap_pages/-m value must be a power of two.");
		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
	session->evlist = evlist;
	/* appending: the on-disk event list must match what we just opened */
	if (!perf_evlist__equal(session->evlist, evlist)) {
		fprintf(stderr, "incompatible append\n");
	perf_session__update_sample_type(session);
/*
 * Re-scan the part of the output file that contains sampled data
 * (everything after post_processing_offset) with the build-id ops, so
 * DSOs that actually got hits are marked for the buildid table/cache.
 */
static int process_buildids(struct perf_record *rec)
	/* current file position == total size written so far */
	u64 size = lseek(rec->output, 0, SEEK_CUR);
	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
/*
 * on_exit() handler (registered in __cmd_record): finalize the perf.data
 * file — fix up the header's data size, run the buildid pass unless
 * disabled, rewrite the header, and tear down session and evlist.
 * Skipped entirely when writing to a pipe (no seekable header).
 */
static void perf_record__exit(int status __used, void *arg)
	struct perf_record *rec = arg;
	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;
		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
/*
 * Per-machine callback (see perf_session__process_machines in
 * __cmd_record): synthesize module and kernel mmap events for each guest
 * machine so guest-side samples can be resolved. Skips the host.
 */
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
	struct perf_tool *tool = data;
	if (machine__is_host(machine))
	/*
	 * As for guest kernel when processing subcommand record & report,
	 * we arrange module mmap prior to guest kernel mmap and trigger
	 * a preload dso because default guest module symbols are loaded
	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 * method is used to avoid symbol missing when the first addr is
	 * in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
/* Header-only marker event flushed after each full pass over the mmaps. */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
/*
 * Drain every mmap'ed ring buffer once, then emit a FINISHED_ROUND
 * marker (only when tracepoint data is being recorded) so the report
 * side knows it can flush/reorder events up to this point.
 */
static void perf_record__mmap_read_all(struct perf_record *rec)
	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		if (rec->evlist->mmap[i].base)
			perf_record__mmap_read(rec, &rec->evlist->mmap[i]);
	if (perf_header__has_feat(&rec->session->header, HEADER_TRACE_INFO))
		write_output(rec, &finished_round_event, sizeof(finished_round_event));
/*
 * Main recording driver: set up signals and the output file/session,
 * optionally fork the workload, open counters, synthesize the initial
 * events (attrs, kernel/modules mmaps, threads), then loop draining the
 * ring buffers until the workload/user is done.
 * NOTE(review): this excerpt elides many lines (error checks, braces,
 * the main while-loop header); comments below describe only what is
 * visible.
 */
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
	int err, output, feat;
	unsigned long waking = 0;
	/* leftover argv means we fork and profile that command */
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;
	rec->progname = argv[0];
	rec->page_size = sysconf(_SC_PAGE_SIZE);
	/* cleanup/teardown handlers run in reverse registration order */
	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);
	/* no -o given: pipe mode if stdout is a FIFO, else default file name */
	if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
		opts->pipe_output = true;
	rec->output_name = output_name = "perf.data";
	if (!strcmp(output_name, "-"))
		opts->pipe_output = true;
	else if (!stat(output_name, &st) && st.st_size) {
		/* existing non-empty file: rotate to .old on force... */
		if (rec->write_mode == WRITE_FORCE) {
			char oldname[PATH_MAX];
			snprintf(oldname, sizeof(oldname), "%s.old",
			rename(output_name, oldname);
		} else if (rec->write_mode == WRITE_APPEND) {
			rec->write_mode = WRITE_FORCE;
	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
	if (opts->pipe_output)
		output = STDOUT_FILENO;
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
		perror("failed to create output file");
	rec->output = output;
	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
	rec->session = session;
	/* enable every header feature, then clear the ones not wanted */
	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);
	perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACE_INFO);
	/* appending: pick up the existing file's header first */
	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
			goto out_delete_session;
	err = perf_evlist__prepare_workload(evsel_list, opts, argv);
		pr_err("Couldn't run the workload!\n");
		goto out_delete_session;
	perf_record__open(rec);
	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);
	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
	/* everything after this offset is sample data (see process_buildids) */
	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);
	machine = perf_session__find_host_machine(session);
		pr_err("Couldn't find native kernel information.\n");
	if (opts->pipe_output) {
		/* pipe readers can't seek: attrs/types must be in-band */
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
			pr_err("Couldn't synthesize attrs.\n");
		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
			pr_err("Couldn't synthesize event_types.\n");
		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
				pr_err("Couldn't record tracing data.\n");
			/* tracing data was written directly; account for it */
			advance_output(rec, err);
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");
	perf_session__process_machines(session, tool,
				       perf_event__synthesize_guest_os);
	if (!opts->system_wide)
		perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
		perf_event__synthesize_threads(tool, process_synthesized_event,
	if (rec->realtime_prio) {
		struct sched_param param;
		param.sched_priority = rec->realtime_prio;
		/*
		 * NOTE(review): "¶m" below is mojibake — the HTML entity
		 * "&para;" swallowed "&para" out of "&param".  The argument
		 * must be &param; fix the file's encoding.
		 */
		if (sched_setscheduler(0, SCHED_FIFO, ¶m)) {
			pr_err("Could not set realtime priority.\n");
	perf_evlist__enable(evsel_list);
	perf_evlist__start_workload(evsel_list);
		/* main loop body: drain mmaps; poll only when nothing arrived */
		int hits = rec->samples;
		perf_record__mmap_read_all(rec);
		if (hits == rec->samples) {
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			perf_evlist__disable(evsel_list);
	/* SIGUSR1 means "stop quietly" */
	if (quiet || signr == SIGUSR1)
	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
	/*
	 * Approximate RIP event size: 24 bytes.
	 */
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		rec->bytes_written / 24);
	perf_session__delete(session);
/* Usage strings shown by usage_with_options() / parse_options(). */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
637 * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
638 * because we need to have access to it in perf_record__exit, that is called
639 * after cmd_record() exits, but since record_options need to be accessible to
640 * builtin-script, leave it here.
 * At least we don't touch it in all the other functions here directly.
644 * Just say no to tons of global variables, sigh.
/* Global record state; defaults below mean "not set by the user". */
static struct perf_record record = {
		/* UINT_MAX/ULLONG_MAX sentinels are checked in cmd_record()
		 * before overriding the kernel/default values */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.sample_id_all_avail = true,	/* assume modern kernel; cleared on EINVAL fallback */
	.write_mode = WRITE_FORCE,
661 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
662 * with it and switch to use the library functions in perf_evlist that came
663 * from builtin-record.c, i.e. use perf_record_opts,
664 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
/*
 * Command-line options for 'perf record'; results land in the global
 * 'record' above.  Kept non-static because builtin-script.c uses it
 * (see the XXX comment above).
 */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_INTEGER('p', "pid", &record.opts.target_pid,
		    "record events on existing process id"),
	OPT_INTEGER('t', "tid", &record.opts.target_tid,
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &record.append_file,
		    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &record.opts.cpu_list, "cpu",
		   "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &record.force,
		    "overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_BOOLEAN('g', "call-graph", &record.opts.call_graph,
		    "do call-graph (stack chain/backtrace) recording"),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
	OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
/*
 * Entry point for 'perf record': parse options, validate the combination
 * (append vs force, cgroup needs -a, target pid/tid/uid), build the
 * cpu/thread maps and the event list, resolve period/frequency defaults,
 * then hand off to __cmd_record().
 * NOTE(review): several lines (returns, else-branch bodies, cleanup
 * labels) are elided in this excerpt.
 */
int cmd_record(int argc, const char **argv, const char *prefix __used)
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	perf_header__set_cmdline(argc, argv);
	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
	rec->evlist = evsel_list;
	argc = parse_options(argc, argv, record_options, record_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	/* must have something to profile: a command, pid, tid, cpu list, -a or uid */
	if (!argc && rec->opts.target_pid == -1 && rec->opts.target_tid == -1 &&
	    !rec->opts.system_wide && !rec->opts.cpu_list && !rec->uid_str)
		usage_with_options(record_usage, record_options);
	if (rec->force && rec->append_file) {
		fprintf(stderr, "Can't overwrite and append at the same time."
			" You need to choose between -f and -A");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
		rec->write_mode = WRITE_FORCE;
	/* cgroup filtering only makes sense with system-wide collection */
	if (nr_cgroups && !rec->opts.system_wide) {
		fprintf(stderr, "cgroup monitoring only available in"
			" system-wide mode\n");
		usage_with_options(record_usage, record_options);
	if (symbol_conf.kptr_restrict)
		"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
		"check /proc/sys/kernel/kptr_restrict.\n\n"
		"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
		"file is not found in the buildid cache or in the vmlinux path.\n\n"
		"Samples in kernel modules won't be resolved at all.\n\n"
		"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
		"even with a suitable vmlinux or kallsyms file.\n\n");
	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();
	/* no -e given: fall back to the default event */
	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	rec->opts.uid = parse_target_uid(rec->uid_str, rec->opts.target_tid,
					 rec->opts.target_pid);
	/* UINT_MAX - 1 appears to be parse_target_uid's error sentinel — TODO confirm */
	if (rec->uid_str != NULL && rec->opts.uid == UINT_MAX - 1)
	if (rec->opts.target_pid != -1)
		rec->opts.target_tid = rec->opts.target_pid;
	if (perf_evlist__create_maps(evsel_list, rec->opts.target_pid,
				     rec->opts.target_tid, rec->opts.uid,
				     rec->opts.cpu_list) < 0)
		usage_with_options(record_usage, record_options);
	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, event_name(pos)))
	/* sentinel values mean "unset"; see the 'record' initializer above */
	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;
	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
		fprintf(stderr, "frequency and count are zero, aborting\n");
	err = __cmd_record(&record, argc, argv);
	perf_evlist__delete_maps(evsel_list);