4 * Builtin record command: Record the profile of a workload
5 * (or a CPU, or a PID) into the perf.data output file - for
6 * later analysis via perf report.
8 #define _FILE_OFFSET_BITS 64
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
/*
 * Per-invocation state for 'perf record'.
 * NOTE(review): the enclosing 'struct perf_record {' line and several
 * members are elided in this chunk — confirm against the full file.
 */
40 struct perf_tool tool;
/* Recording options parsed from the command line (see record_options). */
41 struct perf_record_opts opts;
/* Output file name; defaults to "perf.data" in __cmd_record(). */
43 const char *output_name;
44 struct perf_evlist *evlist;
45 struct perf_session *session;
/* System page size; event data starts one page into each mmap area. */
49 unsigned int page_size;
/* WRITE_FORCE or WRITE_APPEND, chosen from the -f/-A options. */
51 enum write_mode_t write_mode;
53 bool no_buildid_cache;
/* File offset right after the headers; build-id pass starts here. */
58 off_t post_processing_offset;
61 static void advance_output(struct perf_record *rec, size_t size)
63 rec->bytes_written += size;
66 static void write_output(struct perf_record *rec, void *buf, size_t size)
69 int ret = write(rec->output, buf, size);
72 die("failed to write");
77 rec->bytes_written += ret;
81 static int process_synthesized_event(struct perf_tool *tool,
82 union perf_event *event,
83 struct perf_sample *sample __used,
84 struct machine *machine __used)
86 struct perf_record *rec = container_of(tool, struct perf_record, tool);
87 write_output(rec, event, event->header.size);
/*
 * Drain one mmap'ed ring buffer into the output file.  The unread span
 * runs from md->prev to the kernel-published head; when it wraps past
 * the end of the ring it is written in two chunks.
 * NOTE(review): several lines (the size computation, the early return
 * when old == head, and the md->prev update) are elided in this chunk.
 */
91 static void perf_record__mmap_read(struct perf_record *rec,
94 unsigned int head = perf_mmap__read_head(md);
95 unsigned int old = md->prev;
/* Event data starts one page after the control page. */
96 unsigned char *data = md->base + rec->page_size;
/* Wrapped case: write from 'old' up to the end of the ring first. */
107 if ((old & md->mask) + size != (head & md->mask)) {
108 buf = &data[old & md->mask];
109 size = md->mask + 1 - (old & md->mask);
112 write_output(rec, buf, size);
/* Then (or in the unwrapped case) write the remaining chunk. */
115 buf = &data[old & md->mask];
119 write_output(rec, buf, size);
/* Publish the new tail so the kernel can reuse the consumed space. */
122 perf_mmap__write_tail(md, old);
/* Main-loop termination flag, set from the signal handler. */
125 static volatile int done = 0;
/* Last signal received; re-raised with default disposition at exit. */
126 static volatile int signr = -1;
/* Set when SIGCHLD reports that the forked workload exited. */
127 static volatile int child_finished = 0;
/*
 * Handler installed for SIGCHLD, SIGINT and SIGUSR1 in __cmd_record().
 * Body elided in this chunk — presumably records the signal in 'signr'
 * and sets 'done'/'child_finished'; confirm against the full file.
 */
129 static void sig_handler(int sig)
/*
 * on_exit() handler: terminate a still-running forked workload and
 * re-raise the fatal signal with its default disposition so the parent
 * shell sees the true exit reason.
 */
138 static void perf_record__sig_exit(int exit_status __used, void *arg)
140 struct perf_record *rec = arg;
/* If we forked a workload and it is still alive, make sure it dies. */
143 if (rec->evlist->workload.pid > 0) {
145 kill(rec->evlist->workload.pid, SIGTERM);
148 if (WIFSIGNALED(status))
149 psignal(WTERMSIG(status), rec->progname);
/* SIGUSR1 is the expected "stop recording" signal: nothing to re-raise. */
152 if (signr == -1 || signr == SIGUSR1)
155 signal(signr, SIG_DFL);
156 kill(getpid(), signr);
159 static bool perf_evlist__equal(struct perf_evlist *evlist,
160 struct perf_evlist *other)
162 struct perf_evsel *pos, *pair;
164 if (evlist->nr_entries != other->nr_entries)
167 pair = list_entry(other->entries.next, struct perf_evsel, node);
169 list_for_each_entry(pos, &evlist->entries, node) {
170 if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
172 pair = list_entry(pair->node.next, struct perf_evsel, node);
/*
 * Open a counter for every event in rec->evlist, applying fallbacks
 * for older kernels (no sample_id_all, no hardware cycles event), set
 * event filters and mmap the per-cpu ring buffers.  Unrecoverable
 * failures die().
 * NOTE(review): labels (e.g. retry_sample_id), closing braces and some
 * branches are elided in this chunk.
 */
178 static void perf_record__open(struct perf_record *rec)
180 struct perf_evsel *pos, *first;
181 struct perf_evlist *evlist = rec->evlist;
182 struct perf_session *session = rec->session;
183 struct perf_record_opts *opts = &rec->opts;
185 first = list_entry(evlist->entries.next, struct perf_evsel, node);
187 perf_evlist__config_attrs(evlist, opts);
189 list_for_each_entry(pos, &evlist->entries, node) {
190 struct perf_event_attr *attr = &pos->attr;
/* When grouping, all but the leader open against the leader's fds. */
191 struct xyarray *group_fd = NULL;
193 * Check if parse_single_tracepoint_event has already asked for
196 * XXX this is kludgy but short term fix for problems introduced by
197 * eac23d1c that broke 'perf script' by having different sample_types
198 * when using multiple tracepoint events when we use a perf binary
199 * that tries to use sample_id_all on an older kernel.
201 * We need to move counter creation to perf_session, support
202 * different sample_types, etc.
204 bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
206 if (opts->group && pos != first)
207 group_fd = first->fd;
209 attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0;
211 if (perf_evsel__open(pos, evlist->cpus, evlist->threads,
212 opts->group, group_fd) < 0) {
215 if (err == EPERM || err == EACCES) {
216 ui__error_paranoid();
218 } else if (err == ENODEV && opts->cpu_list) {
219 die("No such device - did you specify"
220 " an out-of-range profile CPU?\n");
221 } else if (err == EINVAL && opts->sample_id_all_avail) {
223 * Old kernel, no attr->sample_id_type_all field
225 opts->sample_id_all_avail = false;
226 if (!opts->sample_time && !opts->raw_samples && !time_needed)
227 attr->sample_type &= ~PERF_SAMPLE_TIME;
/* Retry the open without sample_id_all (label elided here). */
229 goto retry_sample_id;
233 * If it's cycles then fall back to hrtimer
234 * based cpu-clock-tick sw counter, which
235 * is always available even if no PMU support:
237 if (attr->type == PERF_TYPE_HARDWARE
238 && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
241 ui__warning("The cycles event is not supported, "
242 "trying to fall back to cpu-clock-ticks\n");
243 attr->type = PERF_TYPE_SOFTWARE;
244 attr->config = PERF_COUNT_SW_CPU_CLOCK;
249 ui__warning("The %s event is not supported.\n",
255 error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
258 #if defined(__i386__) || defined(__x86_64__)
259 if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
260 die("No hardware sampling interrupt available."
261 " No APIC? If so then you can boot the kernel"
262 " with the \"lapic\" boot parameter to"
263 " force-enable it.\n");
266 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
270 if (perf_evlist__set_filters(evlist)) {
271 error("failed to set filter with %d (%s)\n", errno,
/* Map the ring buffers; EPERM here usually means the mlock limit. */
276 if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
278 die("Permission error mapping pages.\n"
279 "Consider increasing "
280 "/proc/sys/kernel/perf_event_mlock_kb,\n"
281 "or try again with a smaller value of -m/--mmap_pages.\n"
282 "(current value: %d)\n", opts->mmap_pages);
283 else if (!is_power_of_2(opts->mmap_pages))
284 die("--mmap_pages/-m value must be a power of two.");
286 die("failed to mmap with %d (%s)\n", errno, strerror(errno));
/* New file: adopt our evlist; append: it must match what was recorded. */
290 session->evlist = evlist;
292 if (!perf_evlist__equal(session->evlist, evlist)) {
293 fprintf(stderr, "incompatible append\n");
298 perf_session__update_sample_type(session);
/*
 * Re-read the events written so far (from post_processing_offset up to
 * the current file offset) and mark the DSOs that got hits, so that
 * only their build-ids end up in the perf.data header.
 */
301 static int process_buildids(struct perf_record *rec)
303 u64 size = lseek(rec->output, 0, SEEK_CUR);
/* Point the session at our output fd so it can re-process the events. */
308 rec->session->fd = rec->output;
309 return __perf_session__process_events(rec->session, rec->post_processing_offset,
310 size - rec->post_processing_offset,
311 size, &build_id__mark_dso_hit_ops);
/*
 * on_exit() handler: finalize the perf.data header (data size,
 * build-ids), rewrite it in place and release the session and evlist.
 * Skipped when piping, since a pipe has no rewritable header.
 */
314 static void perf_record__exit(int status __used, void *arg)
316 struct perf_record *rec = arg;
318 if (!rec->opts.pipe_output) {
/* Account everything written after the header into data_size. */
319 rec->session->header.data_size += rec->bytes_written;
321 if (!rec->no_buildid)
322 process_buildids(rec);
323 perf_session__write_header(rec->session, rec->evlist,
325 perf_session__delete(rec->session);
326 perf_evlist__delete(rec->evlist);
/*
 * perf_session__process_machines() callback: synthesize module and
 * kernel mmap events for one guest machine (the host is skipped —
 * it is handled separately in __cmd_record()).
 */
331 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
334 struct perf_tool *tool = data;
336 if (machine__is_host(machine))
340 *As for guest kernel when processing subcommand record&report,
341 *we arrange module mmap prior to guest kernel mmap and trigger
342 *a preload dso because default guest module symbols are loaded
343 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
344 *method is used to avoid symbol missing when the first addr is
345 *in module instead of in guest kernel.
347 err = perf_event__synthesize_modules(tool, process_synthesized_event,
350 pr_err("Couldn't record guest kernel [%d]'s reference"
351 " relocation symbol.\n", machine->pid);
354 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
355 * have no _text sometimes.
357 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
/* Fallback path: retry with the alternate reference symbol. */
360 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
363 pr_err("Couldn't record guest kernel [%d]'s reference"
364 " relocation symbol.\n", machine->pid);
/*
 * PERF_RECORD_FINISHED_ROUND marker, emitted after draining all mmaps
 * in one pass so the report phase can flush/sort events per round.
 */
367 static struct perf_event_header finished_round_event = {
368 .size = sizeof(struct perf_event_header),
369 .type = PERF_RECORD_FINISHED_ROUND,
/*
 * Drain every mapped ring buffer once, then emit a FINISHED_ROUND
 * marker (only meaningful when tracepoint data is being recorded).
 */
372 static void perf_record__mmap_read_all(struct perf_record *rec)
376 for (i = 0; i < rec->evlist->nr_mmaps; i++) {
/* base may be NULL for per-thread maps that were never created. */
377 if (rec->evlist->mmap[i].base)
378 perf_record__mmap_read(rec, &rec->evlist->mmap[i]);
381 if (perf_header__has_feat(&rec->session->header, HEADER_TRACE_INFO))
382 write_output(rec, &finished_round_event, sizeof(finished_round_event));
/*
 * Main recording driver: set up signal/exit handlers and the output
 * file, create the perf_session, open the counters, synthesize the
 * initial metadata events (attrs, kernel/module mmaps, threads), start
 * the workload and loop draining the mmap buffers until 'done'.
 * NOTE(review): many lines (error checks, braces, the main while loop
 * header) are elided in this chunk — verify against the full file.
 */
385 static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
389 int err, output, feat;
390 unsigned long waking = 0;
/* Extra argv entries mean a workload command to fork and profile. */
391 const bool forks = argc > 0;
392 struct machine *machine;
393 struct perf_tool *tool = &rec->tool;
394 struct perf_record_opts *opts = &rec->opts;
395 struct perf_evlist *evsel_list = rec->evlist;
396 const char *output_name = rec->output_name;
397 struct perf_session *session;
399 rec->progname = argv[0];
401 rec->page_size = sysconf(_SC_PAGE_SIZE);
403 on_exit(perf_record__sig_exit, rec);
404 signal(SIGCHLD, sig_handler);
405 signal(SIGINT, sig_handler);
406 signal(SIGUSR1, sig_handler);
/* No -o given: pipe mode if stdout is a fifo, else "perf.data". */
409 if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
410 opts->pipe_output = true;
412 rec->output_name = output_name = "perf.data";
415 if (!strcmp(output_name, "-"))
416 opts->pipe_output = true;
417 else if (!stat(output_name, &st) && st.st_size) {
/* Existing non-empty file: rotate to .old when forcing. */
418 if (rec->write_mode == WRITE_FORCE) {
419 char oldname[PATH_MAX];
420 snprintf(oldname, sizeof(oldname), "%s.old",
423 rename(output_name, oldname);
425 } else if (rec->write_mode == WRITE_APPEND) {
426 rec->write_mode = WRITE_FORCE;
430 flags = O_CREAT|O_RDWR;
431 if (rec->write_mode == WRITE_APPEND)
436 if (opts->pipe_output)
437 output = STDOUT_FILENO;
439 output = open(output_name, flags, S_IRUSR | S_IWUSR);
441 perror("failed to create output file");
445 rec->output = output;
447 session = perf_session__new(output_name, O_WRONLY,
448 rec->write_mode == WRITE_FORCE, false, NULL);
449 if (session == NULL) {
450 pr_err("Not enough memory for reading perf file header\n");
454 rec->session = session;
/* Start with every header feature on, then clear the inapplicable ones. */
456 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
457 perf_header__set_feat(&session->header, feat);
460 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
462 if (!have_tracepoints(&evsel_list->entries))
463 perf_header__clear_feat(&session->header, HEADER_TRACE_INFO);
465 if (!rec->file_new) {
466 err = perf_session__read_header(session, output);
468 goto out_delete_session;
/* Fork the workload stopped; it is kicked off after setup below. */
472 err = perf_evlist__prepare_workload(evsel_list, opts, argv);
474 pr_err("Couldn't run the workload!\n");
475 goto out_delete_session;
479 perf_record__open(rec);
482 * perf_session__delete(session) will be called at perf_record__exit()
484 on_exit(perf_record__exit, rec);
486 if (opts->pipe_output) {
487 err = perf_header__write_pipe(output);
490 } else if (rec->file_new) {
491 err = perf_session__write_header(session, evsel_list,
498 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
499 pr_err("Couldn't generate buildids. "
500 "Use --no-buildid to profile anyway.\n");
/* Everything after this offset is event data (see process_buildids). */
504 rec->post_processing_offset = lseek(output, 0, SEEK_CUR);
506 machine = perf_session__find_host_machine(session);
508 pr_err("Couldn't find native kernel information.\n");
512 if (opts->pipe_output) {
513 err = perf_event__synthesize_attrs(tool, session,
514 process_synthesized_event);
516 pr_err("Couldn't synthesize attrs.\n");
520 err = perf_event__synthesize_event_types(tool, process_synthesized_event,
523 pr_err("Couldn't synthesize event_types.\n");
527 if (have_tracepoints(&evsel_list->entries)) {
529 * FIXME err <= 0 here actually means that
530 * there were no tracepoints so its not really
531 * an error, just that we don't need to
532 * synthesize anything. We really have to
533 * return this more properly and also
534 * propagate errors that now are calling die()
536 err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
537 process_synthesized_event);
539 pr_err("Couldn't record tracing data.\n");
/* Tracing data was written directly to 'output'; account for it. */
542 advance_output(rec, err);
546 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
/* Fallback path: retry with the alternate reference symbol. */
549 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
552 pr_err("Couldn't record kernel reference relocation symbol\n"
553 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
554 "Check /proc/kallsyms permission or run as root.\n");
556 err = perf_event__synthesize_modules(tool, process_synthesized_event,
559 pr_err("Couldn't record kernel module information.\n"
560 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
561 "Check /proc/modules permission or run as root.\n");
564 perf_session__process_machines(session, tool,
565 perf_event__synthesize_guest_os);
/* Targeted profiling: map only the chosen threads; else all of them. */
567 if (!opts->system_wide)
568 perf_event__synthesize_thread_map(tool, evsel_list->threads,
569 process_synthesized_event,
572 perf_event__synthesize_threads(tool, process_synthesized_event,
575 if (rec->realtime_prio) {
576 struct sched_param param;
578 param.sched_priority = rec->realtime_prio;
/* NOTE(review): '¶m' below is a mojibake of '&param' — fix encoding. */
579 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) {
580 pr_err("Could not set realtime priority.\n");
585 perf_evlist__enable(evsel_list);
591 perf_evlist__start_workload(evsel_list);
/* Main loop: drain mmaps; poll only when nothing new arrived. */
594 int hits = rec->samples;
596 perf_record__mmap_read_all(rec);
598 if (hits == rec->samples) {
601 err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
606 perf_evlist__disable(evsel_list);
609 if (quiet || signr == SIGUSR1)
612 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
615 * Approximate RIP event size: 24 bytes.
618 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
619 (double)rec->bytes_written / 1024.0 / 1024.0,
621 rec->bytes_written / 24);
626 perf_session__delete(session);
/* Usage strings printed by usage_with_options() in cmd_record(). */
630 static const char * const record_usage[] = {
631 "perf record [<options>] [<command>]",
632 "perf record [<options>] -- <command> [<options>]",
637 * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
638 * because we need to have access to it in perf_record__exit, that is called
639 * after cmd_record() exits, but since record_options need to be accessible to
640 * builtin-script, leave it here.
642 * At least we don't touch it in all the other functions here directly.
644 * Just say no to tons of global variables, sigh.
/*
 * The single perf_record instance.  Global (not local to cmd_record)
 * because the on_exit() handlers run after cmd_record() returns and
 * still need to reach it — see the XXX comment above.
 */
646 static struct perf_record record = {
/* UINT_MAX / ULLONG_MAX act as "unset" sentinels, resolved in cmd_record(). */
648 .mmap_pages = UINT_MAX,
649 .user_freq = UINT_MAX,
650 .user_interval = ULLONG_MAX,
652 .sample_id_all_avail = true,
654 .write_mode = WRITE_FORCE,
659 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
660 * with it and switch to use the library functions in perf_evlist that came
661 * from builtin-record.c, i.e. use perf_record_opts,
662 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
/*
 * Command-line option table for 'perf record'.  Deliberately non-static
 * because builtin-script.c reuses it — see the XXX comment above.
 */
665 const struct option record_options[] = {
666 OPT_CALLBACK('e', "event", &record.evlist, "event",
667 "event selector. use 'perf list' to list available events",
668 parse_events_option),
669 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
670 "event filter", parse_filter),
671 OPT_STRING('p', "pid", &record.opts.target_pid, "pid",
672 "record events on existing process id"),
673 OPT_STRING('t', "tid", &record.opts.target_tid, "tid",
674 "record events on existing thread id"),
675 OPT_INTEGER('r', "realtime", &record.realtime_prio,
676 "collect data with this RT SCHED_FIFO priority"),
677 OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
678 "collect data without buffering"),
679 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
680 "collect raw sample records from all opened counters"),
681 OPT_BOOLEAN('a', "all-cpus", &record.opts.system_wide,
682 "system-wide collection from all CPUs"),
683 OPT_BOOLEAN('A', "append", &record.append_file,
684 "append to the output file to do incremental profiling"),
685 OPT_STRING('C', "cpu", &record.opts.cpu_list, "cpu",
686 "list of cpus to monitor"),
687 OPT_BOOLEAN('f', "force", &record.force,
688 "overwrite existing data file (deprecated)"),
689 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
690 OPT_STRING('o', "output", &record.output_name, "file",
692 OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
693 "child tasks do not inherit counters"),
694 OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
695 OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
696 "number of mmap data pages"),
697 OPT_BOOLEAN(0, "group", &record.opts.group,
698 "put the counters into a counter group"),
699 OPT_BOOLEAN('g', "call-graph", &record.opts.call_graph,
700 "do call-graph (stack chain/backtrace) recording"),
701 OPT_INCR('v', "verbose", &verbose,
702 "be more verbose (show counter open errors, etc)"),
703 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
704 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
705 "per thread counts"),
706 OPT_BOOLEAN('d', "data", &record.opts.sample_address,
708 OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
709 OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
710 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
712 OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
713 "do not update the buildid cache"),
714 OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
715 "do not collect buildids in perf.data"),
716 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
717 "monitor event in cgroup name only",
719 OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
/*
 * Entry point for 'perf record': parse options, validate option
 * combinations, resolve the target (pid/tid/uid/cpus), build the
 * default event list if none was given, then hand off to
 * __cmd_record().
 * NOTE(review): several closing braces, 'else' arms and error paths
 * are elided in this chunk — verify against the full file.
 */
723 int cmd_record(int argc, const char **argv, const char *prefix __used)
726 struct perf_evsel *pos;
727 struct perf_evlist *evsel_list;
728 struct perf_record *rec = &record;
730 perf_header__set_cmdline(argc, argv);
732 evsel_list = perf_evlist__new(NULL, NULL);
733 if (evsel_list == NULL)
736 rec->evlist = evsel_list;
738 argc = parse_options(argc, argv, record_options, record_usage,
739 PARSE_OPT_STOP_AT_NON_OPTION);
/* Need at least one target: a command, pid, tid, uid, cpu list or -a. */
740 if (!argc && !rec->opts.target_pid && !rec->opts.target_tid &&
741 !rec->opts.system_wide && !rec->opts.cpu_list && !rec->uid_str)
742 usage_with_options(record_usage, record_options);
744 if (rec->force && rec->append_file) {
745 fprintf(stderr, "Can't overwrite and append at the same time."
746 " You need to choose between -f and -A");
747 usage_with_options(record_usage, record_options);
748 } else if (rec->append_file) {
749 rec->write_mode = WRITE_APPEND;
751 rec->write_mode = WRITE_FORCE;
754 if (nr_cgroups && !rec->opts.system_wide) {
755 fprintf(stderr, "cgroup monitoring only available in"
756 " system-wide mode\n");
757 usage_with_options(record_usage, record_options);
762 if (symbol_conf.kptr_restrict)
764 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
765 "check /proc/sys/kernel/kptr_restrict.\n\n"
766 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
767 "file is not found in the buildid cache or in the vmlinux path.\n\n"
768 "Samples in kernel modules won't be resolved at all.\n\n"
769 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
770 "even with a suitable vmlinux or kallsyms file.\n\n"
772 if (rec->no_buildid_cache || rec->no_buildid)
773 disable_buildid_cache();
/* No -e given: fall back to the default event (cycles). */
775 if (evsel_list->nr_entries == 0 &&
776 perf_evlist__add_default(evsel_list) < 0) {
777 pr_err("Not enough memory for event selector list\n");
778 goto out_symbol_exit;
781 rec->opts.uid = parse_target_uid(rec->uid_str, rec->opts.target_tid,
782 rec->opts.target_pid);
/* UINT_MAX - 1 is parse_target_uid()'s error sentinel. */
783 if (rec->uid_str != NULL && rec->opts.uid == UINT_MAX - 1)
786 if (rec->opts.target_pid)
787 rec->opts.target_tid = rec->opts.target_pid;
789 if (perf_evlist__create_maps(evsel_list, rec->opts.target_pid,
790 rec->opts.target_tid, rec->opts.uid,
791 rec->opts.cpu_list) < 0)
792 usage_with_options(record_usage, record_options);
794 list_for_each_entry(pos, &evsel_list->entries, node) {
795 if (perf_header__push_event(pos->attr.config, event_name(pos)))
/* Explicit -c / -F override the defaults (sentinels set in 'record'). */
799 if (rec->opts.user_interval != ULLONG_MAX)
800 rec->opts.default_interval = rec->opts.user_interval;
801 if (rec->opts.user_freq != UINT_MAX)
802 rec->opts.freq = rec->opts.user_freq;
805 * User specified count overrides default frequency.
807 if (rec->opts.default_interval)
809 else if (rec->opts.freq) {
810 rec->opts.default_interval = rec->opts.freq;
812 fprintf(stderr, "frequency and count are zero, aborting\n");
817 err = __cmd_record(&record, argc, argv);
819 perf_evlist__delete_maps(evsel_list);