4 * Builtin record command: Record the profile of a workload
5 * (or a CPU, or a PID) into the perf.data output file - for
6 * later analysis via perf report.
8 #define _FILE_OFFSET_BITS 64
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
40 struct perf_tool tool;
41 struct perf_record_opts opts;
43 const char *output_name;
44 struct perf_evlist *evlist;
45 struct perf_session *session;
49 unsigned int page_size;
51 enum write_mode_t write_mode;
53 bool no_buildid_cache;
58 off_t post_processing_offset;
61 static void advance_output(struct perf_record *rec, size_t size)
63 rec->bytes_written += size;
66 static void write_output(struct perf_record *rec, void *buf, size_t size)
69 int ret = write(rec->output, buf, size);
72 die("failed to write");
77 rec->bytes_written += ret;
81 static int process_synthesized_event(struct perf_tool *tool,
82 union perf_event *event,
83 struct perf_sample *sample __used,
84 struct machine *machine __used)
86 struct perf_record *rec = container_of(tool, struct perf_record, tool);
87 write_output(rec, event, event->header.size);
91 static void perf_record__mmap_read(struct perf_record *rec,
94 unsigned int head = perf_mmap__read_head(md);
95 unsigned int old = md->prev;
96 unsigned char *data = md->base + rec->page_size;
107 if ((old & md->mask) + size != (head & md->mask)) {
108 buf = &data[old & md->mask];
109 size = md->mask + 1 - (old & md->mask);
112 write_output(rec, buf, size);
115 buf = &data[old & md->mask];
119 write_output(rec, buf, size);
122 perf_mmap__write_tail(md, old);
125 static volatile int done = 0;
126 static volatile int signr = -1;
127 static volatile int child_finished = 0;
129 static void sig_handler(int sig)
138 static void perf_record__sig_exit(int exit_status __used, void *arg)
140 struct perf_record *rec = arg;
143 if (rec->evlist->workload.pid > 0) {
145 kill(rec->evlist->workload.pid, SIGTERM);
148 if (WIFSIGNALED(status))
149 psignal(WTERMSIG(status), rec->progname);
152 if (signr == -1 || signr == SIGUSR1)
155 signal(signr, SIG_DFL);
156 kill(getpid(), signr);
159 static bool perf_evlist__equal(struct perf_evlist *evlist,
160 struct perf_evlist *other)
162 struct perf_evsel *pos, *pair;
164 if (evlist->nr_entries != other->nr_entries)
167 pair = list_entry(other->entries.next, struct perf_evsel, node);
169 list_for_each_entry(pos, &evlist->entries, node) {
170 if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
172 pair = list_entry(pair->node.next, struct perf_evsel, node);
178 static void perf_record__open(struct perf_record *rec)
180 struct perf_evsel *pos, *first;
181 struct perf_evlist *evlist = rec->evlist;
182 struct perf_session *session = rec->session;
183 struct perf_record_opts *opts = &rec->opts;
185 first = list_entry(evlist->entries.next, struct perf_evsel, node);
187 perf_evlist__config_attrs(evlist, opts);
189 list_for_each_entry(pos, &evlist->entries, node) {
190 struct perf_event_attr *attr = &pos->attr;
191 struct xyarray *group_fd = NULL;
193 * Check if parse_single_tracepoint_event has already asked for
196 * XXX this is kludgy but short term fix for problems introduced by
197 * eac23d1c that broke 'perf script' by having different sample_types
198 * when using multiple tracepoint events when we use a perf binary
199 * that tries to use sample_id_all on an older kernel.
201 * We need to move counter creation to perf_session, support
202 * different sample_types, etc.
204 bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
206 if (opts->group && pos != first)
207 group_fd = first->fd;
208 fallback_missing_features:
209 if (opts->exclude_guest_missing)
210 attr->exclude_guest = attr->exclude_host = 0;
212 attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0;
214 if (perf_evsel__open(pos, evlist->cpus, evlist->threads,
215 opts->group, group_fd) < 0) {
218 if (err == EPERM || err == EACCES) {
219 ui__error_paranoid();
221 } else if (err == ENODEV && opts->cpu_list) {
222 die("No such device - did you specify"
223 " an out-of-range profile CPU?\n");
224 } else if (err == EINVAL) {
225 if (!opts->exclude_guest_missing &&
226 (attr->exclude_guest || attr->exclude_host)) {
227 pr_debug("Old kernel, cannot exclude "
228 "guest or host samples.\n");
229 opts->exclude_guest_missing = true;
230 goto fallback_missing_features;
231 } else if (opts->sample_id_all_avail) {
233 * Old kernel, no attr->sample_id_all field
235 opts->sample_id_all_avail = false;
236 if (!opts->sample_time && !opts->raw_samples && !time_needed)
237 attr->sample_type &= ~PERF_SAMPLE_TIME;
239 goto retry_sample_id;
244 * If it's cycles then fall back to hrtimer
245 * based cpu-clock-tick sw counter, which
246 * is always available even if no PMU support:
248 if (attr->type == PERF_TYPE_HARDWARE
249 && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
252 ui__warning("The cycles event is not supported, "
253 "trying to fall back to cpu-clock-ticks\n");
254 attr->type = PERF_TYPE_SOFTWARE;
255 attr->config = PERF_COUNT_SW_CPU_CLOCK;
260 ui__warning("The %s event is not supported.\n",
266 error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
269 #if defined(__i386__) || defined(__x86_64__)
270 if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
271 die("No hardware sampling interrupt available."
272 " No APIC? If so then you can boot the kernel"
273 " with the \"lapic\" boot parameter to"
274 " force-enable it.\n");
277 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
281 if (perf_evlist__set_filters(evlist)) {
282 error("failed to set filter with %d (%s)\n", errno,
287 if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
289 die("Permission error mapping pages.\n"
290 "Consider increasing "
291 "/proc/sys/kernel/perf_event_mlock_kb,\n"
292 "or try again with a smaller value of -m/--mmap_pages.\n"
293 "(current value: %d)\n", opts->mmap_pages);
294 else if (!is_power_of_2(opts->mmap_pages))
295 die("--mmap_pages/-m value must be a power of two.");
297 die("failed to mmap with %d (%s)\n", errno, strerror(errno));
301 session->evlist = evlist;
303 if (!perf_evlist__equal(session->evlist, evlist)) {
304 fprintf(stderr, "incompatible append\n");
309 perf_session__update_sample_type(session);
312 static int process_buildids(struct perf_record *rec)
314 u64 size = lseek(rec->output, 0, SEEK_CUR);
319 rec->session->fd = rec->output;
320 return __perf_session__process_events(rec->session, rec->post_processing_offset,
321 size - rec->post_processing_offset,
322 size, &build_id__mark_dso_hit_ops);
325 static void perf_record__exit(int status __used, void *arg)
327 struct perf_record *rec = arg;
329 if (!rec->opts.pipe_output) {
330 rec->session->header.data_size += rec->bytes_written;
332 if (!rec->no_buildid)
333 process_buildids(rec);
334 perf_session__write_header(rec->session, rec->evlist,
336 perf_session__delete(rec->session);
337 perf_evlist__delete(rec->evlist);
342 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
345 struct perf_tool *tool = data;
347 if (machine__is_host(machine))
351 *As for guest kernel when processing subcommand record&report,
352 *we arrange module mmap prior to guest kernel mmap and trigger
353 *a preload dso because default guest module symbols are loaded
354 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
355 *method is used to avoid symbol missing when the first addr is
356 *in module instead of in guest kernel.
358 err = perf_event__synthesize_modules(tool, process_synthesized_event,
361 pr_err("Couldn't record guest kernel [%d]'s reference"
362 " relocation symbol.\n", machine->pid);
365 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
366 * have no _text sometimes.
368 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
371 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
374 pr_err("Couldn't record guest kernel [%d]'s reference"
375 " relocation symbol.\n", machine->pid);
378 static struct perf_event_header finished_round_event = {
379 .size = sizeof(struct perf_event_header),
380 .type = PERF_RECORD_FINISHED_ROUND,
383 static void perf_record__mmap_read_all(struct perf_record *rec)
387 for (i = 0; i < rec->evlist->nr_mmaps; i++) {
388 if (rec->evlist->mmap[i].base)
389 perf_record__mmap_read(rec, &rec->evlist->mmap[i]);
392 if (perf_header__has_feat(&rec->session->header, HEADER_TRACE_INFO))
393 write_output(rec, &finished_round_event, sizeof(finished_round_event));
396 static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
400 int err, output, feat;
401 unsigned long waking = 0;
402 const bool forks = argc > 0;
403 struct machine *machine;
404 struct perf_tool *tool = &rec->tool;
405 struct perf_record_opts *opts = &rec->opts;
406 struct perf_evlist *evsel_list = rec->evlist;
407 const char *output_name = rec->output_name;
408 struct perf_session *session;
410 rec->progname = argv[0];
412 rec->page_size = sysconf(_SC_PAGE_SIZE);
414 on_exit(perf_record__sig_exit, rec);
415 signal(SIGCHLD, sig_handler);
416 signal(SIGINT, sig_handler);
417 signal(SIGUSR1, sig_handler);
420 if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
421 opts->pipe_output = true;
423 rec->output_name = output_name = "perf.data";
426 if (!strcmp(output_name, "-"))
427 opts->pipe_output = true;
428 else if (!stat(output_name, &st) && st.st_size) {
429 if (rec->write_mode == WRITE_FORCE) {
430 char oldname[PATH_MAX];
431 snprintf(oldname, sizeof(oldname), "%s.old",
434 rename(output_name, oldname);
436 } else if (rec->write_mode == WRITE_APPEND) {
437 rec->write_mode = WRITE_FORCE;
441 flags = O_CREAT|O_RDWR;
442 if (rec->write_mode == WRITE_APPEND)
447 if (opts->pipe_output)
448 output = STDOUT_FILENO;
450 output = open(output_name, flags, S_IRUSR | S_IWUSR);
452 perror("failed to create output file");
456 rec->output = output;
458 session = perf_session__new(output_name, O_WRONLY,
459 rec->write_mode == WRITE_FORCE, false, NULL);
460 if (session == NULL) {
461 pr_err("Not enough memory for reading perf file header\n");
465 rec->session = session;
467 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
468 perf_header__set_feat(&session->header, feat);
471 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
473 if (!have_tracepoints(&evsel_list->entries))
474 perf_header__clear_feat(&session->header, HEADER_TRACE_INFO);
476 if (!rec->file_new) {
477 err = perf_session__read_header(session, output);
479 goto out_delete_session;
483 err = perf_evlist__prepare_workload(evsel_list, opts, argv);
485 pr_err("Couldn't run the workload!\n");
486 goto out_delete_session;
490 perf_record__open(rec);
493 * perf_session__delete(session) will be called at perf_record__exit()
495 on_exit(perf_record__exit, rec);
497 if (opts->pipe_output) {
498 err = perf_header__write_pipe(output);
501 } else if (rec->file_new) {
502 err = perf_session__write_header(session, evsel_list,
509 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
510 pr_err("Couldn't generate buildids. "
511 "Use --no-buildid to profile anyway.\n");
515 rec->post_processing_offset = lseek(output, 0, SEEK_CUR);
517 machine = perf_session__find_host_machine(session);
519 pr_err("Couldn't find native kernel information.\n");
523 if (opts->pipe_output) {
524 err = perf_event__synthesize_attrs(tool, session,
525 process_synthesized_event);
527 pr_err("Couldn't synthesize attrs.\n");
531 err = perf_event__synthesize_event_types(tool, process_synthesized_event,
534 pr_err("Couldn't synthesize event_types.\n");
538 if (have_tracepoints(&evsel_list->entries)) {
540 * FIXME err <= 0 here actually means that
541 * there were no tracepoints so it's not really
542 * an error, just that we don't need to
543 * synthesize anything. We really have to
544 * return this more properly and also
545 * propagate errors that now are calling die()
547 err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
548 process_synthesized_event);
550 pr_err("Couldn't record tracing data.\n");
553 advance_output(rec, err);
557 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
560 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
563 pr_err("Couldn't record kernel reference relocation symbol\n"
564 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
565 "Check /proc/kallsyms permission or run as root.\n");
567 err = perf_event__synthesize_modules(tool, process_synthesized_event,
570 pr_err("Couldn't record kernel module information.\n"
571 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
572 "Check /proc/modules permission or run as root.\n");
575 perf_session__process_machines(session, tool,
576 perf_event__synthesize_guest_os);
578 if (!opts->system_wide)
579 perf_event__synthesize_thread_map(tool, evsel_list->threads,
580 process_synthesized_event,
583 perf_event__synthesize_threads(tool, process_synthesized_event,
586 if (rec->realtime_prio) {
587 struct sched_param param;
589 param.sched_priority = rec->realtime_prio;
590 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
591 pr_err("Could not set realtime priority.\n");
596 perf_evlist__enable(evsel_list);
602 perf_evlist__start_workload(evsel_list);
605 int hits = rec->samples;
607 perf_record__mmap_read_all(rec);
609 if (hits == rec->samples) {
612 err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
617 perf_evlist__disable(evsel_list);
620 if (quiet || signr == SIGUSR1)
623 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
626 * Approximate RIP event size: 24 bytes.
629 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
630 (double)rec->bytes_written / 1024.0 / 1024.0,
632 rec->bytes_written / 24);
637 perf_session__delete(session);
641 static const char * const record_usage[] = {
642 "perf record [<options>] [<command>]",
643 "perf record [<options>] -- <command> [<options>]",
648 * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
649 * because we need to have access to it in perf_record__exit, that is called
650 * after cmd_record() exits, but since record_options need to be accessible to
651 * builtin-script, leave it here.
653 * At least we don't touch it in all the other functions here directly.
655 * Just say no to tons of global variables, sigh.
657 static struct perf_record record = {
659 .mmap_pages = UINT_MAX,
660 .user_freq = UINT_MAX,
661 .user_interval = ULLONG_MAX,
663 .sample_id_all_avail = true,
665 .write_mode = WRITE_FORCE,
670 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
671 * with it and switch to use the library functions in perf_evlist that came
672 * from builtin-record.c, i.e. use perf_record_opts,
673 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
676 const struct option record_options[] = {
677 OPT_CALLBACK('e', "event", &record.evlist, "event",
678 "event selector. use 'perf list' to list available events",
679 parse_events_option),
680 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
681 "event filter", parse_filter),
682 OPT_STRING('p', "pid", &record.opts.target_pid, "pid",
683 "record events on existing process id"),
684 OPT_STRING('t', "tid", &record.opts.target_tid, "tid",
685 "record events on existing thread id"),
686 OPT_INTEGER('r', "realtime", &record.realtime_prio,
687 "collect data with this RT SCHED_FIFO priority"),
688 OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
689 "collect data without buffering"),
690 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
691 "collect raw sample records from all opened counters"),
692 OPT_BOOLEAN('a', "all-cpus", &record.opts.system_wide,
693 "system-wide collection from all CPUs"),
694 OPT_BOOLEAN('A', "append", &record.append_file,
695 "append to the output file to do incremental profiling"),
696 OPT_STRING('C', "cpu", &record.opts.cpu_list, "cpu",
697 "list of cpus to monitor"),
698 OPT_BOOLEAN('f', "force", &record.force,
699 "overwrite existing data file (deprecated)"),
700 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
701 OPT_STRING('o', "output", &record.output_name, "file",
703 OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
704 "child tasks do not inherit counters"),
705 OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
706 OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
707 "number of mmap data pages"),
708 OPT_BOOLEAN(0, "group", &record.opts.group,
709 "put the counters into a counter group"),
710 OPT_BOOLEAN('g', "call-graph", &record.opts.call_graph,
711 "do call-graph (stack chain/backtrace) recording"),
712 OPT_INCR('v', "verbose", &verbose,
713 "be more verbose (show counter open errors, etc)"),
714 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
715 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
716 "per thread counts"),
717 OPT_BOOLEAN('d', "data", &record.opts.sample_address,
719 OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
720 OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
721 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
723 OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
724 "do not update the buildid cache"),
725 OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
726 "do not collect buildids in perf.data"),
727 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
728 "monitor event in cgroup name only",
730 OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
734 int cmd_record(int argc, const char **argv, const char *prefix __used)
737 struct perf_evsel *pos;
738 struct perf_evlist *evsel_list;
739 struct perf_record *rec = &record;
741 perf_header__set_cmdline(argc, argv);
743 evsel_list = perf_evlist__new(NULL, NULL);
744 if (evsel_list == NULL)
747 rec->evlist = evsel_list;
749 argc = parse_options(argc, argv, record_options, record_usage,
750 PARSE_OPT_STOP_AT_NON_OPTION);
751 if (!argc && !rec->opts.target_pid && !rec->opts.target_tid &&
752 !rec->opts.system_wide && !rec->opts.cpu_list && !rec->uid_str)
753 usage_with_options(record_usage, record_options);
755 if (rec->force && rec->append_file) {
756 fprintf(stderr, "Can't overwrite and append at the same time."
757 " You need to choose between -f and -A");
758 usage_with_options(record_usage, record_options);
759 } else if (rec->append_file) {
760 rec->write_mode = WRITE_APPEND;
762 rec->write_mode = WRITE_FORCE;
765 if (nr_cgroups && !rec->opts.system_wide) {
766 fprintf(stderr, "cgroup monitoring only available in"
767 " system-wide mode\n");
768 usage_with_options(record_usage, record_options);
773 if (symbol_conf.kptr_restrict)
775 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
776 "check /proc/sys/kernel/kptr_restrict.\n\n"
777 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
778 "file is not found in the buildid cache or in the vmlinux path.\n\n"
779 "Samples in kernel modules won't be resolved at all.\n\n"
780 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
781 "even with a suitable vmlinux or kallsyms file.\n\n");
783 if (rec->no_buildid_cache || rec->no_buildid)
784 disable_buildid_cache();
786 if (evsel_list->nr_entries == 0 &&
787 perf_evlist__add_default(evsel_list) < 0) {
788 pr_err("Not enough memory for event selector list\n");
789 goto out_symbol_exit;
792 rec->opts.uid = parse_target_uid(rec->uid_str, rec->opts.target_tid,
793 rec->opts.target_pid);
794 if (rec->uid_str != NULL && rec->opts.uid == UINT_MAX - 1)
797 if (rec->opts.target_pid)
798 rec->opts.target_tid = rec->opts.target_pid;
800 if (perf_evlist__create_maps(evsel_list, rec->opts.target_pid,
801 rec->opts.target_tid, rec->opts.uid,
802 rec->opts.cpu_list) < 0)
803 usage_with_options(record_usage, record_options);
805 list_for_each_entry(pos, &evsel_list->entries, node) {
806 if (perf_header__push_event(pos->attr.config, event_name(pos)))
810 if (rec->opts.user_interval != ULLONG_MAX)
811 rec->opts.default_interval = rec->opts.user_interval;
812 if (rec->opts.user_freq != UINT_MAX)
813 rec->opts.freq = rec->opts.user_freq;
816 * User specified count overrides default frequency.
818 if (rec->opts.default_interval)
820 else if (rec->opts.freq) {
821 rec->opts.default_interval = rec->opts.freq;
823 fprintf(stderr, "frequency and count are zero, aborting\n");
828 err = __cmd_record(&record, argc, argv);
830 perf_evlist__delete_maps(evsel_list);