perf tools: Speed up thread map generation
author Don Zickus <dzickus@redhat.com>
Fri, 14 Mar 2014 14:43:44 +0000 (10:43 -0400)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Fri, 14 Mar 2014 21:08:41 +0000 (18:08 -0300)
When trying to capture perf data on a system running SPECjbb2013, perf
hung for about 15 minutes.  This is because it took that long to gather
about 10,000 thread maps and process them.

I don't think a user wants to wait that long.

Instead, recognize that thread maps are roughly equivalent to pid maps
and just quickly copy those instead.

To do this, I synthesize 'fork' events; processing them eventually calls
thread__fork(), which copies the maps over.

The overhead goes from 15 minutes down to a few seconds.

--
V2: based on Jiri's comments, moved malloc up a level
    and made sure the memory was freed

Signed-off-by: Don Zickus <dzickus@redhat.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Joe Mario <jmario@redhat.com>
Link: http://lkml.kernel.org/r/1394808224-113774-1-git-send-email-dzickus@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/util/event.c

index 55eebe9365134491a857cd76340010326e4fa7fe..3e580be0f6fb9a5a9111f57bc5b3cf67b13f9e4e 100644 (file)
@@ -129,6 +129,28 @@ out:
        return tgid;
 }
 
+static int perf_event__synthesize_fork(struct perf_tool *tool,
+                                      union perf_event *event, pid_t pid,
+                                      pid_t tgid, perf_event__handler_t process,
+                                      struct machine *machine)
+{
+       memset(&event->fork, 0, sizeof(event->fork) + machine->id_hdr_size); /* clear the record plus the id_hdr_size bytes after it */
+
+       /* Build a PERF_RECORD_FORK for thread 'pid' of group 'tgid' and pass it to process(); this is really a clone event but we use fork to synthesize it */
+       event->fork.ppid = tgid; /* parent pid and parent tid are both the group leader */
+       event->fork.ptid = tgid;
+       event->fork.pid  = tgid;
+       event->fork.tid  = pid; /* the 'pid' argument is stored as the thread id */
+       event->fork.header.type = PERF_RECORD_FORK;
+
+       event->fork.header.size = (sizeof(event->fork) + machine->id_hdr_size); /* same byte count that was zeroed above */
+
+       if (process(tool, event, &synth_sample, machine) != 0) /* any non-zero handler result is reported as -1 */
+               return -1;
+
+       return 0;
+}
+
 int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                                       union perf_event *event,
                                       pid_t pid, pid_t tgid,
@@ -278,6 +300,7 @@ int perf_event__synthesize_modules(struct perf_tool *tool,
 
 static int __event__synthesize_thread(union perf_event *comm_event,
                                      union perf_event *mmap_event,
+                                     union perf_event *fork_event,
                                      pid_t pid, int full,
                                          perf_event__handler_t process,
                                      struct perf_tool *tool,
@@ -326,9 +349,15 @@ static int __event__synthesize_thread(union perf_event *comm_event,
                if (tgid == -1)
                        return -1;
 
-               /* process the thread's maps too */
-               rc = perf_event__synthesize_mmap_events(tool, mmap_event, _pid, tgid,
-                                                       process, machine, mmap_data);
+               if (_pid == pid) {
+                       /* process the parent's maps too */
+                       rc = perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
+                                               process, machine, mmap_data);
+               } else {
+                       /* only fork the tid's map, to save time */
+                       rc = perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
+                                                process, machine);
+               }
 
                if (rc)
                        return rc;
@@ -344,7 +373,7 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
                                      struct machine *machine,
                                      bool mmap_data)
 {
-       union perf_event *comm_event, *mmap_event;
+       union perf_event *comm_event, *mmap_event, *fork_event;
        int err = -1, thread, j;
 
        comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
@@ -355,9 +384,14 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
        if (mmap_event == NULL)
                goto out_free_comm;
 
+       fork_event = malloc(sizeof(fork_event->fork) + machine->id_hdr_size);
+       if (fork_event == NULL)
+               goto out_free_mmap;
+
        err = 0;
        for (thread = 0; thread < threads->nr; ++thread) {
                if (__event__synthesize_thread(comm_event, mmap_event,
+                                              fork_event,
                                               threads->map[thread], 0,
                                               process, tool, machine,
                                               mmap_data)) {
@@ -383,6 +417,7 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
                        /* if not, generate events for it */
                        if (need_leader &&
                            __event__synthesize_thread(comm_event, mmap_event,
+                                                      fork_event,
                                                       comm_event->comm.pid, 0,
                                                       process, tool, machine,
                                                       mmap_data)) {
@@ -391,6 +426,8 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
                        }
                }
        }
+       free(fork_event);
+out_free_mmap:
        free(mmap_event);
 out_free_comm:
        free(comm_event);
@@ -405,7 +442,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
        DIR *proc;
        char proc_path[PATH_MAX];
        struct dirent dirent, *next;
-       union perf_event *comm_event, *mmap_event;
+       union perf_event *comm_event, *mmap_event, *fork_event;
        int err = -1;
 
        comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
@@ -416,6 +453,10 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
        if (mmap_event == NULL)
                goto out_free_comm;
 
+       fork_event = malloc(sizeof(fork_event->fork) + machine->id_hdr_size);
+       if (fork_event == NULL)
+               goto out_free_mmap;
+
        if (machine__is_default_guest(machine))
                return 0;
 
@@ -423,7 +464,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
        proc = opendir(proc_path);
 
        if (proc == NULL)
-               goto out_free_mmap;
+               goto out_free_fork;
 
        while (!readdir_r(proc, &dirent, &next) && next) {
                char *end;
@@ -435,12 +476,14 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
                 * We may race with exiting thread, so don't stop just because
                 * one thread couldn't be synthesized.
                 */
-               __event__synthesize_thread(comm_event, mmap_event, pid, 1,
-                                          process, tool, machine, mmap_data);
+               __event__synthesize_thread(comm_event, mmap_event, fork_event, pid,
+                                          1, process, tool, machine, mmap_data);
        }
 
        err = 0;
        closedir(proc);
+out_free_fork:
+       free(fork_event);
 out_free_mmap:
        free(mmap_event);
 out_free_comm: