perf tool source code analysis: perf record

The perf tool in the Linux kernel is used to analyze various kinds of performance issues. More information can be found here, but this article walks through the typical calling sequence of the built-in core function “cmd_record”. The source code of perf can be found in linux/tools/perf/perf.c. Now let’s begin with the function main(). You could read this article for a quick review. Don’t panic!
Take perf record -a sleep 3 for example.
Initialization:
1. main()->run_argv()->handle_internal_command()->run_builtin()->
status = p->fn(argc, argv, prefix)->cmd_record()
build a new record struct rec with struct record *rec = &record;
/*record is initialized with the following code*/
static struct record record = {
    .opts = {
        …
    },
    .tool = {
        …
    },
};
Feel free to skip these data structures for now, but don’t hesitate to come back and look up the variables later.
struct perf_evlist {
    struct list_head entries;
    struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];//1<<8
    struct fdarray     pollfd;
    struct thread_map *threads;
    struct cpu_map      *cpus;
    struct perf_evsel *selected;

};
Note: perf_evsel stands for one event, and perf_evlist stands for all events we selected, which perf uses to communicate with kernel.
2. rec->evlist = perf_evlist__new();
struct perf_evlist *evlist = zalloc(sizeof(*evlist));
perf_evlist__init(evlist, NULL, NULL);
2.1 init all 256 struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]  to NULL
2.2 init struct list_head entries
2.3 perf_evlist__set_maps(evlist, NULL, NULL)
  • set evlist->cpus and evlist->threads to NULL
  • perf_evlist__propagate_maps
    1. struct perf_evsel *evsel;
    2. evlist__for_each(evlist, evsel) {
      set all evsel->cpus and evsel->threads in evlist to NULL;
      }
      # __evlist__for_each(&(evlist)->entries, evsel)
      # list_for_each_entry(evsel, &(evlist)->entries, node)
      # struct list_head entries; entries belongs to struct evlist
      # struct list_head node; node belongs to struct evsel,
      # we use node to insert evsel into evlist->entries or in other list_heads
      # #define list_for_each_entry(evsel, list, node)
      # for (evsel = list_first_entry(list, typeof(*evsel), node);
      #      &evsel->node != (list);
      #      evsel = list_next_entry(evsel, node))
      # evlist and evsel are connected through a doubly linked list: struct list_head entries in evlist and node in evsel. We’ll talk about it later.
  • fdarray__init(&evlist->pollfd, 64);
    # set fdarray.nr_autogrow to 64 and others to 0 or NULL;

3. perf_evlist__add_default(rec->evlist);
    struct perf_event_attr attr = {
        .type = PERF_TYPE_HARDWARE,
        .config = PERF_COUNT_HW_CPU_CYCLES,
    };
add a new evsel named “cycles” to evlist
3.1 evsel = perf_evsel__new(&attr);
3.2 perf_evlist__add(evlist, evsel)

  • entry->evlist = evlist; # note: record the owning evlist in the evsel for further usage
  • list_add_tail(&entry->node, &evlist->entries);
    # note: list_head is a doubly linked list which is quite commonly used in the kernel.
4. target__parse_uid(&rec->opts.target)
target->uid = UINT_MAX;
5. perf_evlist__create_maps(rec->evlist, &rec->opts.target)
target = {pid = 0x0, tid = 0x0, cpu_list = 0x0, uid_str = 0x0, uid = 4294967295, system_wide = true, uses_mmap = true, default_per_cpu = true, per_thread = false}
5.1 evlist->threads = thread_map__new_str(target->pid, target->tid, target->uid)
  • thread_map__new_str(NULL, NULL, UINT_MAX)
  • thread_map__new_by_tid_str(NULL);
  • thread_map__new_dummy();
  • thread_map__alloc(1);
  • thread_map__realloc(NULL, __nr);

# allocate sizeof(struct thread_map) + sizeof(struct thread_map_data) for thread_map
5.2 cpu_map__new(target->cpu_list);

  • cpu_map__read_all_cpu_map()
    1. fp = open(/sys/devices/system/cpu/online, r)
    2. struct cpu_map *cpus = cpu_map__read(fp);
      # read online cpus and allocate an integer array for them. (tmp_cpus[0,1,2,3,…])
    3. cpu_map__trim_new(nr_cpus, tmp_cpus)
      struct cpu_map *cpus = malloc(sizeof(*cpus) + sizeof(int)*nr_cpus);
      memcpy(cpus->map, tmp_cpus, sizeof(int)*nr_cpus);
      # copy the content of online cpus directly to cpus->map.

    At this point, the cpu_map returned by cpu_map__new() holds the online cpu information (target->cpu_list itself is still NULL).
    Like: for (i = 0; i < nr_cpus; ++i) cpus->map[i] = i;

5.3 perf_evlist__propagate_maps(evlist, !!target->cpu_list)
ensure each evsel in evlist get the right cpu_list
5.4 record_opts__config(&rec->opts)
set recording frequency for perf
CORE FUNCTION __cmd_record(&record, argc, argv)
struct machine {
    char          *root_dir;
    struct dsos      dsos;
    struct map_groups kmaps;
    struct map      *vmlinux_maps[MAP__NR_TYPES];
    u64          kernel_start;
    symbol_filter_t      symbol_filter;
    …
};
struct map_groups {
    struct maps     maps[MAP__NR_TYPES];
    struct machine     *machine;
atomic_t     refcnt;
};
1. perf_session__new(struct perf_data_file *file, bool repipe, struct perf_tool *tool)
perf record calls this to open the session in write mode (perf report calls it in read mode).
perf_session__new(&rec->file, false, &rec->tool);
1.1 machines__init(&session->machines);
machine__init((struct machine *)&machines->host, “”, HOST_KERNEL_ID);
  • map_groups__init((struct map_groups *)&machine->kmaps, machine);
    maps__init((struct maps *)&mg->maps[i]);
  • dsos__init(&machine->dsos)

1.2 ordered_events__init(&session->ordered_events, ordered_events__deliver_event);
init ordered_events
1.3 perf_data_file__open(file)
by default open the output file as “perf.data” with create mode
1.4 perf_session__create_kernel_maps(session)
machine__create_kernel_maps(&session->machines.host)
allocate memory for the initialized session->machines.machine

  • struct dso *kernel = machine__get_kernel(machine);
    • kernel = machine__findnew_kernel(machine, vmlinux_name, “[kernel]”, DSO_TYPE_KERNEL);
      machine__findnew_kernel(machine, “[kernel.kallsyms]”, “[kernel]”, 1);
      struct dso *dso = machine__findnew_dso(machine, “[kernel.kallsyms]”);
      dsos__findnew((struct dsos *)&machine->dsos, “[kernel.kallsyms]”)
      find kernel with name “[kernel.kallsyms]”

      dso__get(__dsos__findnew(dsos, name));->
      __dsos__findnew(dsos, name)->
      __dsos__find(dsos, name, false);->

      __dso__find_by_longname(&dsos->root, name);->
      Make a dynamic shared object with “[kernel.kallsyms]” then insert it into &machine->dsos
      __dsos__addnew(dsos, name);->
      dso__new(name);->
      __dsos__add(dsos, dso);

    • dso__read_running_kernel_build_id(kernel, machine);
      Read build_id from /sys/kernel/notes: the note whose n_type=3 (NT_GNU_BUILD_ID) and n_namesz=4 (sizeof(“GNU”), including the terminating NUL)
      sysfs__read_build_id(path, dso->build_id, sizeof(dso->build_id))
      read_build_id(void *buf, buf_size,dso->build_id, sizeof(dso->build_id), false); // buf_size (stbuf.st_size of /sys/kernel/notes) typical value: 360B
      //Symbol-minimal.c
      // File note is made of series of structure like
      struct {
           u32 n_namesz;
           u32 n_descsz;
           u32 n_type;
      } *nhdr;
      if (nhdr->n_type == NT_GNU_BUILD_ID &&
      nhdr->n_namesz == sizeof(“GNU”))
      The n_namesz bytes that follow the header hold the note’s name (here “GNU”), and the n_descsz bytes after the name hold the descriptor (here the build-id). See the code here for the whole content of the notes file.
      Copy the very desc to dso->build_id.
      Set dso->has_build_id = true.
  • machine__get_running_kernel_start(machine, &name);
    Figure out the start address of _text or _stext in /proc/kallsyms
    addr = kallsyms__get_function_start(filename, name); //filename = “/proc/kallsyms”
    kallsyms__parse(kallsyms_filename, &args, find_symbol_cb)
    Content of /proc/kallsyms filled with lines like 00000000 t fuse_async_req_send
    Read each line until the line for _text or _stext is found (e.g. c1000000 T _text), parse its address (here probably 0xc1000000) and return it as the start address (3238002688 in decimal).
  • __machine__create_kernel_maps(machine, kernel)
    start = machine__get_running_kernel_start(machine, NULL);// do it again
    Create struct map *map for all MAP__NR_TYPES for passed in structure machine and allocate sizeof(struct map) + sizeof(struct kmap) for each of them.
    for type from 0 to MAP__NR_TYPES
    machine->vmlinux_maps[type] = map__new2(start, kernel, type);
    map__new2(start, kernel, type);->
    map__init(map, type, start, 0, 0, dso);
    void map__init(struct map *map, enum map_type type, u64 start, u64 end, u64 pgoff,
    struct dso *dso)
    kmap = map__kmap(machine->vmlinux_maps[type]);

    struct kmap *kmap = (struct kmap *)(map + 1)
    Set the kmap structure points to machine->kmaps.
    kmap->kmaps = &machine->kmaps;
    MAP__NR_TYPES struct map vmlinux_maps
    |0 | 1|  2 | 3 | …. | MAP__NR_TYPES-1|
    struct map *map; struct kmap *kmap;
    kmap->kmaps = &machine->kmaps;
    map_groups__insert(&machine->kmaps, machine->vmlinux_maps[type]);
    __maps__insert(struct maps *maps, struct map *map)
    Insert the newly allocated map in machine->vmlinux_maps[type] into kmaps in machine.
    note:

    struct machine {
        struct dsos      dsos;
        struct map_groups kmaps;
        struct map      *vmlinux_maps[MAP__NR_TYPES];
        u64          kernel_start;

    };
    struct map {
    struct dso        *dso;
    struct map_groups    *groups;
    ….
    };


    Now there is extra space for a struct kmap right after each struct map in machine, in which kmaps (of type struct map_groups *) points to the kmaps member (of type struct map_groups) of machine itself. Other things to note:
    1. the struct map_groups *groups in the former struct map is initialized to NULL and then set to &machine->kmaps;
    2. dso in struct map points to the kernel dso that was created above and inserted into machine->dsos
    3. struct map_groups *kmaps in the latter struct kmap points to the struct map_groups in the machine structure.
    machine {
    struct map vmlinux[MAP__NR_TYPES];
    //vmlinux[*]->dso = created dso;
    //(struct kmap *)(vmlinux[*]+1) -> kmaps= machine->kmaps;
    }
    struct kmap {

        struct ref_reloc_sym    *ref_reloc_sym;
        struct map_groups    *kmaps;
    };
    struct map_groups {
        struct maps     maps[MAP__NR_TYPES];
        struct machine     *machine;   // machine->kmaps -> machine = machine;
        atomic_t     refcnt;
    };
  • machine__create_modules(machine);
    modules__parse(modules, machine, machine__create_module)
    Get start address and names for all modules.

    1. struct map *map = machine__findnew_module_map(machine, start, name);
      For each module, find out whether a dso for the module has already been inserted into machine->dsos; if not, a new dso is created with the passed-in module name and then inserted into machine->dsos.
      struct map *map = map_groups__find_by_name(&machine->kmaps, MAP__FUNCTION, m.name);
      if (map == NULL) //Can’t find map, so create one for this module
      struct dso *dso = machine__findnew_module_dso(machine, &m, filename);
      if (dso != NULL)
      Find out whether there is an existing dso for this module name; if not, create one. The number of modules linked to a dso is counted by dso->refcnt.
      struct map *map = map__new2(start, dso, MAP__FUNCTION);
      map_groups__insert(&machine->kmaps, map);
    2. dso__kernel_module_get_build_id(map->dso, machine->root_dir);
      Still for each module, read /sys/module/[MODULE_NAME]/notes/.note.gnu.build-id just like we did with sysfs__read_build_id(path, dso->build_id, sizeof(dso->build_id)) in Symbol-minimal.c
      sysfs__read_build_id(filename, dso->build_id, sizeof(dso->build_id))
    3. machine__set_modules_path(machine);
      Go to /lib/modules/$(KERNEL_VERSION)/ and check all files, including those in sub-directories, for modules ending with “.ko”; if one is found, set the dso’s long_name to the module’s absolute path.
      kmod_path__parse_name(&m, dent->d_name);
      m->kmod = !strncmp(ext, “.ko”, 3);
      m->name = strdup(name)
      map_groups__set_module_path(mg, path, &m);
      Try to find module in (struct dso *)mg->maps by name. If succeed, set dso’s long_name with module’s absolute path.
      long_name = strdup(path);
      dso__set_long_name(map->dso, long_name, true);
  • map_groups__fixup_end(&machine->kmaps);
    Set prev->end = cur->start;
    last->end = ~0ULL;

Get back to __cmd_record

2. record__init_features(rec);
3. perf_evlist__prepare_workload(rec->evlist, &opts->target, argv, file->is_pipe,
workload_exec_failed_signal);

  • Start  a new process using fork() for the suffix parameters e.g. sleep 3. evlist->workload.pid = fork();
    Child process(cp) tells parent process(pp) by:
    close(child_ready_pipe[0]);
    close(go_pipe[1]);
    fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); // set go_pipe[0] to FD_CLOEXEC
    close(child_ready_pipe[1]);
    ret = read(go_pipe[0], &bf, 1)
    pp set cp go by
    close(child_ready_pipe[1]);
    close(go_pipe[0]);
    if (read(child_ready_pipe[0], &bf, 1) == -1) {
        goto out_close_pipes;
    };
    fcntl(go_pipe[1], F_SETFD, FD_CLOEXEC);// SET go_pipe[1] to FD_CLOEXEC
    Then cp runs execvp(argv[0], (char **)argv);

4. record__open(rec)

  • perf_evlist__config(rec.evlist, rec.opts);
    perf_can_comm_exec();
    perf_probe_api(perf_probe_comm_exec);
    perf_do_probe_api(perf_probe_comm_exec, cpu, try[]);
    //const char *try[] = {“cycles:u”, “instructions:u”, “cpu-clock:u”, NULL};
    perf_event_open_cloexec_flag
    fd = sys_perf_event_open(&evsel->attr, pid, cpu, -1, flags);
    //sys_perf_event_open(&evsel->attr, -1, 0, -1, 8); just a probe here
    //This syscall creates a file descriptor that allows measuring performance information. Each file descriptor corresponds to one event measured;
    //for more details please visit here.
  • Open an fd for each cpu and each thread, for every evsel in evlist
    evlist__for_each(evlist, pos) {

    perf_evsel__open(pos, pos->cpus, pos->threads)
    __perf_evsel__open(evsel, cpus, threads);
    Store each file descriptor in evsel->fd->contents[]
    for (cpu = 0; cpu < cpus->nr; cpu++) {
        for (thread = 0; thread < nthreads; thread++) {
            FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
    pid,
    cpus->map[cpu], group_fd, flags);
    } // endof nthreads
    } // endof cpus->nr
    /*The pid and cpu arguments specifies which process and CPU to
    monitor.

    pid: 0 cpu: -1 measures the calling process for all cpu

    pid: 0 cpu: >= 0 measures the calling process for specified cpu

    pid: > 0 cpu: -1 measures the specified process for all cpu
    pid: > 0 cpu: >= 0 measures the specified process for specified cpu
    pid: -1 cpu: >= 0 measures all processes/threads on the specified cpu
    This requires CAP_SYS_ADMIN capability or a
    /proc/sys/kernel/perf_event_paranoid value of less than 1.
    pid: -1 cpu: -1 error.

    */

    #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
    static inline void *xyarray__entry(struct xyarray *xy, int x, int y)
    {return &xy->contents[x * xy->row_size + y * xy->entry_size];}
    // i.e. FD(evsel, cpu, thread) expands to *(int *)&evsel->fd->contents[cpu * evsel->fd->row_size + thread * evsel->fd->entry_size]
  • Created fds are stored in evsel->fd->contents[].
    Now perf got all fd with syscall in kernel space, we need to mmap them to userspace.
    perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
    opts->auxtrace_mmap_pages,
    (opts->auxtrace_snapshot_mode<0))
    perf_evlist__mmap_ex(evlist, 4294967295, false, 0, false)

    struct mmap_params mp = {
        .prot = PROT_READ | (overwrite ? 0 : PROT_WRITE),
    };

    perf_evlist__alloc_mmap(evlist)
    perf_evlist__alloc_pollfd(evlist)

    perf_evlist__mmap_per_cpu(evlist, &mp)

    for each online cpu: {
        int output = -1;
        for each thread:
    perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu, thread, &output)

    };

    perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu, thread, &output) ->
    evlist__for_each(evlist, evsel) {
    __perf_evlist__mmap(evlist, cpu, mp, *output)->
    //*output = FD(evsel, cpu, thread)
    evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, mp->prot,
    MAP_SHARED, *output, 0);
    Then memory could be accessed through evlist->mmap[cpu].base

    auxtrace_mmap__mmap(&evlist->mmap[cpu].auxtrace_mmap,
    &mp->auxtrace_mp, evlist->mmap[cpu].base, *output)
    munmap(mm->base, mm->len)
    }

Then we need to write and map information to perf.data file.
5. perf_session__write_header(session, rec->evlist, fd, false);
Write all f_attr of evsel in evlist down to fd

  1.  lseek(fd, sizeof(f_header), SEEK_SET);
  2. evlist__for_each(session->evlist, evsel) {
    evsel->id_offset = lseek(fd, 0, SEEK_CUR);
    //set offset for each evsel in evlist
    do_write(fd, evsel->id, evsel->ids * sizeof(u64))
    fd | f_header | id_offset * nr | f_attr * nr | data        |
    // record all evsel’s id info into id_offset section.
  3. evlist__for_each(evlist, evsel) {
    f_attr = (struct perf_file_attr){
    .attr = evsel->attr,
    .ids  = {
    .offset = evsel->id_offset,
    .size   = evsel->ids * sizeof(u64),
    }
    };
    do_write(fd, &f_attr, sizeof(f_attr));
    // write f_attr above into fd
    }
    fd | f_header | id_offset * nr | f_attr * nr | data        |
    header->data_offset = lseek(fd, 0, SEEK_CUR);
    header->feat_offset = header->data_offset + header->data_size;
    // set offset for data and feat_offset
  4. do_write(fd, &f_header, sizeof(f_header));
    // write header of fd
    evlist__for_each(evlist, evsel) {
    f_attr = (struct perf_file_attr){
    .attr = evsel->attr,
    .ids  = {
    .offset = evsel->id_offset,
    .size   = evsel->ids * sizeof(u64),
    }
    };
    err = do_write(fd, &f_attr, sizeof(f_attr));
    }
    fd | f_header | id_offset * nr | f_attr * nr | data        |
    f_header = (struct perf_file_header){
    .magic       = PERF_MAGIC,
    .size       = sizeof(f_header),
    .attr_size = sizeof(f_attr),
    .attrs = {
    .offset = attr_offset,
    .size   = evlist->nr_entries * sizeof(f_attr),
    },
    .data = {
    .offset = header->data_offset,
    .size    = header->data_size,
    },
    /* event_types is ignored, store zeros */
    };
    fd | f_header | id_offset * nr | f_attr * nr | data        |
    attr_offset   data_offset  data_size
    attr_offset = lseek(fd, 0, SEEK_CUR);
    // set offset for attr in fd

6. perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,machine);
build a union perf_event *event;
size = snprintf(event->mmap.filename, sizeof(event->mmap.filename), "%s%s",
mmap_name, kmap->ref_reloc_sym->name); // "[kernel.kallsyms]_text"
event->mmap.header.size = (sizeof(event->mmap) -
(sizeof(event->mmap.filename) - size) + machine->id_hdr_size);
event->mmap.pgoff = kmap->ref_reloc_sym->addr;
event->mmap.start = map->start;
event->mmap.len   = map->end - event->mmap.start;
event->mmap.pid   = machine->pid;
Insert event to rec by calling process_synthesized_event();
record__write(rec, event, event->header.size);

7. perf_event__synthesize_modules(tool, process_synthesized_event,
machine)

struct mmap_event {
    struct perf_event_header header;
    u32 pid, tid;
    u64 start;
    u64 len;
    u64 pgoff;
    char filename[PATH_MAX];
};
union perf_event{
struct mmap_event        mmap;
}
union perf_event *event;

Create perf_events (as shown above) for all maps in machine->kmaps->maps[MAP__FUNCTION] respectively and fill in members like
mmap.header.type = PERF_RECORD_MMAP
MMAP |struct perf_event_header header| pid, tid| start | len | pgoff | filename |
size = PERF_ALIGN(pos->dso->long_name_len + 1, sizeof(u64));
// Length of long_name in dso
event->mmap.header.size =
sizeof(event->mmap) - (sizeof(event->mmap.filename) - size);
// Calculate the real length of mmap and write into header.size, step one.
memset(event->mmap.filename + size, 0, machine->id_hdr_size);
event->mmap.header.size += machine->id_hdr_size;
// Calculate the real length of mmap, step two.
memset(event->mmap.filename + size, 0, machine->id_hdr_size);
Zero out machine->id_hdr_size bytes starting at filename + size, because the first size bytes of filename are needed to store long_name.
MMAP | header| pid, tid | start | len | pgoff | filename |
header.size
filename |machine->dso->long_name| machine->id_hdr_size|
mmap.start, mmap.len, mmap.pid, mmap.filename. Then insert them all to “perf.data” file.
MMAP | header | pid, tid | start | len | pgoff | filename |

8. __machine__synthesize_threads
Since we called perf with -a option, perf_event__synthesize_threads will be called.
For all process in /proc/
__event__synthesize_thread(comm_event, mmap_event, fork_event, pid, (int full)1, process, tool, machine, mmap_data, proc_map_timeout);
For every task (_pid) in /proc/pid/task/, create a comm_event, a fork_event and an mmap_event, and get COMM, tgid and ppid for the task from /proc/pid/status by calling perf_event__prepare_comm(comm_event, _pid, machine, &tgid, &ppid).
PID = 3665
COMM_EVENT
*tgid = 3665
*ppid = 2485

event->comm.pid = *tgid;(3665) // they share the same pid
event->comm.header.type = PERF_RECORD_COMM;

struct comm_event {
    struct perf_event_header {
        PERF_RECORD_COMM;
        size
    } header;
    u32 pid; 3665
    u32 tid; _pid in /proc/3665/task/
    char comm[16]; name+machine->id_hdr_size
};

Then insert fork_event, comm_event to perf.data.
For the main thread, create an mmap_event and read /proc/pid/maps, each line printed will be recorded and then written into perf.data in terms of mmap_event.

perf_event__synthesize_mmap_events(tool, mmap_event, pid: 3665, tgid: 3665,
process, machine, mmap_data: false, proc_map_timeout: 500)
cat /proc/3665/maps
82a23000-82e75000 r-xp 00000000 08:01 134003    /lib/i386-linux-gnu/libnss_files-2.19.so
sscanf(bf, "%llx-%llx %s %llx %x:%x %u %s\n",
&event->mmap2.start, &event->mmap2.len, prot,
&event->mmap2.pgoff, &event->mmap2.maj,
&event->mmap2.min,
&ino, execname);
struct mmap2_event {
struct perf_event_header header;
u32 pid, tid; 3665, 3665
u64 start; 0x82a23000
u64 len; 0x452000 (event->mmap2.len -= event->mmap.start;)
u64 pgoff; 00000000
u32 maj; 08
u32 min; 01
u64 ino; 134003
u64 ino_generation;
u32 prot; 0x4 | 0x1
u32 flags; 0x02;
char filename[PATH_MAX]; “/lib/i386-linux-gnu/libnss_files-2.19.so”
};
char execname[PATH_MAX] =  ” /lib/i386-linux-gnu/libnss_files-2.19.so”
header.size = sizeof(event->mmap2)-sizeof(filename)+(strlen(filename)+1)+id_hdr_size