C programming language calling procedure with assembly language description

Source code:

int g(int x)
return x + 3;

int f(int x)
return g(x);

int main(void)
return f(8) + 1;

The assembly language is generated by:
gcc -S -o main.s main.c -m32
Assembly Source Code











_main:                               ## @main
pushl %ebp                       ##  suppose ebp = esp = 2000
movl %esp, %ebp
subl $4, %esp
movl $20, (%esp)              ## set content in %esp to 20 for later use. (Parameter)
call f: push %eip; movl %(f), %eip
## suppose f is 23, then 23 is stored at address 1992
## see the sketch below for more details
pushl %ebp                       ## save ebp and esp = esp - 4
movl %esp, %ebp            ## let the new ebp point to the new esp
subl $4, %esp
movl 8(%ebp), %eax         ## eax = value stored at address %ebp + 8 = 20 (the parameter)
movl %eax, (%esp)           ## (%esp) = 20
call g: push %eip, movl %g, %eip
to be continued by readers. 🙂

Source Code Analysis

Thanks for reading!

perf tool source code analysis: perf record

Perf tool in Linux Kernel is used to analyze various kinds of performance issues. More information could be accessed here, but this article goes with the typical calling procedure of the built in core function “cmd_record”. Source code of perf could be found in linux/tools/perf/perf.c. Now let’s begin with function main(). You could read this article for a quick review. Don’t panic!
Take perf record -a sleep 3 for example.
1. main()->run_argv()->handle_internal_command()->run_builtin()->
status = p->fn(argc, argv, prefix)->cmd_record()
build a new record struct rec with struct record *rec = &record;
/*record is initialized with the following code*/
static struct record record = {
    .opts = {
    .tool = {
Skip these data structures as you wish but don’t hesitate to look up for the variables later.
struct perf_evlist {
    struct list_head entries;
    struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];//1<<8
    struct fdarray     pollfd;
    struct thread_map *threads;
    struct cpu_map      *cpus;
    struct perf_evsel *selected;

Note: perf_evsel stands for one event, and perf_evlist stands for all events we selected, which perf uses to communicate with kernel.
2. rec->evlist = perf_evlist__new();
struct perf_evlist *evlist = zalloc(sizeof(*evlist));
perf_evlist__init(evlist, NULL, NULL);
2.1 init all 256 struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]  to NULL
2.2 init struct list_head entries
2.3 perf_evlist__set_maps(evlist, NULL, NULL)
  • set evlist->cpu and evlist->threads to NULL
  • perf_evlist__propagate_maps
    1. struct perf_evsel *evsel;
    2. evlist__for_each(evlist, evsel) {
      set all evsel->cpu and evsel->thread in evlist to NULL;
      # __evlist__for_each(&(evlist)->entries, evsel)
      # list_for_each_entry(evsel, &(evlist)->entries, node)
      # struct list_head entries; entries belongs to struct evlist
      # struct list_head node; node belongs to struct evsel,
      # we use node to insert evsel into evlist->entries or in other list_heads
      # #define list_for_each_entry(evsel, list, node)
      # for (evsel = list_first_entry(list, typeof(*evsel), node);      
                   &evsel->node != (list);                    
                   evsel = list_next_entry;
      # evlist and evsel are connected through double linked list struct list_head entries in evlist and node in evsel. We’ll talk about it later.
  • fdarray__init(&evlist->pollfd, 64);
    # set fdarray.nr_autogrow to 64 and others to 0 or NULL;

3. perf_evlist__add_default(rec->evlist);
    struct perf_event_attr attr = {
        .type = PERF_TYPE_HARDWARE,
        .config = PERF_COUNT_HW_CPU_CYCLES,
add a new evsel named “cycles” to evlist
3.1 evsel = perf_evsel__new(&attr);
3.2 perf_evlist__add(evlist, evsel)

  • entry->evlist = evlist; # note: put evsel in one evlist for further usage
  • list_add_tail(&entry->node, &evlist->entries);
    # note: list_head is a double linked list which is quite commonly used in kernel.
4. target__parse_uid(&rec->opts.target)
target->uid = UINT_MAX;
5. perf_evlist__create_maps(rec->evlist, &rec->opts.target)
target = {pid = 0x0, tid = 0x0, cpu_list = 0x0, uid_str = 0x0, uid = 4294967295, system_wide = true, uses_mmap = true, default_per_cpu = true, per_thread = false}
5.1 evlist->threads = thread_map__new_str(target->pid, target->tid, target->uid)
  • thread_map__new_str(NULL, NULL, UINT_MAX)
  • thread_map__new_by_tid_str(NULL);
  • thread_map__new_dummy();
  • thread_map__alloc(1);
  • thread_map__realloc(NULL, __nr);

# allocate sizeof(struct thread_map) + sizeof(struct thread_map_data) for thread_map
5.2 cpu_map__new(target->cpu_list);

  • cpu_map__read_all_cpu_map()
    1. fp = open(/sys/devices/system/cpu/online, r)
    2. struct cpu_map *cpus = cpu_map__read(fp);
      # read online cpus and allocate an integer array for them. (tmp_cpus[0,1,2,3,…])
    3. cpu_map__trim_new(nr_cpus, tmp_cpus)
      struct cpu_map *cpus = malloc(sizeof(*cpus) + sizeof(int)*nr_cpus);
      memcpy(cpus->map, tmp_cpus, sizeof(int)*nr_cpus);
      # copy the content of online cpus directly to cpus->map.

    Till now, target->cpu_list got online cpu information.
    Like: for (i = 0; i < nr_cpus; ++i) cpus->map[i] = i;

5.3 perf_evlist__propagate_maps(evlist, !!target->cpu_list)
ensure each evsel in evlist get the right cpu_list
5.4 record_opts__config(&rec->opts)
set recording frequency for perf
CORE FUNCTION __cmd_record(&record, argc, argv)
struct machine {
    char          *root_dir;
    struct dsos      dsos;
    struct map_groups kmaps;
    struct map      *vmlinux_maps[MAP__NR_TYPES];
    u64          kernel_start;
    symbol_filter_t      symbol_filter;
struct map_groups {
    struct maps     maps[MAP__NR_TYPES];
    struct machine     *machine;
atomic_t     refcnt;
1. perf_session__new(struct perf_data_file *file, bool repipe, struct perf_tool *tool)
perf record calls this in write mode (perf report calls it in read mode).
perf_session__new(&rec->file, false, &rec->tool);
1.1 machines__init(&session->machines);
machine__init((struct machine)&machines->host, “”, HOST_KERNEL_ID);
  • map_groups__init((struct map_groups)&machine->kmaps, machine);
    maps__init((struct maps)&mg->maps[i]);
  • dsos__init(&machine->dsos)

1.2 ordered_events__init(&session->ordered_events, ordered_events__deliver_event);
init ordered_events
1.3 perf_data_file__open(file)
by default open the output file as “perf.data” with create mode
1.4 perf_session__create_kernel_maps(session)
allocate memory for the initialized session->machines.machine

  • struct dso *kernel = machine__get_kernel(machine);
    • kernel = machine__findnew_kernel(machine, vmlinux_name, “[kernel]”, DSO_TYPE_KERNEL);
      machine__findnew_kernel(machine, “[kernel.kallsyms]”, “[kernel]”, 1);
      struct dso *dso = machine__findnew_dso(machine, “[kernel.kallsyms]”);
      dsos__findnew((struct dsos)&machine->dsos, “[kernel.kallsyms]”)
      find kernel with name “[kernel.kallsyms]”

      dso__get(__dsos__findnew(dsos, name));->
      __dsos__findnew(dsos, name)->
      __dsos__find(dsos, name, false);->

      __dso__find_by_longname(&dsos->root, name);->
      Make a dynamic shared object with “[kernel.kallsyms]” then insert it into &machine->dsos
      __dsos__addnew(dsos, name);->
      __dsos__add(dsos, dso);

    • dso__read_running_kernel_build_id(kernel, machine);
      Read build_id from /sys/kernel/notes which n_type=3 and n_namesz=3
      sysfs__read_build_id(path, dso->build_id, sizeof(dso->build_id)
      read_build_id(void *buf, buf_size,dso->build_id, sizeof(dso->build_id), false); // buf_size (stbuf.st_size of /sys/kernel/notes) typical value: 360B
      // File note is made of series of structure like
      struct {
           u32 n_namesz;
           u32 n_descsz;
           u32 n_type;
      } *nhdr;
      if (nhdr->n_type == NT_GNU_BUILD_ID &&
      nhdr->n_namesz == sizeof(“GNU”))
      In the first n_namesz stores a pointer points to the name of the very field. See the code here for the whole content of notes file.
      Copy the very desc to dso->build_id.
      Set dso->has_build_id = true.
  • machine__get_running_kernel_start(machine, &name);
    Figure out the start address of _text or _stext in /proc/kallsyms
    addr = kallsyms__get_function_start(filename, name); //filename = “/proc/kallsyms”
    kallsyms__parse(kallsyms_filename, &args, find_symbol_cb)
    Content of /proc/kallsyms filled with lines like 00000000 t fuse_async_req_send
    Read each line and find the very line c1000000 T _text | _stext and record the start address probably c1000000 to start address then return the hex value.(3238002688)
  • __machine__create_kernel_maps(machine, kernel) < 0)
    start = machine__get_running_kernel_start(machine, NULL);// do it again
    Create struct map *map for all MAP__NR_TYPES for passed in structure machine and allocate sizeof(struct map) + sizeof(struct kmap) for each of them.
    for type from 0 to MAP__NR_TYPES
    machine->vmlinux_maps[type] = map__new2(start, kernel, type);
    map__new2(start, kernel, type);->
    map__init(map, type, start, 0, 0, dso);
    void map__init(struct map *map, enum map_type type, u64 start, u64 end, u64,
    pgoff, struct dso *dso)
    kmap = map__kmap(machine->vmlinux_maps[type]);

    struct kmap *kmap = (struct kmap *)(map + 1)
    Set the kmap structure points to machine->kmaps.
    kmap->kmaps = &machine->kmaps;
    MAP__NR_TYPES struct map vmlinux_maps
    |0 | 1|  2 | 3 | …. | MAP__NR_TYPES-1|
    struct map *map; struct kmap *kmap;
    kmap->kmaps = machine->kmaps;
    map_groups__insert(&machine->kmaps, machine->vmlinux_maps[type]);
    __maps__insert(struct maps *maps, struct map *map)
    Insert new allocated map in machine->vmlinux[type] into kmaps in machine.

    struct machine {
        struct dsos      dsos;
        struct map_groups kmaps;
        struct map      *vmlinux_maps[MAP__NR_TYPES];
        u64          kernel_start;

    struct map {
    struct dso        *dso;
    struct map_groups    *groups;

    Now there is extra space for a struct kmap right after each struct map in machine, in which kmaps (of type struct map_groups *) points to the kmaps member (of type struct map_groups) of machine itself. Another thing is
    1. the struct map_groups *group in the former struct map is initialized by NULL and then set to machine->kmaps;
    2. dso in struct map points to machine->dso
    3. struct map_groups *group in the later struct kmap points to struct map_groups in machine structure.
    machine {
    struct map vmlinux[MAP__NR_TYPES];
    //vmlinux[*]->dso = created dso;
    //(struct kmap *)(vmlinux[*]+1) -> kmaps= machine->kmaps;
    struct kmap {

        struct ref_reloc_sym    *ref_reloc_sym;
        struct map_groups    *kmaps;
    struct map_groups {
        struct maps     maps[MAP__NR_TYPES];
        struct machine     *machine;   // machine->kmaps -> machine = machine;
        atomic_t     refcnt;
  • machine__create_modules(machine);
    modules__parse(modules, machine, machine__create_module)
    Get start address and names for all modules.

    1. struct map *map = machine__findnew_module_map(machine, start, name);
      For each module, check whether it has already been inserted into machine->dsos; if not, a new dso is created with the passed-in module name and then inserted into machine->dsos.
      struct map *map = map_groups__find_by_name(&machine->kmaps, MAP__FUNCTION, m.name);
      if (map == NULL) //Can’t find map, so create one for this module
      struct dso *dso = machine__findnew_module_dso(machine, &m, filename);
      if (dso != NULL)
      Find out if there is existing dso for this module name, if not, create one. Module numbers linked to dso is counted by dso->refcnt.
      struct map *map = map__new2(start, dso, MAP__FUNCTION);
      map_groups__insert(&machine->kmaps, map);
    2. dso__kernel_module_get_build_id(map->dso, machine->root_dir);
      Still for each modules, read /sys/module/[MODULE_NAME]/notes/.note.gnu.build-id just like we did in sysfs__read_build_id(path, dso->build_id, sizeof(dso->build_id) in Symbol-minimal.c
      sysfs__read_build_id(filename, dso->build_id, sizeof(dso->build_id)
    3. machine__set_modules_path(machine);
      Go to /lib/modules/$(KERNEL_VERSION)/ and check all files, including sub-directories, for modules ending with “.ko”; if one is found, set the dso’s long_name to the module’s absolute path.
      kmod_path__parse_name(&m, dent->d_name);
      m->kmod = !strncmp(ext, “.ko”, 3);
      m->name = strdup(name)
      map_groups__set_module_path(mg, path, &m);
      Try to find module in (struct dso *)mg->maps by name. If succeed, set dso’s long_name with module’s absolute path.
      long_name = strdup(path);
      dso__set_long_name(map->dso, long_name, true);
  • map_groups__fixup_end(&machine->kmaps);
    Set prev->end = cur->start;
    last->end = ~0ULL;

Get back to __cmd_record

2. record__init_features(rec);
3. perf_evlist__prepare_workload(rec->evlist, &opts->target, argv, file->is_pipe,

  • Start  a new process using fork() for the suffix parameters e.g. sleep 3. evlist->workload.pid = fork();
    Child process(cp) tells parent process(pp) by:
    fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); // set go_pipe[0] to FD_CLOEXEC
    ret = read(go_pipe[0], &bf, 1)
    pp set cp go by
    if (read(child_ready_pipe[0], &bf, 1) == -1) {
        goto out_close_pipes;
    fcntl(go_pipe[1], F_SETFD, FD_CLOEXEC);// SET go_pipe[1] to FD_CLOEXEC
    Then cp runs execvp(argv[0], (char **)argv);

4. record__open(rec)

  • perf_evlist__config(rec.evlist, rec.opts);
    perf_do_probe_api(perf_probe_comm_exec, cpu, try[]);
    //const char *try[] = {“cycles:u”, “instructions:u”, “cpu-clock:u”, NULL};
    fd = sys_perf_event_open(&evsel->attr, pid, cpu, -1, flags);
    //sys_perf_event_open(&evsel->attr, -1, 0, -1, 8); just a probe here
    //This syscall creates a file descriptor that allows measuring performance //information. Each file descriptor corresponds to one event measured;
    //for more details please visit here.
  • Open each of the fd for each cpu on each threads for every evsel on evlist
    evlist__for_each(evlist, pos) {

    perf_evsel__open(pos, pos->cpus, pos->threads)
    __perf_evsel__open(evsel, cpus, threads);
    Set the very file descriptor to the evsel->fd->content[]
    for (cpu = 0; cpu < cpus->nr; cpu++) {
        for (thread = 0; thread < nthreads; thread++) {
            FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
    cpus->map[cpu], group_fd, flags);
    } // endof nthreads
    } // endof cpus->nr
    /*The pid and cpu arguments specifies which process and CPU to

    pid: 0 cpu: -1 measures the calling process for all cpu

    pid: 0 cpu: >= 0 measures the calling process for specified cpu

    pid: > 0 cpu: -1 measures the specified process for all cpu
    pid: > 0 cpu: >= 0 measures the specified process for specified cpu
    pid: -1 cpu: >= 0 measures the all process for specified cpu
    This requires CAP_SYS_ADMIN capability or a
    “/proc/sys/kernel/perf_event_paranoid value of less than 1.”
    pid: -1 cpu: -1 error.


    #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
    static inline void *xyarray__entry(struct xyarray *xy, int x, int y)
    {return &xy->contents[x * xy->row_size + y * xy->entry_size];}
    // i.e. FD(evsel, cpu, thread) expands to *(int *)&evsel->fd->contents[cpu * evsel->fd->row_size + thread * evsel->fd->entry_size]
  • Created fds are stored in evsel->fd->contents[].
    Now perf got all fd with syscall in kernel space, we need to mmap them to userspace.
    perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
    perf_evlist__mmap_ex(evlist, 4294967295, false, 0, false)

    struct mmap_params mp = {
        .prot = PROT_READ | (overwrite ? 0 : PROT_WRITE),


    perf_evlist__mmap_per_cpu(evlist, &mp)

    for each online cpu: {
        int output = -1;
        for each thread:
    perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu, thread, &output))


    perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu, thread, &output)) ->
    evlist__for_each(evlist, evsel) {
    __perf_evlist__mmap(evlist, cpu, mp, *output)->
    //*output = FD(evsel, cpu, thread)
    evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, mp->prot,
    MAP_SHARED, *output, 0);
    Then memory could be accessed through evlist->mmap[cpu].base

    &mp->auxtrace_mp, evlist->mmap[cpu].base, *output)
    munmap(mm->base, mm->len)

Then we need to write and map information to perf.data file.
5. perf_session__write_header(session, rec->evlist, fd, false);
Write all f_attr of evsel in evlist down to fd

  1.  lseek(fd, sizeof(f_header), SEEK_SET);
  2. evlist__for_each(session->evlist, evsel) {
    evsel->id_offset = lseek(fd, 0, SEEK_CUR);
    //set offset for each evsel in evlist
    do_write(fd, evsel->id, evsel->ids * sizeof(u64))
    fd | f_header | id_offset * nr | f_attr * nr | data        |
    // record all evsel’s id info into id_offset section.
  3. evlist__for_each(evlist, evsel) {
    f_attr = (struct perf_file_attr){
    .attr = evsel->attr,
    .ids  = {
    .offset = evsel->id_offset,
    .size   = evsel->ids * sizeof(u64),
    do_write(fd, &f_attr, sizeof(f_attr));
    // write f_attr above into fd
    fd | f_header | id_offset * nr | f_attr * nr | data        |
    header->data_offset = lseek(fd, 0, SEEK_CUR);
    header->feat_offset = header->data_offset + header->data_size;
    // set offset for data and feat_offset
  4. do_write(fd, &f_header, sizeof(f_header));
    // write header of fd
    evlist__for_each(evlist, evsel) {
    f_attr = (struct perf_file_attr){
    .attr = evsel->attr,
    .ids  = {
    .offset = evsel->id_offset,
    .size   = evsel->ids * sizeof(u64),
    err = do_write(fd, &f_attr, sizeof(f_attr));
    fd | f_header | id_offset * nr | f_attr * nr | data        |
    f_header = (struct perf_file_header){
    .magic       = PERF_MAGIC,
    .size       = sizeof(f_header),
    .attr_size = sizeof(f_attr),
    .attrs = {
    .offset = attr_offset,
    .size   = evlist->nr_entries * sizeof(f_attr),
    .data = {
    .offset = header->data_offset,
    .size    = header->data_size,
    /* event_types is ignored, store zeros */
    fd | f_header | id_offset * nr | f_attr * nr | data        |
    attr_offset   data_offset  data_size
    attr_offset = lseek(fd, 0, SEEK_CUR);
    // set offset for attr in fd

6. perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,machine);
build a union perf_event *event;
size = snprintf(event->mmap.filename, sizeof(event->mmap.filename),”%s%s”,
mmap_name, kmap->ref_reloc_sym->name); //”[kernel.kallsyms]_text
event->mmap.header.size = (sizeof(event->mmap) -
(sizeof(event->mmap.filename) - size) + machine->id_hdr_size);
event->mmap.pgoff = kmap->ref_reloc_sym->addr;
event->mmap.start = map->start;
event->mmap.len   = map->end - event->mmap.start;
event->mmap.pid   = machine->pid;
Insert event to rec by calling process_synthesized_event();
record__write(rec, event, event->header.size);

7. perf_event__synthesize_modules(tool, process_synthesized_event,

struct mmap_event {
    struct perf_event_header header;
    u32 pid, tid;
    u64 start;
    u64 len;
    u64 pgoff;
    char filename[PATH_MAX];
union perf_event{
struct mmap_event        mmap;
union perf_event *event;

Create perf_events (as shown above) for all maps in machine->kmaps->maps[MAP__FUNCTION] respectively and fill in members like
mmap.header.type = PERF_RECORD_MMAP
MMAP |struct perf_event_header header| pid, tid| start | len | pgoff | filename |
size = PERF_ALIGN(pos->dso->long_name_len + 1, sizeof(u64));
// Length of long_name in dso
event->mmap.header.size =
sizeof(event->mmap) - (sizeof(event->mmap.filename) - size))
// Calculate the real length of mmap and write into header.size, step one.
memset(event->mmap.filename + size, 0, machine->id_hdr_size);
event->mmap.header.size += machine->id_hdr_size;
// Calculate the real length of mmap, step two.
memset(event->mmap.filename + size, 0, machine->id_hdr_size);
Zero out machine->id_hdr_size bytes starting at filename + size, because the first size bytes of filename store long_name.
MMAP | header| pid, tid | start | len | pgoff | filename |
filename |machine->dso->long_name| machine->id_hdr_size|
mmap.start, mmap.len, mmap.pid, mmap.filename. Then insert them all to “perf.data” file.
MMAP | header | pid, tid | start | len | pgoff | filename |

8. __machine__synthesize_threads
Since we called perf with -a option, perf_event__synthesize_threads will be called.
For all process in /proc/
__event__synthesize_thread(comm_event, mmap_event, fork_event, pid, (int full)1, process, tool, machine, mmap_data, proc_map_timeout);
For every task(_pid) in /proc/pid/task/, create a comm_event, fork_event and mmap_event for each of them, and get COMM, tgid and ppid for pid from /proc/pid/status by calling perf_event__prepare_comm(comm_event, _pid, machine, &tgid, &ppid).
PID = 3665
*tgid = 3665
*ppid = 2485

event->comm.pid = *tgid;(3665) // they share the same pid
event->comm.header.type = PERF_RECORD_COMM;

struct comm_event {
    struct perf_event_header {
    } header;
    u32 pid; 3665
    u32 tid; _pid in /proc/3665/task/
    char comm[16]; name+machine->id_hdr_size

Then insert fork_event, comm_event to perf.data.
For the main thread, create an mmap_event and read /proc/pid/maps, each line printed will be recorded and then written into perf.data in terms of mmap_event.

perf_event__synthesize_mmap_events(tool, mmap_event, pid: 3665, tgid: 3665,
process, machine, mmap_data: false, proc_map_timeout: 500)
cat /proc/3665/maps
82a23000-82e75000 r-xp 00000000 08:01 134003    /lib/i386-linux-gnu/libnss_files-2.19.so
sscanf(bf, "%llx-%llx %s %llx %x:%x %u %s\n",
&event->mmap2.start, &event->mmap2.len, prot,
&event->mmap2.pgoff, &event->mmap2.maj,
&ino, execname);
struct mmap2_event {
struct perf_event_header header;
u32 pid, tid; 3665, 3665
u64 start; 0x82a23000
u64 len; 0x452000 (event->mmap2.len -= event->mmap.start;)
u64 pgoff; 00000000
u32 maj; 08
u32 min; 01
u64 ino; 134003
u64 ino_generation;
u32 prot; 0x4 | 0x1
u32 flags; 0x02;
char filename[PATH_MAX]; “/lib/i386-linux-gnu/libnss_files-2.19.so”
char execname[PATH_MAX] =  ” /lib/i386-linux-gnu/libnss_files-2.19.so”
header.size = sizeof(event->mmap2)-sizeof(filename)+(strlen(filename)+1)+id_hdr_size


My VPS got DDoSed the other day, and I was so busy at work that I couldn’t spare the time to fix it. Now, as you can see, it works fine again; this article writes down how I did it.

On July 1st I received an email from my VPS provider telling me that my server was transferring over 10GB of anonymous BitTorrent data per day, so they had to shut it down to avoid legal issues. I was also told that the hacker might have left a backdoor even if I changed my root password, so my only option was to reinstall the VPS system. The problem now was how to save my old home page. Luckily, they offered me a rescue mode in which I could mount my old disk image; the bad news was that I couldn’t use chroot. Terrible things just happened.

First of all, mount -t ext4 /dev/xvdb /media/temp and copy important files to my own host, including
wp-content/uploads                                                    //uploaded files
/etc/apache2/sites-available/www.haodong.org      //my apache settings
/usr/share/wordpress/wp-config.php                       //wordpress settings
/etc/wordpress/*                                                          //wordpress settings
wp-content/uploads/2015_xx_xx_database.sql      //back up data base, most important

Second, install a new operating system, and install the following programs.
apt-get install apache2 php5  mysql-server mysql-common mysql-client
apt-get install libapache2-mod-auth-mysql php5-mysql php5-gd
Start apache service
service apache2 start
install phpmyadmin and wordpress
apt-get install phpmyadmin wordpress
Now I have wordpress in /usr/share/wordpress.
Upload settings to the corresponding positions.
scp www.haodong.org root@dong:/etc/apache2/sites-available/
a2ensite www.haodong.org
It seems apache2 requires its site config files to end with .conf, so rename www.haodong.org to haodong.conf if you get “www.haodong.org does not exist”.
For /etc/apache2, which includes default website config containing /var/www/html, make my website work by a2dissite 000-default.conf since in apache2.conf there is

IncludeOptional sites-enabled/*.conf

scp wp-content/uploads root@dong:/usr/share/wordpress/uploads/
scp wp-config.php root@dong:/usr/share/wordpress/wp-config.php                      
scp etc/wordpress/* root@dong:/etc/wordpress

Add on Oct. 16 2015, it seems apache2 has changed its way of setting. Website folders are restrained to /var/www/html, so synchronize files in wordpress.

rsync -avP /usr/share/wordpress/  /var/www/html/

Make sure:
1. define(‘WPLANG’, ‘zh_CN’); exist in /etc/wordpress/wp-config.php
2. Enable write permission of htaccess since I need permanent links.
3. The location in /etc/apache2/sites-available/www.haodong.org is just the wordpress symbolic setting file.
4. Be careful on /etc/wordpress/config-xxx.php, which stored the name and password to mysql data file. Make sure you put them in a safe place, and if not, don’t hesitate to crash your head on a wall. :p
Technically, we finished reinstalling and enabling everything, let’s try.
1. Open https://haodong.org/phpadmin and input your mysql user and password.(Should be set when you install mysql, like root:123456), if you forget everything about mysql, you could reset DB_NAME, DB_USER, DB_PASSWORD by
mysql -u root -p
CREATE USER wordpressuser@localhost IDENTIFIED BY ‘password’;
GRANT ALL PRIVILEGES ON wordpress.* TO wordpressuser@localhost;
2. upload the sql file we set aside, remember wordpress only support upload file under 2MB, so we need to gzip it to a xxx.sql.zip. After that, open https://haodong.org/wp-admin/install.php, set an administrator user:password pair, which will be stored in the wp-users table we just imported.
Should everything be done?
3. Open www.haodong.org, oops, I forget my administrator user and password. How should I do? Edit the table wp-users table and set password field to “5d41402abc4b2a76b9719d911017c592” to reset my password to “hello”.
Login, and see how my new website going, but what I saw is an amount of messy code. Actually, the old mysql data file is stored in latin1, so before upload the data file, I need to set it to UTF8 in order to make Chinese characters looks well.
vi xxx_database.sql
gzip xxx.sql and upload the file again with UTF-8 Unicode (utf8)  and utf8_general_ci
Here we got beautiful Chinese words. But wait, from ssh, I still saw Chinese names shown abnormally on my screen.
vi /var/lib/locales/supported.d/local
zh_CN.UTF-8 UTF-8
en_US.UTF-8 UTF-8
zh_CN GB2312
vi /etc/environment
Then run locale-gen
Until now, I could get access to all of my articles when I open my website. But there is one more thing. The media files could still not be touched. I could change uploads file to 777 but this may cause security issues, actually I don’t want to save my vps again. So what I do is:
Change uploads file to 777 temporarily and upload one picture, log on to my vps and see who owns wordpress as a user. And I got:
# ll wordpress/wp-content/uploads/xxx.jpg
OK, gotcha,
chown -R www-data:www-data /usr/share/wordpress
chmod 755 /usr/share/wordpress
Now everything works well.
Oh, there is still one more thing, I’d like to talk about is the security problem, I need to do something to avoid being hacked again.




唠叨两句,线性地址和物理地址经过硬件转化,所以看到奇奇怪怪的物理地址也不用担心,这跟逻辑地址没什么直接联系,另外,读代码之前要先明白这段代码是做swapper_pg_dir初始化用的,带着目的读会提高效率. 回顾x86的内存寻址过程,大概是CR3寄存器配合PDT,然后找到PTE的某项后加Page Offset找到所要的内容。不过这里是临时页表的初始化,NO PAE,OK,继续!

/arch/x86/kernel/head_32.S 页目录初始化代码节选
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
page_pde_offset = (__PAGE_OFFSET >> 20);
  • movl $pa(__brk_base), %edi
  • movl $pa(initial_page_table), %edx
  • movl $PTE_IDENT_ATTR, %eax
    x86/include/asm/pgtable_types.h(#define PTE_IDENT_ATTR 0x003) PTE的属性是0x003,即PRESENT+RW,另外还有 #define PDE_IDENT_ATTR   0x067      /* PRESENT+RW+USER+DIRTY+ACCESSED */  #define PGD_IDENT_ATTR   0x001      /* PRESENT (no other attributes) */


  • leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
    leal     S,D 结果:&S->D
    movl S,D 结果:S->D
    movl (S), D 结果:*S->D (S所指向内存单元的内容)
    leal (S), D 结果:S->D
  • movl %ecx,(%edx) /* Store identity PDE entry */
  • movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
    /*再把ecx送到swapper_pg_dir的第768项,page_pde_offset为0xc00,swapper_pg_dir每项为4字节,因此为768项,回顾1024个页表项,前768为用户空间,后256为内核空间,这里叫kernel PDE entry是这个意思*/
  • addl $4,%edx
  • movl $1024, %ecx
    /*tag 10做的工作就是创建一个PDE entry并放入ecx,然后加两个属性标志位存在swapper_pg_dir里*/
  • stosl/*eax(0x003)的内容放入edi指向的物理地址(pg0),然后edi+4,*/
  • addl $0x1000,%eax
    /*eax: 0x1003, 0x2003, 0x3003 …, 0x3ff003*/
    /*提示:pg0页有4K,按照edi+4依次递增放入eax的值,换句话说,pg0的第零项是0x0003,第一项0x1003…, 第1023项,0x3ff003*/
  • loop 11b
  • /*
  • * End condition: we must map up to the end + MAPPING_BEYOND_END.
  • */
  • movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp

    • Enough space to fit pagetables for the low memory linear map */
    • #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
      #define PTRS_PER_PGD 1024
      #define PAGE_SHIFT 12
      PAGE_TABLE_SIZE(0X40000000>>12) <<12
    • PAGE_OFFSET 以外的地址包括0x100000000-0xc0000000=0x40000000,共0x40000页,算得配套页表大小 PAGE_TABLE_SIZE为0x40000/1024=0x100(256个) MAPPING_BEYOND_END为0x100<<PAGE_SHIFT = 0x100000,即1MB
  • cmpl %ebp,%eax
    /*第二轮 eax = 0x7ff03 …. */
  • jb 10b
  • addl $__PAGE_OFFSET, %edi
  • movl %edi, pa(_brk_end)
  • shrl $12, %eax
  • movl %eax, pa(max_pfn_mapped)
  • /* Do early initialization of the fixmap area */
  • movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
  • movl %eax,pa(initial_page_table+0xffc)
swapper_pg_dir第0项内容是pg0的物理地址+PDE_IDENT_ATTR,也就是0x567067, 表示该页表是用户页表,可读写,可访问..(PRESENT+RW+USER+DIRTY+ACCESSED)
(回想stosl,即 movl %eax, (%edi); addl $4, %edi)pg0的第0项表示第一页的物理地址,内容是0x003, 第二页物理地址是0x1003 … 下同. 它们对应物理地址前4MB.




暂时以x86为例子,下同,KVM在X86下表现为一个驱动. 这里主要讨论AMD的svm,驱动代码主要位于arch/x86/kvm/svm.c. 英特尔的vmx也会顺带提及.不多说,直接上干货!

加载驱动模块svm.ko -> module_init(svm_init) (svm.c)加粗括号内为函数所在文件名或路径,下同.

     调用kvm_init(kvm_x86_ops,sizeof (struct vcpu_svm))(virt/kvm/kvm_main.c)
intel的vmx.c中的初始化函数同样会调用kvm_init(),只是参数kvm_x86_ops, sizeof struct vcpu_svm/vcpu_vmx不同.

1. kvm_x86_ops(svm.c)中预定义的N多函数.
2. vcpu_svm(kvm_svm.h)我们需要用到它的大小(size)作为参数
3. 调用kvm_init(kvm_main.c)

创建了一些debug entry, kvm_stats_debugfs_item结构体在x86.c中初始化,之后kvm_init()开始调用kvm_arch_init(opaque)(x86.c),

  • 使用传入的kvm_x86_ops参数,留心下会发现这个东西是通过一个void *类型的指针opaque传入的(svm.c).
  • 进入kvm_arch_init(opaque),首先它检测当前的OS是否支持kvm并初始化 kvm_x86_ops全局指针,然后调用kvm_mmu_module_init()(mmu.c)
    • 这个函数初始化了三个暂存的cache,然后调用kvm_init_msr_list()(msr的全称为machine specific registers)它通过rdmsr_safe()把msr保存到全局变量msrs_to_save[]数组.
  • 回到kvm_arch_init()(x86.c),调用kvm_mmu_set_mask_ptes()(mmu.c)

之后返回到kvm_init().调用kvm_arch_hardware_setup()(x86.c),实际上调用的是 kvm_x86_ops->hardware_setup(),仍以svm为例,假设我们使用svm.c中的kvm_x86_ops连接了参数结构 体,则需要跳至svm.c中的svm_hardware_setup()函数(intel的vmx则是vmx.c中的 hardware_setup())

  • 进入svm_hardware_setup()分配两个内存页,然后两页全部填充1,之后再init_msrpm_offsets()申请一个全局内存页变量,同样全部填充1,注意第二次申请页时调用了set_msr_interception()对可以拦截的MSR进行设置标记.
  • 然后通过宏对每一个虚拟cpu调用svm_cpu_init()(for_each_possible_cpu(cpu))
    • svm_cpu_init()
      1. 为传入的cpu分配一个svm_cpu_data结构体sd
      2. 为sd的cpu字段初始化
      3. 为sd的saved_data变量分配一个页面


  • per_cpu(svm_data, cpu) = svm_data (include/asm-generic/percpu.h)展开宏定义
    • per_cpu(var, cpu) => (*SHIFT_PERCPU_PTR(&(var), per_cpu_offset(cpu))) => #define SHIFT_PERCPU_PTR(__p, __offset)
      ({ __verify_pcpu_ptr((__p));
      RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)); \
      最终调用了RELOC_HIDE(include/linux/compiler-gcc.h) 猜想可能是一个隐藏重定位用的
  • svm.c文件的起始位置有行代码static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);(include/linux/percpu-defs.h)它定义了一个per-cpu变量svm_data,即在per-cpu数据区(而不是栈空间)中为每个cpu各保留一份该指针的副本.
  • 我们的目标是应该可以通过调用per_cpu(svm_data,cpu)可以获取当前cpu变量(描述结构体),而为了cpu值则可以通过调用get_cpu_var()(include/asm-generic/percpu.h)获取.为了达到这一目的,我们需要对每一个虚拟cpu做如下操作
    per_cpu(svm_data, cpu) = svm_data
  • 这时每个虚拟cpu均被初始化,进而保证以后每当调用per_cpu()的时候能找到当时存在这里的svm_cpu_data和svm_data->save_data. (貌似说太多了,其实就是初始化)
    int me = raw_smp_processor_id();
    sd = per_cpu(svm_data, me);


  • 先看kvm_arch_check_processor_compat,它返回 kvm_x86_ops->check_processor_compatibility.进入svm.c看这个void函数发现它仅仅强制参数指针指向的值转换为0,所以还是关心smp_call_function_single()(kernel/smp.c)好了,它让第一个参数指定的cpu运行第二个参数传入的回调函数,即刚刚提到的check_processor_compatibility, smp_ops.smp_call_function_mask(mask, func, info, wait)


  • 注册kvm_cpu_notifier回调函数.notifier_call = kvm_cpu_hotplug,
  • kvm_cpu_hotplug
    • 分别处理CPU_DYING, CPU_UP_CANCELED 与 CPU_ONLINE三种通知,并使能或禁止cpu虚拟化特性
    • hardware_enable
      • hardware_enable_nolock()
        • kvm_arch_hardware_enable()
    • hardware_disable



然后把早先传给kvm_init()的参数THIS_MODULE,也就是svm的模块名分别赋值给三个file operation结构体变量:
    kvm_chardev_ops.owner = module;
    kvm_vm_fops.owner = module;
    kvm_vcpu_fops.owner = module;


  • syscore_ops结构体,包括resume,suspend,
  • preempt_ops结构体,包括kvm_sched_in,kvm_sched_out





  2. Linux内核源代码kvm树, 下载点这里

dell E6430 修复记


公司发了戴尔的高配latitude E6430,配置了第三代的i7, 8G内存, 500GB硬盘和1G显存的NV显卡(这个对于Linux简直是灾难,后面会谈), 外部报价1w1左右. 可惜不能连外网. holy shit!! 这尼玛是严重的资源浪费! 安全软件是赛门铁克和趋势科技哼哈二将.撇开自娱自乐的趋势科技不谈,赛门铁克真是在windows防火墙上下足了功夫,流行的各种安全软件都不能干掉它几个相互关联的守护进程,卸载退出需要密码,而且给用户仅仅enable了一个客户端,nac(network access control)更是没戏.我做了试着在注册表里删掉了symantec的退出验证, 运行smc -stop,欣喜的是赛门铁克关了成功连接了家里的wifi,蛋疼的是依旧上不了网,不停的有数据包送到,但就是发不出去.之后我修改赛门铁克的策略,这个实属下策了,因为策略文件长的让人发指,要研究透了它可不容易.操作如下,导出策略文件xxx.policy,打开,把本来应该丢弃的ip段访问从丢弃改为通过.导入之,开始运行,好么,是不报错了,但是赛门铁克直接提示无法提供完全保护.担心这段日志被上报,于是赶紧把之前的策略导回来.这个方案让我看到了曙光,但需要进一步改进.

最终方案: 双系统.这个是最先应该想到的,也是最先做的.一个月前就装了ubuntu,但是前几天系统总出现各种奇葩问题:

1. 安装virtualbox后每次启动都会提示一个模块无法初始化,需要手动启动.
2. 显卡驱动(NV5200)3D特效莫名其妙消失了,出于蛋疼,去官网更新了NV驱动,3D特效回来,心想果然闭源的驱动还是不靠谱,
3. 没过两天,系统不能关机了.表现为开机后alt+ctrl+f1显示黑屏(有一个cursor在屏幕左上角闪),而alt+ctrl+f2正常进入控制台.虽然不影响使用,但是关机时系统hang住(偶尔,很偶尔关机会表现为注销),这时alt+ctrl+del热启动有效.忽视之.又没过几天热启动无效,每次关机都需要长按电源键.在持续了近一个星期的这种关机方式之后,我顿时觉得这是对IT从业人员的一种侮辱.
1. 第一个问题很常见,virtualbox(以下简称VB)在软件源里更新不及时,添加官方源后重新安装即可,但是之前VB所建立的数据似乎向下兼容性问题很大,表现为新版本VB打开之前的虚拟机镜像后Guest OS的CPU usage几乎稳定在90%以上.换了几个模拟方式,问题依旧,期间还蓝屏好几次.于是只好放弃之前的旧数据,重新安装了新的image.正暗自庆幸问题解决,没过两天老问题又出现了,这让我一口老血喷了一墙.放弃治疗.
2. 显卡驱动重新安装后运行良好,但是安装期间的一行日志引起了我的注意,就是NV重新编译了内核,好让NV显卡替代之前的板载显卡,其实就是dkms把NV的一个驱动模块设置为了开机启动,可是VB的核心模块同样需要dkms开机自动启动,更何况我每次启动VB的时候做的就是这个操作,于是怀疑NV显卡和VB冲突.选择卸载NV驱动,安装开源驱动.
3. 不能关机的问题我实在摸不着北,试着在启动参数里加了acpi=off, 仍未解决,怀疑前两个问题导致,卸载NV驱动后仍然不能关机,聊以慰藉的是系统关机hang住的时候按电源键时可以被OS捕获,也就是说可以正常关机了.
目前状态: VB问题依旧, NV显卡驱动被卸载后系统无特效, 关机不正常.
我擦, 折腾了半天一个问题都没解决.
重新安装NV闭源驱动,安装完成后发现仍然没有3D特效, 艹! 放弃治疗,do-release-upgrade!重启后发现出现了键盘偶尔连击,没错,按一个钮,比如a,屏幕出现一排a.果断不能忍.重装系统!

手上有三张iso,一个是Ubuntu 12.04 lts,两个是Ubuntu 11.10 alternate release,后者提供了更多的功能,比如硬盘加密,LVM等,于是选择安装后者,当然主要原因还是在于之前那个糟心的系统是12.04(心理阴影). 备份数据后,安装过程无任何异常,但是重启进入新系统后,擦,黑屏! 第一反应是我的iso有问题, 于是换了另一张11.10重装,重启进入新系统,依旧黑屏!!无奈了,换12.04装吧,没想到这个iso在初始化界面就直接悲剧了.
考虑到之前报废的系统也是用这个iso装的(也许是我打开的方式不对),试着把它烧到U盘里安装,问题依旧.猜想可能是dell BIOS的问题,因为我上个月进行过一次升级.可这么大的bug戴尔不能任由它作死吧?给戴尔客服打电话,语音提示今天是周六,非工作日,人家不上班,好吧,还得靠自己搞定.

后记,这次NV果断选择了源里的驱动,虽然旧点,虽然依旧没有3D特效,但是总比这么折腾强.虚拟机选择virsh了,虽然翘课之前是做KVM的,总用Oracle的东西似乎不太合适(VB确实好用). 安装完firefox的flash-plugin后,发现3D回来了.人品爆发!!撒花!!!

一切的一切,是为了双十一啊, 有了外网我才能抢宝贝啊!!!!购物车已经塞满了啊!!!!