diff mbox

[RFC,11/14,v9] introduce a new monitor command 'dump' to dump guest's memory

Message ID 4F5FFE57.8010308@cn.fujitsu.com
State New
Headers show

Commit Message

Wen Congyang March 14, 2012, 2:11 a.m. UTC
The command's usage:
   dump [-p] file
file should be start with "file:"(the file's path) or "fd:"(the fd's name).

Note:
  1. If you want to use gdb to analyse the core, please specify -p option.
  2. This command doesn't support the fd that is is associated with a pipe,
     socket, or FIFO(lseek will fail with such fd).

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 Makefile.target  |    2 +-
 dump.c           |  714 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 elf.h            |    5 +
 hmp-commands.hx  |   21 ++
 hmp.c            |   10 +
 hmp.h            |    1 +
 qapi-schema.json |   14 +
 qmp-commands.hx  |   34 +++
 8 files changed, 800 insertions(+), 1 deletions(-)
 create mode 100644 dump.c

Comments

Luiz Capitulino March 14, 2012, 5:18 p.m. UTC | #1
On Wed, 14 Mar 2012 10:11:35 +0800
Wen Congyang <wency@cn.fujitsu.com> wrote:

> The command's usage:
>    dump [-p] file
> file should be start with "file:"(the file's path) or "fd:"(the fd's name).
> 
> Note:
>   1. If you want to use gdb to analyse the core, please specify -p option.
>   2. This command doesn't support the fd that is is associated with a pipe,
>      socket, or FIFO(lseek will fail with such fd).
> 
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> ---
>  Makefile.target  |    2 +-
>  dump.c           |  714 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  elf.h            |    5 +
>  hmp-commands.hx  |   21 ++
>  hmp.c            |   10 +
>  hmp.h            |    1 +
>  qapi-schema.json |   14 +
>  qmp-commands.hx  |   34 +++
>  8 files changed, 800 insertions(+), 1 deletions(-)
>  create mode 100644 dump.c
> 
> diff --git a/Makefile.target b/Makefile.target
> index c81c4fa..287fbe7 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -213,7 +213,7 @@ obj-$(CONFIG_NO_KVM) += kvm-stub.o
>  obj-$(CONFIG_VGA) += vga.o
>  obj-y += memory.o savevm.o
>  obj-y += memory_mapping.o
> -obj-$(CONFIG_HAVE_CORE_DUMP) += arch_dump.o
> +obj-$(CONFIG_HAVE_CORE_DUMP) += arch_dump.o dump.o
>  LIBS+=-lz
>  
>  obj-i386-$(CONFIG_KVM) += hyperv.o
> diff --git a/dump.c b/dump.c
> new file mode 100644
> index 0000000..42e1681
> --- /dev/null
> +++ b/dump.c
> @@ -0,0 +1,714 @@
> +/*
> + * QEMU dump
> + *
> + * Copyright Fujitsu, Corp. 2011
> + *
> + * Authors:
> + *     Wen Congyang <wency@cn.fujitsu.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include <unistd.h>
> +#include "elf.h"
> +#include <sys/procfs.h>
> +#include <glib.h>
> +#include "cpu.h"
> +#include "cpu-all.h"
> +#include "targphys.h"
> +#include "monitor.h"
> +#include "kvm.h"
> +#include "dump.h"
> +#include "sysemu.h"
> +#include "bswap.h"
> +#include "memory_mapping.h"
> +#include "error.h"
> +#include "qmp-commands.h"
> +#include "gdbstub.h"
> +
> +static inline uint16_t cpu_convert_to_target16(uint16_t val, int endian)
> +{
> +    if (endian == ELFDATA2LSB) {
> +        val = cpu_to_le16(val);
> +    } else {
> +        val = cpu_to_be16(val);
> +    }
> +
> +    return val;
> +}
> +
> +static inline uint32_t cpu_convert_to_target32(uint32_t val, int endian)
> +{
> +    if (endian == ELFDATA2LSB) {
> +        val = cpu_to_le32(val);
> +    } else {
> +        val = cpu_to_be32(val);
> +    }
> +
> +    return val;
> +}
> +
> +static inline uint64_t cpu_convert_to_target64(uint64_t val, int endian)
> +{
> +    if (endian == ELFDATA2LSB) {
> +        val = cpu_to_le64(val);
> +    } else {
> +        val = cpu_to_be64(val);
> +    }
> +
> +    return val;
> +}
> +
> +enum {
> +    DUMP_STATE_ERROR,
> +    DUMP_STATE_SETUP,
> +    DUMP_STATE_CANCELLED,
> +    DUMP_STATE_ACTIVE,
> +    DUMP_STATE_COMPLETED,
> +};
> +
> +typedef struct DumpState {
> +    ArchDumpInfo dump_info;
> +    MemoryMappingList list;
> +    uint16_t phdr_num;
> +    uint32_t sh_info;
> +    bool have_section;
> +    int state;
> +    bool resume;
> +    char *error;
> +    target_phys_addr_t memory_offset;
> +    write_core_dump_function f;
> +    void (*cleanup)(void *opaque);
> +    void *opaque;
> +} DumpState;
> +
> +static DumpState *dump_get_current(void)
> +{
> +    static DumpState current_dump = {
> +        .state = DUMP_STATE_SETUP,
> +    };
> +
> +    return &current_dump;
> +}

You just dropped a few asynchronous bits and resent this as a synchronous
command, letting all the asynchronous infrastructure in. This is bad, as the
command is more complex then it should be and doesn't make full use of the
added infrastructure.

For example, does the synchronous version really uses DumpState? If it doesn't,
let's just drop it and everything else which is not necessary.

*However*, note that while it's fine with me to have this as a synchronous
command we need a few more ACKs (from libvirt and Anthony and/or Jan). So, I
wouldn't go too far on making changes before we get those ACKs.

> +
> +static int dump_cleanup(DumpState *s)
> +{
> +    int ret = 0;
> +
> +    memory_mapping_list_free(&s->list);
> +    s->cleanup(s->opaque);
> +    if (s->resume) {
> +        vm_start();
> +    }
> +
> +    return ret;
> +}
> +
> +static void dump_error(DumpState *s, const char *reason)
> +{
> +    s->state = DUMP_STATE_ERROR;
> +    s->error = g_strdup(reason);
> +    dump_cleanup(s);
> +}
> +
> +static int write_elf64_header(DumpState *s)
> +{
> +    Elf64_Ehdr elf_header;
> +    int ret;
> +    int endian = s->dump_info.d_endian;
> +
> +    memset(&elf_header, 0, sizeof(Elf64_Ehdr));
> +    memcpy(&elf_header, ELFMAG, 4);
> +    elf_header.e_ident[EI_CLASS] = ELFCLASS64;
> +    elf_header.e_ident[EI_DATA] = s->dump_info.d_endian;
> +    elf_header.e_ident[EI_VERSION] = EV_CURRENT;
> +    elf_header.e_type = cpu_convert_to_target16(ET_CORE, endian);
> +    elf_header.e_machine = cpu_convert_to_target16(s->dump_info.d_machine,
> +                                                   endian);
> +    elf_header.e_version = cpu_convert_to_target32(EV_CURRENT, endian);
> +    elf_header.e_ehsize = cpu_convert_to_target16(sizeof(elf_header), endian);
> +    elf_header.e_phoff = cpu_convert_to_target64(sizeof(Elf64_Ehdr), endian);
> +    elf_header.e_phentsize = cpu_convert_to_target16(sizeof(Elf64_Phdr),
> +                                                     endian);
> +    elf_header.e_phnum = cpu_convert_to_target16(s->phdr_num, endian);
> +    if (s->have_section) {
> +        uint64_t shoff = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr) * s->sh_info;
> +
> +        elf_header.e_shoff = cpu_convert_to_target64(shoff, endian);
> +        elf_header.e_shentsize = cpu_convert_to_target16(sizeof(Elf64_Shdr),
> +                                                         endian);
> +        elf_header.e_shnum = cpu_convert_to_target16(1, endian);
> +    }
> +
> +    ret = s->f(0, &elf_header, sizeof(elf_header), s->opaque);
> +    if (ret < 0) {
> +        dump_error(s, "dump: failed to write elf header.\n");
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static int write_elf32_header(DumpState *s)
> +{
> +    Elf32_Ehdr elf_header;
> +    int ret;
> +    int endian = s->dump_info.d_endian;
> +
> +    memset(&elf_header, 0, sizeof(Elf32_Ehdr));
> +    memcpy(&elf_header, ELFMAG, 4);
> +    elf_header.e_ident[EI_CLASS] = ELFCLASS32;
> +    elf_header.e_ident[EI_DATA] = endian;
> +    elf_header.e_ident[EI_VERSION] = EV_CURRENT;
> +    elf_header.e_type = cpu_convert_to_target16(ET_CORE, endian);
> +    elf_header.e_machine = cpu_convert_to_target16(s->dump_info.d_machine,
> +                                                   endian);
> +    elf_header.e_version = cpu_convert_to_target32(EV_CURRENT, endian);
> +    elf_header.e_ehsize = cpu_convert_to_target16(sizeof(elf_header), endian);
> +    elf_header.e_phoff = cpu_convert_to_target32(sizeof(Elf32_Ehdr), endian);
> +    elf_header.e_phentsize = cpu_convert_to_target16(sizeof(Elf32_Phdr),
> +                                                     endian);
> +    elf_header.e_phnum = cpu_convert_to_target16(s->phdr_num, endian);
> +    if (s->have_section) {
> +        uint32_t shoff = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr) * s->sh_info;
> +
> +        elf_header.e_shoff = cpu_convert_to_target32(shoff, endian);
> +        elf_header.e_shentsize = cpu_convert_to_target16(sizeof(Elf32_Shdr),
> +                                                         endian);
> +        elf_header.e_shnum = cpu_convert_to_target16(1, endian);
> +    }
> +
> +    ret = s->f(0, &elf_header, sizeof(elf_header), s->opaque);
> +    if (ret < 0) {
> +        dump_error(s, "dump: failed to write elf header.\n");
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static int write_elf64_load(DumpState *s, MemoryMapping *memory_mapping,
> +                            int phdr_index, target_phys_addr_t offset)
> +{
> +    Elf64_Phdr phdr;
> +    off_t phdr_offset;
> +    int ret;
> +    int endian = s->dump_info.d_endian;
> +
> +    memset(&phdr, 0, sizeof(Elf64_Phdr));
> +    phdr.p_type = cpu_convert_to_target32(PT_LOAD, endian);
> +    phdr.p_offset = cpu_convert_to_target64(offset, endian);
> +    phdr.p_paddr = cpu_convert_to_target64(memory_mapping->phys_addr, endian);
> +    if (offset == -1) {
> +        phdr.p_filesz = 0;
> +    } else {
> +        phdr.p_filesz = cpu_convert_to_target64(memory_mapping->length, endian);
> +    }
> +    phdr.p_memsz = cpu_convert_to_target64(memory_mapping->length, endian);
> +    phdr.p_vaddr = cpu_convert_to_target64(memory_mapping->virt_addr, endian);
> +
> +    phdr_offset = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr)*phdr_index;
> +    ret = s->f(phdr_offset, &phdr, sizeof(Elf64_Phdr), s->opaque);
> +    if (ret < 0) {
> +        dump_error(s, "dump: failed to write program header table.\n");
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static int write_elf32_load(DumpState *s, MemoryMapping *memory_mapping,
> +                            int phdr_index, target_phys_addr_t offset)
> +{
> +    Elf32_Phdr phdr;
> +    off_t phdr_offset;
> +    int ret;
> +    int endian = s->dump_info.d_endian;
> +
> +    memset(&phdr, 0, sizeof(Elf32_Phdr));
> +    phdr.p_type = cpu_convert_to_target32(PT_LOAD, endian);
> +    phdr.p_offset = cpu_convert_to_target32(offset, endian);
> +    phdr.p_paddr = cpu_convert_to_target32(memory_mapping->phys_addr, endian);
> +    if (offset == -1) {
> +        phdr.p_filesz = 0;
> +    } else {
> +        phdr.p_filesz = cpu_convert_to_target32(memory_mapping->length, endian);
> +    }
> +    phdr.p_memsz = cpu_convert_to_target32(memory_mapping->length, endian);
> +    phdr.p_vaddr = cpu_convert_to_target32(memory_mapping->virt_addr, endian);
> +
> +    phdr_offset = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr)*phdr_index;
> +    ret = s->f(phdr_offset, &phdr, sizeof(Elf32_Phdr), s->opaque);
> +    if (ret < 0) {
> +        dump_error(s, "dump: failed to write program header table.\n");
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static int write_elf64_notes(DumpState *s, int phdr_index,
> +                             target_phys_addr_t *offset)
> +{
> +    CPUState *env;
> +    int ret;
> +    target_phys_addr_t begin = *offset;
> +    Elf64_Phdr phdr;
> +    off_t phdr_offset;
> +    int id;
> +    int endian = s->dump_info.d_endian;
> +
> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
> +        id = gdb_id(env);
> +        ret = cpu_write_elf64_note(s->f, env, id, offset, s->opaque);
> +        if (ret < 0) {
> +            dump_error(s, "dump: failed to write elf notes.\n");
> +            return -1;
> +        }
> +    }
> +
> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
> +        ret = cpu_write_elf64_qemunote(s->f, env, offset, s->opaque);
> +        if (ret < 0) {
> +            dump_error(s, "dump: failed to write CPU status.\n");
> +            return -1;
> +        }
> +    }
> +
> +    memset(&phdr, 0, sizeof(Elf64_Phdr));
> +    phdr.p_type = cpu_convert_to_target32(PT_NOTE, endian);
> +    phdr.p_offset = cpu_convert_to_target64(begin, endian);
> +    phdr.p_paddr = 0;
> +    phdr.p_filesz = cpu_convert_to_target64(*offset - begin, endian);
> +    phdr.p_memsz = cpu_convert_to_target64(*offset - begin, endian);
> +    phdr.p_vaddr = 0;
> +
> +    phdr_offset = sizeof(Elf64_Ehdr);
> +    ret = s->f(phdr_offset, &phdr, sizeof(Elf64_Phdr), s->opaque);
> +    if (ret < 0) {
> +        dump_error(s, "dump: failed to write program header table.\n");
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static int write_elf32_notes(DumpState *s, int phdr_index,
> +                             target_phys_addr_t *offset)
> +{
> +    CPUState *env;
> +    int ret;
> +    target_phys_addr_t begin = *offset;
> +    Elf32_Phdr phdr;
> +    off_t phdr_offset;
> +    int id;
> +    int endian = s->dump_info.d_endian;
> +
> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
> +        id = gdb_id(env);
> +        ret = cpu_write_elf32_note(s->f, env, id, offset, s->opaque);
> +        if (ret < 0) {
> +            dump_error(s, "dump: failed to write elf notes.\n");
> +            return -1;
> +        }
> +    }
> +
> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
> +        ret = cpu_write_elf32_qemunote(s->f, env, offset, s->opaque);
> +        if (ret < 0) {
> +            dump_error(s, "dump: failed to write CPU status.\n");
> +            return -1;
> +        }
> +    }
> +
> +    memset(&phdr, 0, sizeof(Elf32_Phdr));
> +    phdr.p_type = cpu_convert_to_target32(PT_NOTE, endian);
> +    phdr.p_offset = cpu_convert_to_target32(begin, endian);
> +    phdr.p_paddr = 0;
> +    phdr.p_filesz = cpu_convert_to_target32(*offset - begin, endian);
> +    phdr.p_memsz = cpu_convert_to_target32(*offset - begin, endian);
> +    phdr.p_vaddr = 0;
> +
> +    phdr_offset = sizeof(Elf32_Ehdr);
> +    ret = s->f(phdr_offset, &phdr, sizeof(Elf32_Phdr), s->opaque);
> +    if (ret < 0) {
> +        dump_error(s, "dump: failed to write program header table.\n");
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static int write_elf_section(DumpState *s, target_phys_addr_t *offset, int type)
> +{
> +    Elf32_Shdr shdr32;
> +    Elf64_Shdr shdr64;
> +    int endian = s->dump_info.d_endian;
> +    int shdr_size;
> +    void *shdr;
> +    int ret;
> +
> +    if (type == 0) {
> +        shdr_size = sizeof(Elf32_Shdr);
> +        memset(&shdr32, 0, shdr_size);
> +        shdr32.sh_info = cpu_convert_to_target32(s->sh_info, endian);
> +        shdr = &shdr32;
> +    } else {
> +        shdr_size = sizeof(Elf64_Shdr);
> +        memset(&shdr64, 0, shdr_size);
> +        shdr64.sh_info = cpu_convert_to_target32(s->sh_info, endian);
> +        shdr = &shdr64;
> +    }
> +
> +    ret = s->f(*offset, &shdr, shdr_size, s->opaque);
> +    if (ret < 0) {
> +        dump_error(s, "dump: failed to write section header table.\n");
> +        return -1;
> +    }
> +
> +    *offset += shdr_size;
> +    return 0;
> +}
> +
> +static int write_data(DumpState *s, void *buf, int length,
> +                      target_phys_addr_t *offset)
> +{
> +    int ret;
> +
> +    ret = s->f(*offset, buf, length, s->opaque);
> +    if (ret < 0) {
> +        dump_error(s, "dump: failed to save memory.\n");
> +        return -1;
> +    }
> +
> +    *offset += length;
> +    return 0;
> +}
> +
> +/* write the memroy to vmcore. 1 page per I/O. */
> +static int write_memory(DumpState *s, RAMBlock *block,
> +                        target_phys_addr_t *offset)
> +{
> +    int i, ret;
> +
> +    for (i = 0; i < block->length / TARGET_PAGE_SIZE; i++) {
> +        ret = write_data(s, block->host + i * TARGET_PAGE_SIZE,
> +                         TARGET_PAGE_SIZE, offset);
> +        if (ret < 0) {
> +            return -1;
> +        }
> +    }
> +
> +    if ((block->length % TARGET_PAGE_SIZE) != 0) {
> +        ret = write_data(s, block->host + i * TARGET_PAGE_SIZE,
> +                         block->length % TARGET_PAGE_SIZE, offset);
> +        if (ret < 0) {
> +            return -1;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/* get the memory's offset in the vmcore */
> +static target_phys_addr_t get_offset(target_phys_addr_t phys_addr,
> +                                     target_phys_addr_t memory_offset)
> +{
> +    RAMBlock *block;
> +    target_phys_addr_t offset = memory_offset;
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        if (phys_addr >= block->offset &&
> +            phys_addr < block->offset + block->length) {
> +            return phys_addr - block->offset + offset;
> +        }
> +        offset += block->length;
> +    }
> +
> +    return -1;
> +}
> +
> +/* write elf header, PT_NOTE and elf note to vmcore. */
> +static int dump_begin(DumpState *s)
> +{
> +    target_phys_addr_t offset;
> +    int ret;
> +
> +    s->state = DUMP_STATE_ACTIVE;
> +
> +    /*
> +     * the vmcore's format is:
> +     *   --------------
> +     *   |  elf header |
> +     *   --------------
> +     *   |  PT_NOTE    |
> +     *   --------------
> +     *   |  PT_LOAD    |
> +     *   --------------
> +     *   |  ......     |
> +     *   --------------
> +     *   |  PT_LOAD    |
> +     *   --------------
> +     *   |  sec_hdr    |
> +     *   --------------
> +     *   |  elf note   |
> +     *   --------------
> +     *   |  memory     |
> +     *   --------------
> +     *
> +     * we only know where the memory is saved after we write elf note into
> +     * vmcore.
> +     */
> +
> +    /* write elf header to vmcore */
> +    if (s->dump_info.d_class == ELFCLASS64) {
> +        ret = write_elf64_header(s);
> +    } else {
> +        ret = write_elf32_header(s);
> +    }
> +    if (ret < 0) {
> +        return -1;
> +    }
> +
> +    /* write elf section and notes to vmcore */
> +    if (s->dump_info.d_class == ELFCLASS64) {
> +        if (s->have_section) {
> +            offset = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr)*s->sh_info;
> +            if (write_elf_section(s, &offset, 1) < 0) {
> +                return -1;
> +            }
> +        } else {
> +            offset = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr)*s->phdr_num;
> +        }
> +        ret = write_elf64_notes(s, 0, &offset);
> +    } else {
> +        if (s->have_section) {
> +            offset = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr)*s->sh_info;
> +            if (write_elf_section(s, &offset, 0) < 0) {
> +                return -1;
> +            }
> +        } else {
> +            offset = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr)*s->phdr_num;
> +        }
> +        ret = write_elf32_notes(s, 0, &offset);
> +    }
> +
> +    if (ret < 0) {
> +        return -1;
> +    }
> +
> +    s->memory_offset = offset;
> +    return 0;
> +}
> +
> +/* write PT_LOAD to vmcore */
> +static int dump_completed(DumpState *s)
> +{
> +    target_phys_addr_t offset;
> +    MemoryMapping *memory_mapping;
> +    int phdr_index = 1, ret;
> +
> +    QTAILQ_FOREACH(memory_mapping, &s->list.head, next) {
> +        offset = get_offset(memory_mapping->phys_addr, s->memory_offset);
> +        if (s->dump_info.d_class == ELFCLASS64) {
> +            ret = write_elf64_load(s, memory_mapping, phdr_index++, offset);
> +        } else {
> +            ret = write_elf32_load(s, memory_mapping, phdr_index++, offset);
> +        }
> +        if (ret < 0) {
> +            return -1;
> +        }
> +    }
> +
> +    s->state = DUMP_STATE_COMPLETED;
> +    dump_cleanup(s);
> +    return 0;
> +}
> +
> +/* write all memory to vmcore */
> +static int dump_iterate(DumpState *s)
> +{
> +    RAMBlock *block;
> +    target_phys_addr_t offset = s->memory_offset;
> +    int ret;
> +
> +    /* write all memory to vmcore */
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        ret = write_memory(s, block, &offset);
> +        if (ret < 0) {
> +            return -1;
> +        }
> +    }
> +
> +    return dump_completed(s);
> +}
> +
> +static int create_vmcore(DumpState *s)
> +{
> +    int ret;
> +
> +    ret = dump_begin(s);
> +    if (ret < 0) {
> +        return -1;
> +    }
> +
> +    ret = dump_iterate(s);
> +    if (ret < 0) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static DumpState *dump_init(bool paging, Error **errp)
> +{
> +    CPUState *env;
> +    DumpState *s = dump_get_current();
> +    int ret;
> +
> +    if (runstate_is_running()) {
> +        vm_stop(RUN_STATE_PAUSED);
> +        s->resume = true;

Hmm, you actually stop the VM. Seems obvious now, but when people talked about
making this asynchronous I automatically assumed that what we didn't want was
having the global mutex held for too much time (ie. while this command was
running).

The only disadvantage of having this as a synchronous command is that libvirt
won't be able to cancel it and won't be able to run other commands in parallel.
Doesn't seem that serious to me.

Btw, RUN_STATE_PAUSED is not a good one. Doesn't matter that much, as this
is unlikely to be visible, but you should use RUN_STATE_SAVE_VM or
RUN_STATE_DEBUG.

> +    } else {
> +        s->resume = false;
> +    }
> +    s->state = DUMP_STATE_SETUP;
> +    if (s->error) {
> +        g_free(s->error);
> +        s->error = NULL;
> +    }
> +
> +    /*
> +     * get dump info: endian, class and architecture.
> +     * If the target architecture is not supported, cpu_get_dump_info() will
> +     * return -1.
> +     *
> +     * if we use kvm, we should synchronize the register before we get dump
> +     * info.
> +     */
> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
> +        cpu_synchronize_state(env);
> +    }
> +
> +    ret = cpu_get_dump_info(&s->dump_info);
> +    if (ret < 0) {
> +        error_set(errp, QERR_UNSUPPORTED);

This will let the VM paused.

> +        return NULL;
> +    }
> +
> +    /* get memory mapping */
> +    memory_mapping_list_init(&s->list);
> +    if (paging) {
> +        qemu_get_guest_memory_mapping(&s->list);
> +    } else {
> +        qemu_get_guest_simple_memory_mapping(&s->list);
> +    }
> +
> +    /*
> +     * calculate phdr_num
> +     *
> +     * the type of phdr->num is uint16_t, so we should avoid overflow
> +     */
> +    s->phdr_num = 1; /* PT_NOTE */
> +    if (s->list.num < (1 << 16) - 2) {
> +        s->phdr_num += s->list.num;
> +        s->have_section = false;
> +    } else {
> +        s->have_section = true;
> +        s->phdr_num = PN_XNUM;
> +
> +        /* the type of shdr->sh_info is uint32_t, so we should avoid overflow */
> +        if (s->list.num > (1ULL << 32) - 2) {
> +            s->sh_info = 0xffffffff;
> +        } else {
> +            s->sh_info += s->list.num;
> +        }
> +    }
> +
> +    return s;
> +}
> +
> +static int fd_write_vmcore(target_phys_addr_t offset, void *buf, size_t size,
> +                           void *opaque)
> +{
> +    int fd = (int)(intptr_t)opaque;
> +    int ret;
> +
> +    ret = lseek(fd, offset, SEEK_SET);
> +    if (ret < 0) {
> +        return -1;
> +    }
> +
> +    ret = write(fd, buf, size);
> +    if (ret != size) {
> +        return -1;
> +    }

I think you should use send_all() instead of plain write().

> +
> +    return 0;
> +}
> +
> +static void fd_cleanup(void *opaque)
> +{
> +    int fd = (int)(intptr_t)opaque;
> +
> +    if (fd != -1) {
> +        close(fd);
> +    }
> +}
> +
> +static DumpState *dump_init_fd(int fd, bool paging, Error **errp)
> +{
> +    DumpState *s = dump_init(paging, errp);
> +
> +    if (s == NULL) {
> +        return NULL;
> +    }
> +
> +    s->f = fd_write_vmcore;
> +    s->cleanup = fd_cleanup;
> +    s->opaque = (void *)(intptr_t)fd;

Do we really need all these indirections?

> +
> +    return s;
> +}
> +
> +void qmp_dump(bool paging, const char *file, Error **errp)
> +{
> +    const char *p;
> +    int fd = -1;
> +    DumpState *s;
> +
> +#if !defined(WIN32)
> +    if (strstart(file, "fd:", &p)) {
> +        fd = qemu_get_fd(p);

qemu_get_fd() won't be merged, you should use monitor_get_fd(cur_mon, p);

> +        if (fd == -1) {
> +            error_set(errp, QERR_FD_NOT_FOUND, p);
> +            return;
> +        }
> +    }
> +#endif
> +
> +    if  (strstart(file, "file:", &p)) {
> +        fd = open(p, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR);

This is minor, but I'd use qemu_open() here.

> +        if (fd < 0) {
> +            error_set(errp, QERR_OPEN_FILE_FAILED, p);
> +            return;
> +        }
> +    }
> +
> +    if (fd == -1) {
> +        error_set(errp, QERR_INVALID_PARAMETER, "file");
> +        return;
> +    }
> +
> +    s = dump_init_fd(fd, paging, errp);
> +    if (!s) {
> +        return;
> +    }
> +
> +    if (create_vmcore(s) < 0) {
> +        error_set(errp, QERR_IO_ERROR);
> +    }
> +}
> diff --git a/elf.h b/elf.h
> index 2e05d34..6a10657 100644
> --- a/elf.h
> +++ b/elf.h
> @@ -1000,6 +1000,11 @@ typedef struct elf64_sym {
>  
>  #define EI_NIDENT	16
>  
> +/* Special value for e_phnum.  This indicates that the real number of
> +   program headers is too large to fit into e_phnum.  Instead the real
> +   value is in the field sh_info of section 0.  */
> +#define PN_XNUM         0xffff
> +
>  typedef struct elf32_hdr{
>    unsigned char	e_ident[EI_NIDENT];
>    Elf32_Half	e_type;
> diff --git a/hmp-commands.hx b/hmp-commands.hx
> index 6980214..d4cf2e5 100644
> --- a/hmp-commands.hx
> +++ b/hmp-commands.hx
> @@ -880,6 +880,27 @@ server will ask the spice/vnc client to automatically reconnect using the
>  new parameters (if specified) once the vm migration finished successfully.
>  ETEXI
>  
> +#if defined(CONFIG_HAVE_CORE_DUMP)
> +    {
> +        .name       = "dump",
> +        .args_type  = "paging:-p,file:s",
> +        .params     = "[-p] file",
> +        .help       = "dump to file",
> +        .user_print = monitor_user_noop,
> +        .mhandler.cmd = hmp_dump,
> +    },
> +
> +
> +STEXI
> +@item dump [-p] @var{file}
> +@findex dump
> +Dump to @var{file}. The file can be processed with crash or gdb.
> +    file: destination file(started with "file:") or destination file descriptor
> +          (started with "fd:")
> +  paging: do paging to get guest's memory mapping
> +ETEXI
> +#endif
> +
>      {
>          .name       = "snapshot_blkdev",
>          .args_type  = "reuse:-n,device:B,snapshot-file:s?,format:s?",
> diff --git a/hmp.c b/hmp.c
> index 290c43d..e13b793 100644
> --- a/hmp.c
> +++ b/hmp.c
> @@ -860,3 +860,13 @@ void hmp_block_job_cancel(Monitor *mon, const QDict *qdict)
>  
>      hmp_handle_error(mon, &error);
>  }
> +
> +void hmp_dump(Monitor *mon, const QDict *qdict)
> +{
> +    Error *errp = NULL;
> +    int paging = qdict_get_try_bool(qdict, "paging", 0);
> +    const char *file = qdict_get_str(qdict, "file");
> +
> +    qmp_dump(!!paging, file, &errp);

Why the double negation on 'paging'?

> +    hmp_handle_error(mon, &errp);
> +}
> diff --git a/hmp.h b/hmp.h
> index 5409464..b055e50 100644
> --- a/hmp.h
> +++ b/hmp.h
> @@ -59,5 +59,6 @@ void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict);
>  void hmp_block_stream(Monitor *mon, const QDict *qdict);
>  void hmp_block_job_set_speed(Monitor *mon, const QDict *qdict);
>  void hmp_block_job_cancel(Monitor *mon, const QDict *qdict);
> +void hmp_dump(Monitor *mon, const QDict *qdict);
>  
>  #endif
> diff --git a/qapi-schema.json b/qapi-schema.json
> index 04fa84f..81b8c7c 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -1663,3 +1663,17 @@
>  { 'command': 'qom-list-types',
>    'data': { '*implements': 'str', '*abstract': 'bool' },
>    'returns': [ 'ObjectTypeInfo' ] }
> +
> +##
> +# @dump

'dump' is too generic, please call this dump-guest-memory-vmcore or something
more descriptive.

> +#
> +# Dump guest's memory to vmcore.
> +#
> +# @paging: if true, do paging to get guest's memory mapping
> +# @file: the filename or file descriptor of the vmcore.

'file' is not a good name because it can also dump to an fd, maybe 'protocol'?

> +#
> +# Returns: nothing on success
> +#
> +# Since: 1.1
> +##
> +{ 'command': 'dump', 'data': { 'paging': 'bool', 'file': 'str' } }
> diff --git a/qmp-commands.hx b/qmp-commands.hx
> index dfe8a5b..9e39bd9 100644
> --- a/qmp-commands.hx
> +++ b/qmp-commands.hx
> @@ -586,6 +586,40 @@ Example:
>  
>  EQMP
>  
> +#if defined(CONFIG_HAVE_CORE_DUMP)
> +    {
> +        .name       = "dump",
> +        .args_type  = "paging:-p,file:s",
> +        .params     = "[-p] file",
> +        .help       = "dump to file",
> +        .user_print = monitor_user_noop,
> +        .mhandler.cmd_new = qmp_marshal_input_dump,
> +    },
> +
> +SQMP
> +dump
> +
> +
> +Dump to file. The file can be processed with crash or gdb.
> +
> +Arguments:
> +
> +- "paging": do paging to get guest's memory mapping (json-bool)
> +- "file": destination file(started with "file:") or destination file descriptor
> +          (started with "fd:") (json-string)
> +
> +Example:
> +
> +-> { "execute": "dump", "arguments": { "file": "fd:dump" } }
> +<- { "return": {} }
> +
> +Notes:
> +
> +(1) All boolean arguments default to false
> +
> +EQMP
> +#endif
> +
>      {
>          .name       = "netdev_add",
>          .args_type  = "netdev:O",
Wen Congyang March 15, 2012, 2:29 a.m. UTC | #2
At 03/15/2012 01:18 AM, Luiz Capitulino Wrote:
> On Wed, 14 Mar 2012 10:11:35 +0800
> Wen Congyang <wency@cn.fujitsu.com> wrote:
> 

<cut>

> 
> You just dropped a few asynchronous bits and resent this as a synchronous
> command, letting all the asynchronous infrastructure in. This is bad, as the
> command is more complex then it should be and doesn't make full use of the
> added infrastructure.
> 
> For example, does the synchronous version really uses DumpState? If it doesn't,
> let's just drop it and everything else which is not necessary.
> 
> *However*, note that while it's fine with me to have this as a synchronous
> command we need a few more ACKs (from libvirt and Anthony and/or Jan). So, I
> wouldn't go too far on making changes before we get those ACKs.
> 


Hi, Anthony, Luiz, Eric, Jan

At 03/15/2012 01:49 AM, Anthony Liguori Wrote:
> 
> Can we start by posting a non-RFC because we start discussing committing
> this.

At 03/15/2012 01:37 AM, Eric Blake Wrote:
> are sufficient that I'm okay with a synchronous-only version for qemu
> 1.1.

So I think Anthony and Eric may ACK to it.

Jan reviewed the early version, and give many comments. So I think he also ACKs to it.

Is it OK to post non-RFC version?

Thanks
Wen Congyang
Luiz Capitulino March 15, 2012, 2:25 p.m. UTC | #3
On Wed, 14 Mar 2012 14:18:47 -0300
Luiz Capitulino <lcapitulino@redhat.com> wrote:

> > +    ret = write(fd, buf, size);
> > +    if (ret != size) {
> > +        return -1;
> > +    }  
> 
> I think you should use send_all() instead of plain write().

Missed the fact that we actually have qemu_write_full().
Hatayama, Daisuke March 16, 2012, 3:23 a.m. UTC | #4
From: Wen Congyang <wency@cn.fujitsu.com>
Subject: [RFC][PATCH 11/14 v9] introduce a new monitor command 'dump' to dump guest's memory
Date: Wed, 14 Mar 2012 10:11:35 +0800

> +/*
> + * QEMU dump
> + *
> + * Copyright Fujitsu, Corp. 2011
> + *

Now 2012.

> +    /*
> +     * calculate phdr_num
> +     *
> +     * the type of phdr->num is uint16_t, so we should avoid overflow

e_phnum is correct.

> +     */
> +    s->phdr_num = 1; /* PT_NOTE */
> +    if (s->list.num < (1 << 16) - 2) {

s->list.num < UINT16_MAX is better.

> +        s->phdr_num += s->list.num;
> +        s->have_section = false;
> +    } else {
> +        s->have_section = true;
> +        s->phdr_num = PN_XNUM;
> +
> +        /* the type of shdr->sh_info is uint32_t, so we should avoid overflow */
> +        if (s->list.num > (1ULL << 32) - 2) {

s->list.num < UINT32_MAX is better.

> +            s->sh_info = 0xffffffff;

UINT32_MAX is better. Is it rough around here?

> +        } else {
> +            s->sh_info += s->list.num;
> +        }
> +    }

Now orders of processings in positive and negative cases for e_phnum
and sh_info are different. It's better to make them sorted in the same
order.

  if (phdr_num not overflow?) {
    not overflow case;
  } else {
    overflow case;
    if (sh_info not overflow?) {
      not overflow case;
    } else {
      overflow case;
    }
  }

is better.

Thanks.
HATAYAMA, Daisuke
Wen Congyang March 16, 2012, 6:41 a.m. UTC | #5
At 03/16/2012 11:23 AM, HATAYAMA Daisuke Wrote:
> From: Wen Congyang <wency@cn.fujitsu.com>
> Subject: [RFC][PATCH 11/14 v9] introduce a new monitor command 'dump' to dump guest's memory
> Date: Wed, 14 Mar 2012 10:11:35 +0800
> 
>> +/*
>> + * QEMU dump
>> + *
>> + * Copyright Fujitsu, Corp. 2011
>> + *
> 
> Now 2012.

On, I forgot to update it.

> 
>> +    /*
>> +     * calculate phdr_num
>> +     *
>> +     * the type of phdr->num is uint16_t, so we should avoid overflow
> 
> e_phnum is correct.

Yes
> 
>> +     */
>> +    s->phdr_num = 1; /* PT_NOTE */
>> +    if (s->list.num < (1 << 16) - 2) {
> 
> s->list.num < UINT16_MAX is better.
> 
>> +        s->phdr_num += s->list.num;
>> +        s->have_section = false;
>> +    } else {
>> +        s->have_section = true;
>> +        s->phdr_num = PN_XNUM;
>> +
>> +        /* the type of shdr->sh_info is uint32_t, so we should avoid overflow */
>> +        if (s->list.num > (1ULL << 32) - 2) {
> 
> s->list.num < UINT32_MAX is better.
> 
>> +            s->sh_info = 0xffffffff;
> 
> UINT32_MAX is better. Is it rough around here?
> 
>> +        } else {
>> +            s->sh_info += s->list.num;
>> +        }
>> +    }
> 
> Now orders of processings in positive and negative cases for e_phnum
> and sh_info are different. It's better to make them sorted in the same
> order.
> 
>   if (phdr_num not overflow?) {
>     not overflow case;
>   } else {
>     overflow case;
>     if (sh_info not overflow?) {
>       not overflow case;
>     } else {
>       overflow case;
>     }
>   }
> 
> is better.

OK

Thanks
Wen Congyang
> 
> Thanks.
> HATAYAMA, Daisuke
> 
>
Wen Congyang March 16, 2012, 10:13 a.m. UTC | #6
At 03/15/2012 01:18 AM, Luiz Capitulino Wrote:
> On Wed, 14 Mar 2012 10:11:35 +0800
> Wen Congyang <wency@cn.fujitsu.com> wrote:
> 
>> The command's usage:
>>    dump [-p] file
>> file should be start with "file:"(the file's path) or "fd:"(the fd's name).
>>
>> Note:
>>   1. If you want to use gdb to analyse the core, please specify -p option.
>>   2. This command doesn't support the fd that is is associated with a pipe,
>>      socket, or FIFO(lseek will fail with such fd).
>>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>> ---
>>  Makefile.target  |    2 +-
>>  dump.c           |  714 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  elf.h            |    5 +
>>  hmp-commands.hx  |   21 ++
>>  hmp.c            |   10 +
>>  hmp.h            |    1 +
>>  qapi-schema.json |   14 +
>>  qmp-commands.hx  |   34 +++
>>  8 files changed, 800 insertions(+), 1 deletions(-)
>>  create mode 100644 dump.c
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index c81c4fa..287fbe7 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -213,7 +213,7 @@ obj-$(CONFIG_NO_KVM) += kvm-stub.o
>>  obj-$(CONFIG_VGA) += vga.o
>>  obj-y += memory.o savevm.o
>>  obj-y += memory_mapping.o
>> -obj-$(CONFIG_HAVE_CORE_DUMP) += arch_dump.o
>> +obj-$(CONFIG_HAVE_CORE_DUMP) += arch_dump.o dump.o
>>  LIBS+=-lz
>>  
>>  obj-i386-$(CONFIG_KVM) += hyperv.o
>> diff --git a/dump.c b/dump.c
>> new file mode 100644
>> index 0000000..42e1681
>> --- /dev/null
>> +++ b/dump.c
>> @@ -0,0 +1,714 @@
>> +/*
>> + * QEMU dump
>> + *
>> + * Copyright Fujitsu, Corp. 2011
>> + *
>> + * Authors:
>> + *     Wen Congyang <wency@cn.fujitsu.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu-common.h"
>> +#include <unistd.h>
>> +#include "elf.h"
>> +#include <sys/procfs.h>
>> +#include <glib.h>
>> +#include "cpu.h"
>> +#include "cpu-all.h"
>> +#include "targphys.h"
>> +#include "monitor.h"
>> +#include "kvm.h"
>> +#include "dump.h"
>> +#include "sysemu.h"
>> +#include "bswap.h"
>> +#include "memory_mapping.h"
>> +#include "error.h"
>> +#include "qmp-commands.h"
>> +#include "gdbstub.h"
>> +
>> +static inline uint16_t cpu_convert_to_target16(uint16_t val, int endian)
>> +{
>> +    if (endian == ELFDATA2LSB) {
>> +        val = cpu_to_le16(val);
>> +    } else {
>> +        val = cpu_to_be16(val);
>> +    }
>> +
>> +    return val;
>> +}
>> +
>> +static inline uint32_t cpu_convert_to_target32(uint32_t val, int endian)
>> +{
>> +    if (endian == ELFDATA2LSB) {
>> +        val = cpu_to_le32(val);
>> +    } else {
>> +        val = cpu_to_be32(val);
>> +    }
>> +
>> +    return val;
>> +}
>> +
>> +static inline uint64_t cpu_convert_to_target64(uint64_t val, int endian)
>> +{
>> +    if (endian == ELFDATA2LSB) {
>> +        val = cpu_to_le64(val);
>> +    } else {
>> +        val = cpu_to_be64(val);
>> +    }
>> +
>> +    return val;
>> +}
>> +
>> +enum {
>> +    DUMP_STATE_ERROR,
>> +    DUMP_STATE_SETUP,
>> +    DUMP_STATE_CANCELLED,
>> +    DUMP_STATE_ACTIVE,
>> +    DUMP_STATE_COMPLETED,
>> +};
>> +
>> +typedef struct DumpState {
>> +    ArchDumpInfo dump_info;
>> +    MemoryMappingList list;
>> +    uint16_t phdr_num;
>> +    uint32_t sh_info;
>> +    bool have_section;
>> +    int state;
>> +    bool resume;
>> +    char *error;
>> +    target_phys_addr_t memory_offset;
>> +    write_core_dump_function f;
>> +    void (*cleanup)(void *opaque);
>> +    void *opaque;
>> +} DumpState;
>> +
>> +static DumpState *dump_get_current(void)
>> +{
>> +    static DumpState current_dump = {
>> +        .state = DUMP_STATE_SETUP,
>> +    };
>> +
>> +    return &current_dump;
>> +}
> 
> You just dropped a few asynchronous bits and resent this as a synchronous
> command, letting all the asynchronous infrastructure in. This is bad, as the
> command is more complex then it should be and doesn't make full use of the
> added infrastructure.
> 
> For example, does the synchronous version really uses DumpState? If it doesn't,
> let's just drop it and everything else which is not necessary.

I use this struct to avoid too many parameters...

I will try to make it simple and clean.

Thanks
Wen Congyang

> 
> *However*, note that while it's fine with me to have this as a synchronous
> command we need a few more ACKs (from libvirt and Anthony and/or Jan). So, I
> wouldn't go too far on making changes before we get those ACKs.
> 
>> +
>> +static int dump_cleanup(DumpState *s)
>> +{
>> +    int ret = 0;
>> +
>> +    memory_mapping_list_free(&s->list);
>> +    s->cleanup(s->opaque);
>> +    if (s->resume) {
>> +        vm_start();
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static void dump_error(DumpState *s, const char *reason)
>> +{
>> +    s->state = DUMP_STATE_ERROR;
>> +    s->error = g_strdup(reason);
>> +    dump_cleanup(s);
>> +}
>> +
>> +static int write_elf64_header(DumpState *s)
>> +{
>> +    Elf64_Ehdr elf_header;
>> +    int ret;
>> +    int endian = s->dump_info.d_endian;
>> +
>> +    memset(&elf_header, 0, sizeof(Elf64_Ehdr));
>> +    memcpy(&elf_header, ELFMAG, 4);
>> +    elf_header.e_ident[EI_CLASS] = ELFCLASS64;
>> +    elf_header.e_ident[EI_DATA] = s->dump_info.d_endian;
>> +    elf_header.e_ident[EI_VERSION] = EV_CURRENT;
>> +    elf_header.e_type = cpu_convert_to_target16(ET_CORE, endian);
>> +    elf_header.e_machine = cpu_convert_to_target16(s->dump_info.d_machine,
>> +                                                   endian);
>> +    elf_header.e_version = cpu_convert_to_target32(EV_CURRENT, endian);
>> +    elf_header.e_ehsize = cpu_convert_to_target16(sizeof(elf_header), endian);
>> +    elf_header.e_phoff = cpu_convert_to_target64(sizeof(Elf64_Ehdr), endian);
>> +    elf_header.e_phentsize = cpu_convert_to_target16(sizeof(Elf64_Phdr),
>> +                                                     endian);
>> +    elf_header.e_phnum = cpu_convert_to_target16(s->phdr_num, endian);
>> +    if (s->have_section) {
>> +        uint64_t shoff = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr) * s->sh_info;
>> +
>> +        elf_header.e_shoff = cpu_convert_to_target64(shoff, endian);
>> +        elf_header.e_shentsize = cpu_convert_to_target16(sizeof(Elf64_Shdr),
>> +                                                         endian);
>> +        elf_header.e_shnum = cpu_convert_to_target16(1, endian);
>> +    }
>> +
>> +    ret = s->f(0, &elf_header, sizeof(elf_header), s->opaque);
>> +    if (ret < 0) {
>> +        dump_error(s, "dump: failed to write elf header.\n");
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int write_elf32_header(DumpState *s)
>> +{
>> +    Elf32_Ehdr elf_header;
>> +    int ret;
>> +    int endian = s->dump_info.d_endian;
>> +
>> +    memset(&elf_header, 0, sizeof(Elf32_Ehdr));
>> +    memcpy(&elf_header, ELFMAG, 4);
>> +    elf_header.e_ident[EI_CLASS] = ELFCLASS32;
>> +    elf_header.e_ident[EI_DATA] = endian;
>> +    elf_header.e_ident[EI_VERSION] = EV_CURRENT;
>> +    elf_header.e_type = cpu_convert_to_target16(ET_CORE, endian);
>> +    elf_header.e_machine = cpu_convert_to_target16(s->dump_info.d_machine,
>> +                                                   endian);
>> +    elf_header.e_version = cpu_convert_to_target32(EV_CURRENT, endian);
>> +    elf_header.e_ehsize = cpu_convert_to_target16(sizeof(elf_header), endian);
>> +    elf_header.e_phoff = cpu_convert_to_target32(sizeof(Elf32_Ehdr), endian);
>> +    elf_header.e_phentsize = cpu_convert_to_target16(sizeof(Elf32_Phdr),
>> +                                                     endian);
>> +    elf_header.e_phnum = cpu_convert_to_target16(s->phdr_num, endian);
>> +    if (s->have_section) {
>> +        uint32_t shoff = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr) * s->sh_info;
>> +
>> +        elf_header.e_shoff = cpu_convert_to_target32(shoff, endian);
>> +        elf_header.e_shentsize = cpu_convert_to_target16(sizeof(Elf32_Shdr),
>> +                                                         endian);
>> +        elf_header.e_shnum = cpu_convert_to_target16(1, endian);
>> +    }
>> +
>> +    ret = s->f(0, &elf_header, sizeof(elf_header), s->opaque);
>> +    if (ret < 0) {
>> +        dump_error(s, "dump: failed to write elf header.\n");
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int write_elf64_load(DumpState *s, MemoryMapping *memory_mapping,
>> +                            int phdr_index, target_phys_addr_t offset)
>> +{
>> +    Elf64_Phdr phdr;
>> +    off_t phdr_offset;
>> +    int ret;
>> +    int endian = s->dump_info.d_endian;
>> +
>> +    memset(&phdr, 0, sizeof(Elf64_Phdr));
>> +    phdr.p_type = cpu_convert_to_target32(PT_LOAD, endian);
>> +    phdr.p_offset = cpu_convert_to_target64(offset, endian);
>> +    phdr.p_paddr = cpu_convert_to_target64(memory_mapping->phys_addr, endian);
>> +    if (offset == -1) {
>> +        phdr.p_filesz = 0;
>> +    } else {
>> +        phdr.p_filesz = cpu_convert_to_target64(memory_mapping->length, endian);
>> +    }
>> +    phdr.p_memsz = cpu_convert_to_target64(memory_mapping->length, endian);
>> +    phdr.p_vaddr = cpu_convert_to_target64(memory_mapping->virt_addr, endian);
>> +
>> +    phdr_offset = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr)*phdr_index;
>> +    ret = s->f(phdr_offset, &phdr, sizeof(Elf64_Phdr), s->opaque);
>> +    if (ret < 0) {
>> +        dump_error(s, "dump: failed to write program header table.\n");
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int write_elf32_load(DumpState *s, MemoryMapping *memory_mapping,
>> +                            int phdr_index, target_phys_addr_t offset)
>> +{
>> +    Elf32_Phdr phdr;
>> +    off_t phdr_offset;
>> +    int ret;
>> +    int endian = s->dump_info.d_endian;
>> +
>> +    memset(&phdr, 0, sizeof(Elf32_Phdr));
>> +    phdr.p_type = cpu_convert_to_target32(PT_LOAD, endian);
>> +    phdr.p_offset = cpu_convert_to_target32(offset, endian);
>> +    phdr.p_paddr = cpu_convert_to_target32(memory_mapping->phys_addr, endian);
>> +    if (offset == -1) {
>> +        phdr.p_filesz = 0;
>> +    } else {
>> +        phdr.p_filesz = cpu_convert_to_target32(memory_mapping->length, endian);
>> +    }
>> +    phdr.p_memsz = cpu_convert_to_target32(memory_mapping->length, endian);
>> +    phdr.p_vaddr = cpu_convert_to_target32(memory_mapping->virt_addr, endian);
>> +
>> +    phdr_offset = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr)*phdr_index;
>> +    ret = s->f(phdr_offset, &phdr, sizeof(Elf32_Phdr), s->opaque);
>> +    if (ret < 0) {
>> +        dump_error(s, "dump: failed to write program header table.\n");
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int write_elf64_notes(DumpState *s, int phdr_index,
>> +                             target_phys_addr_t *offset)
>> +{
>> +    CPUState *env;
>> +    int ret;
>> +    target_phys_addr_t begin = *offset;
>> +    Elf64_Phdr phdr;
>> +    off_t phdr_offset;
>> +    int id;
>> +    int endian = s->dump_info.d_endian;
>> +
>> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
>> +        id = gdb_id(env);
>> +        ret = cpu_write_elf64_note(s->f, env, id, offset, s->opaque);
>> +        if (ret < 0) {
>> +            dump_error(s, "dump: failed to write elf notes.\n");
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
>> +        ret = cpu_write_elf64_qemunote(s->f, env, offset, s->opaque);
>> +        if (ret < 0) {
>> +            dump_error(s, "dump: failed to write CPU status.\n");
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    memset(&phdr, 0, sizeof(Elf64_Phdr));
>> +    phdr.p_type = cpu_convert_to_target32(PT_NOTE, endian);
>> +    phdr.p_offset = cpu_convert_to_target64(begin, endian);
>> +    phdr.p_paddr = 0;
>> +    phdr.p_filesz = cpu_convert_to_target64(*offset - begin, endian);
>> +    phdr.p_memsz = cpu_convert_to_target64(*offset - begin, endian);
>> +    phdr.p_vaddr = 0;
>> +
>> +    phdr_offset = sizeof(Elf64_Ehdr);
>> +    ret = s->f(phdr_offset, &phdr, sizeof(Elf64_Phdr), s->opaque);
>> +    if (ret < 0) {
>> +        dump_error(s, "dump: failed to write program header table.\n");
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int write_elf32_notes(DumpState *s, int phdr_index,
>> +                             target_phys_addr_t *offset)
>> +{
>> +    CPUState *env;
>> +    int ret;
>> +    target_phys_addr_t begin = *offset;
>> +    Elf32_Phdr phdr;
>> +    off_t phdr_offset;
>> +    int id;
>> +    int endian = s->dump_info.d_endian;
>> +
>> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
>> +        id = gdb_id(env);
>> +        ret = cpu_write_elf32_note(s->f, env, id, offset, s->opaque);
>> +        if (ret < 0) {
>> +            dump_error(s, "dump: failed to write elf notes.\n");
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
>> +        ret = cpu_write_elf32_qemunote(s->f, env, offset, s->opaque);
>> +        if (ret < 0) {
>> +            dump_error(s, "dump: failed to write CPU status.\n");
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    memset(&phdr, 0, sizeof(Elf32_Phdr));
>> +    phdr.p_type = cpu_convert_to_target32(PT_NOTE, endian);
>> +    phdr.p_offset = cpu_convert_to_target32(begin, endian);
>> +    phdr.p_paddr = 0;
>> +    phdr.p_filesz = cpu_convert_to_target32(*offset - begin, endian);
>> +    phdr.p_memsz = cpu_convert_to_target32(*offset - begin, endian);
>> +    phdr.p_vaddr = 0;
>> +
>> +    phdr_offset = sizeof(Elf32_Ehdr);
>> +    ret = s->f(phdr_offset, &phdr, sizeof(Elf32_Phdr), s->opaque);
>> +    if (ret < 0) {
>> +        dump_error(s, "dump: failed to write program header table.\n");
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int write_elf_section(DumpState *s, target_phys_addr_t *offset, int type)
>> +{
>> +    Elf32_Shdr shdr32;
>> +    Elf64_Shdr shdr64;
>> +    int endian = s->dump_info.d_endian;
>> +    int shdr_size;
>> +    void *shdr;
>> +    int ret;
>> +
>> +    if (type == 0) {
>> +        shdr_size = sizeof(Elf32_Shdr);
>> +        memset(&shdr32, 0, shdr_size);
>> +        shdr32.sh_info = cpu_convert_to_target32(s->sh_info, endian);
>> +        shdr = &shdr32;
>> +    } else {
>> +        shdr_size = sizeof(Elf64_Shdr);
>> +        memset(&shdr64, 0, shdr_size);
>> +        shdr64.sh_info = cpu_convert_to_target32(s->sh_info, endian);
>> +        shdr = &shdr64;
>> +    }
>> +
>> +    ret = s->f(*offset, &shdr, shdr_size, s->opaque);
>> +    if (ret < 0) {
>> +        dump_error(s, "dump: failed to write section header table.\n");
>> +        return -1;
>> +    }
>> +
>> +    *offset += shdr_size;
>> +    return 0;
>> +}
>> +
>> +static int write_data(DumpState *s, void *buf, int length,
>> +                      target_phys_addr_t *offset)
>> +{
>> +    int ret;
>> +
>> +    ret = s->f(*offset, buf, length, s->opaque);
>> +    if (ret < 0) {
>> +        dump_error(s, "dump: failed to save memory.\n");
>> +        return -1;
>> +    }
>> +
>> +    *offset += length;
>> +    return 0;
>> +}
>> +
>> +/* write the memroy to vmcore. 1 page per I/O. */
>> +static int write_memory(DumpState *s, RAMBlock *block,
>> +                        target_phys_addr_t *offset)
>> +{
>> +    int i, ret;
>> +
>> +    for (i = 0; i < block->length / TARGET_PAGE_SIZE; i++) {
>> +        ret = write_data(s, block->host + i * TARGET_PAGE_SIZE,
>> +                         TARGET_PAGE_SIZE, offset);
>> +        if (ret < 0) {
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    if ((block->length % TARGET_PAGE_SIZE) != 0) {
>> +        ret = write_data(s, block->host + i * TARGET_PAGE_SIZE,
>> +                         block->length % TARGET_PAGE_SIZE, offset);
>> +        if (ret < 0) {
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/* get the memory's offset in the vmcore */
>> +static target_phys_addr_t get_offset(target_phys_addr_t phys_addr,
>> +                                     target_phys_addr_t memory_offset)
>> +{
>> +    RAMBlock *block;
>> +    target_phys_addr_t offset = memory_offset;
>> +
>> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
>> +        if (phys_addr >= block->offset &&
>> +            phys_addr < block->offset + block->length) {
>> +            return phys_addr - block->offset + offset;
>> +        }
>> +        offset += block->length;
>> +    }
>> +
>> +    return -1;
>> +}
>> +
>> +/* write elf header, PT_NOTE and elf note to vmcore. */
>> +static int dump_begin(DumpState *s)
>> +{
>> +    target_phys_addr_t offset;
>> +    int ret;
>> +
>> +    s->state = DUMP_STATE_ACTIVE;
>> +
>> +    /*
>> +     * the vmcore's format is:
>> +     *   --------------
>> +     *   |  elf header |
>> +     *   --------------
>> +     *   |  PT_NOTE    |
>> +     *   --------------
>> +     *   |  PT_LOAD    |
>> +     *   --------------
>> +     *   |  ......     |
>> +     *   --------------
>> +     *   |  PT_LOAD    |
>> +     *   --------------
>> +     *   |  sec_hdr    |
>> +     *   --------------
>> +     *   |  elf note   |
>> +     *   --------------
>> +     *   |  memory     |
>> +     *   --------------
>> +     *
>> +     * we only know where the memory is saved after we write elf note into
>> +     * vmcore.
>> +     */
>> +
>> +    /* write elf header to vmcore */
>> +    if (s->dump_info.d_class == ELFCLASS64) {
>> +        ret = write_elf64_header(s);
>> +    } else {
>> +        ret = write_elf32_header(s);
>> +    }
>> +    if (ret < 0) {
>> +        return -1;
>> +    }
>> +
>> +    /* write elf section and notes to vmcore */
>> +    if (s->dump_info.d_class == ELFCLASS64) {
>> +        if (s->have_section) {
>> +            offset = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr)*s->sh_info;
>> +            if (write_elf_section(s, &offset, 1) < 0) {
>> +                return -1;
>> +            }
>> +        } else {
>> +            offset = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr)*s->phdr_num;
>> +        }
>> +        ret = write_elf64_notes(s, 0, &offset);
>> +    } else {
>> +        if (s->have_section) {
>> +            offset = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr)*s->sh_info;
>> +            if (write_elf_section(s, &offset, 0) < 0) {
>> +                return -1;
>> +            }
>> +        } else {
>> +            offset = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr)*s->phdr_num;
>> +        }
>> +        ret = write_elf32_notes(s, 0, &offset);
>> +    }
>> +
>> +    if (ret < 0) {
>> +        return -1;
>> +    }
>> +
>> +    s->memory_offset = offset;
>> +    return 0;
>> +}
>> +
>> +/* write PT_LOAD to vmcore */
>> +static int dump_completed(DumpState *s)
>> +{
>> +    target_phys_addr_t offset;
>> +    MemoryMapping *memory_mapping;
>> +    int phdr_index = 1, ret;
>> +
>> +    QTAILQ_FOREACH(memory_mapping, &s->list.head, next) {
>> +        offset = get_offset(memory_mapping->phys_addr, s->memory_offset);
>> +        if (s->dump_info.d_class == ELFCLASS64) {
>> +            ret = write_elf64_load(s, memory_mapping, phdr_index++, offset);
>> +        } else {
>> +            ret = write_elf32_load(s, memory_mapping, phdr_index++, offset);
>> +        }
>> +        if (ret < 0) {
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    s->state = DUMP_STATE_COMPLETED;
>> +    dump_cleanup(s);
>> +    return 0;
>> +}
>> +
>> +/* write all memory to vmcore */
>> +static int dump_iterate(DumpState *s)
>> +{
>> +    RAMBlock *block;
>> +    target_phys_addr_t offset = s->memory_offset;
>> +    int ret;
>> +
>> +    /* write all memory to vmcore */
>> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
>> +        ret = write_memory(s, block, &offset);
>> +        if (ret < 0) {
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    return dump_completed(s);
>> +}
>> +
>> +static int create_vmcore(DumpState *s)
>> +{
>> +    int ret;
>> +
>> +    ret = dump_begin(s);
>> +    if (ret < 0) {
>> +        return -1;
>> +    }
>> +
>> +    ret = dump_iterate(s);
>> +    if (ret < 0) {
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static DumpState *dump_init(bool paging, Error **errp)
>> +{
>> +    CPUState *env;
>> +    DumpState *s = dump_get_current();
>> +    int ret;
>> +
>> +    if (runstate_is_running()) {
>> +        vm_stop(RUN_STATE_PAUSED);
>> +        s->resume = true;
> 
> Hmm, you actually stop the VM. Seems obvious now, but when people talked about
> making this asynchronous I automatically assumed that what we didn't want was
> having the global mutex held for too much time (ie. while this command was
> running).
> 
> The only disadvantage of having this as a synchronous command is that libvirt
> won't be able to cancel it and won't be able to run other commands in parallel.
> Doesn't seem that serious to me.
> 
> Btw, RUN_STATE_PAUSED is not a good one. Doesn't matter that much, as this
> is unlikely to be visible, but you should use RUN_STATE_SAVE_VM or
> RUN_STATE_DEBUG.
> 
>> +    } else {
>> +        s->resume = false;
>> +    }
>> +    s->state = DUMP_STATE_SETUP;
>> +    if (s->error) {
>> +        g_free(s->error);
>> +        s->error = NULL;
>> +    }
>> +
>> +    /*
>> +     * get dump info: endian, class and architecture.
>> +     * If the target architecture is not supported, cpu_get_dump_info() will
>> +     * return -1.
>> +     *
>> +     * if we use kvm, we should synchronize the register before we get dump
>> +     * info.
>> +     */
>> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
>> +        cpu_synchronize_state(env);
>> +    }
>> +
>> +    ret = cpu_get_dump_info(&s->dump_info);
>> +    if (ret < 0) {
>> +        error_set(errp, QERR_UNSUPPORTED);
> 
> This will let the VM paused.
> 
>> +        return NULL;
>> +    }
>> +
>> +    /* get memory mapping */
>> +    memory_mapping_list_init(&s->list);
>> +    if (paging) {
>> +        qemu_get_guest_memory_mapping(&s->list);
>> +    } else {
>> +        qemu_get_guest_simple_memory_mapping(&s->list);
>> +    }
>> +
>> +    /*
>> +     * calculate phdr_num
>> +     *
>> +     * the type of phdr->num is uint16_t, so we should avoid overflow
>> +     */
>> +    s->phdr_num = 1; /* PT_NOTE */
>> +    if (s->list.num < (1 << 16) - 2) {
>> +        s->phdr_num += s->list.num;
>> +        s->have_section = false;
>> +    } else {
>> +        s->have_section = true;
>> +        s->phdr_num = PN_XNUM;
>> +
>> +        /* the type of shdr->sh_info is uint32_t, so we should avoid overflow */
>> +        if (s->list.num > (1ULL << 32) - 2) {
>> +            s->sh_info = 0xffffffff;
>> +        } else {
>> +            s->sh_info += s->list.num;
>> +        }
>> +    }
>> +
>> +    return s;
>> +}
>> +
>> +static int fd_write_vmcore(target_phys_addr_t offset, void *buf, size_t size,
>> +                           void *opaque)
>> +{
>> +    int fd = (int)(intptr_t)opaque;
>> +    int ret;
>> +
>> +    ret = lseek(fd, offset, SEEK_SET);
>> +    if (ret < 0) {
>> +        return -1;
>> +    }
>> +
>> +    ret = write(fd, buf, size);
>> +    if (ret != size) {
>> +        return -1;
>> +    }
> 
> I think you should use send_all() instead of plain write().
> 
>> +
>> +    return 0;
>> +}
>> +
>> +static void fd_cleanup(void *opaque)
>> +{
>> +    int fd = (int)(intptr_t)opaque;
>> +
>> +    if (fd != -1) {
>> +        close(fd);
>> +    }
>> +}
>> +
>> +static DumpState *dump_init_fd(int fd, bool paging, Error **errp)
>> +{
>> +    DumpState *s = dump_init(paging, errp);
>> +
>> +    if (s == NULL) {
>> +        return NULL;
>> +    }
>> +
>> +    s->f = fd_write_vmcore;
>> +    s->cleanup = fd_cleanup;
>> +    s->opaque = (void *)(intptr_t)fd;
> 
> Do we really need all these indirections?
> 
>> +
>> +    return s;
>> +}
>> +
>> +void qmp_dump(bool paging, const char *file, Error **errp)
>> +{
>> +    const char *p;
>> +    int fd = -1;
>> +    DumpState *s;
>> +
>> +#if !defined(WIN32)
>> +    if (strstart(file, "fd:", &p)) {
>> +        fd = qemu_get_fd(p);
> 
> qemu_get_fd() won't be merged, you should use monitor_get_fd(cur_mon, p);
> 
>> +        if (fd == -1) {
>> +            error_set(errp, QERR_FD_NOT_FOUND, p);
>> +            return;
>> +        }
>> +    }
>> +#endif
>> +
>> +    if  (strstart(file, "file:", &p)) {
>> +        fd = open(p, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR);
> 
> This is minor, but I'd use qemu_open() here.
> 
>> +        if (fd < 0) {
>> +            error_set(errp, QERR_OPEN_FILE_FAILED, p);
>> +            return;
>> +        }
>> +    }
>> +
>> +    if (fd == -1) {
>> +        error_set(errp, QERR_INVALID_PARAMETER, "file");
>> +        return;
>> +    }
>> +
>> +    s = dump_init_fd(fd, paging, errp);
>> +    if (!s) {
>> +        return;
>> +    }
>> +
>> +    if (create_vmcore(s) < 0) {
>> +        error_set(errp, QERR_IO_ERROR);
>> +    }
>> +}
>> diff --git a/elf.h b/elf.h
>> index 2e05d34..6a10657 100644
>> --- a/elf.h
>> +++ b/elf.h
>> @@ -1000,6 +1000,11 @@ typedef struct elf64_sym {
>>  
>>  #define EI_NIDENT	16
>>  
>> +/* Special value for e_phnum.  This indicates that the real number of
>> +   program headers is too large to fit into e_phnum.  Instead the real
>> +   value is in the field sh_info of section 0.  */
>> +#define PN_XNUM         0xffff
>> +
>>  typedef struct elf32_hdr{
>>    unsigned char	e_ident[EI_NIDENT];
>>    Elf32_Half	e_type;
>> diff --git a/hmp-commands.hx b/hmp-commands.hx
>> index 6980214..d4cf2e5 100644
>> --- a/hmp-commands.hx
>> +++ b/hmp-commands.hx
>> @@ -880,6 +880,27 @@ server will ask the spice/vnc client to automatically reconnect using the
>>  new parameters (if specified) once the vm migration finished successfully.
>>  ETEXI
>>  
>> +#if defined(CONFIG_HAVE_CORE_DUMP)
>> +    {
>> +        .name       = "dump",
>> +        .args_type  = "paging:-p,file:s",
>> +        .params     = "[-p] file",
>> +        .help       = "dump to file",
>> +        .user_print = monitor_user_noop,
>> +        .mhandler.cmd = hmp_dump,
>> +    },
>> +
>> +
>> +STEXI
>> +@item dump [-p] @var{file}
>> +@findex dump
>> +Dump to @var{file}. The file can be processed with crash or gdb.
>> +    file: destination file(started with "file:") or destination file descriptor
>> +          (started with "fd:")
>> +  paging: do paging to get guest's memory mapping
>> +ETEXI
>> +#endif
>> +
>>      {
>>          .name       = "snapshot_blkdev",
>>          .args_type  = "reuse:-n,device:B,snapshot-file:s?,format:s?",
>> diff --git a/hmp.c b/hmp.c
>> index 290c43d..e13b793 100644
>> --- a/hmp.c
>> +++ b/hmp.c
>> @@ -860,3 +860,13 @@ void hmp_block_job_cancel(Monitor *mon, const QDict *qdict)
>>  
>>      hmp_handle_error(mon, &error);
>>  }
>> +
>> +void hmp_dump(Monitor *mon, const QDict *qdict)
>> +{
>> +    Error *errp = NULL;
>> +    int paging = qdict_get_try_bool(qdict, "paging", 0);
>> +    const char *file = qdict_get_str(qdict, "file");
>> +
>> +    qmp_dump(!!paging, file, &errp);
> 
> Why the double negation on 'paging'?
> 
>> +    hmp_handle_error(mon, &errp);
>> +}
>> diff --git a/hmp.h b/hmp.h
>> index 5409464..b055e50 100644
>> --- a/hmp.h
>> +++ b/hmp.h
>> @@ -59,5 +59,6 @@ void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict);
>>  void hmp_block_stream(Monitor *mon, const QDict *qdict);
>>  void hmp_block_job_set_speed(Monitor *mon, const QDict *qdict);
>>  void hmp_block_job_cancel(Monitor *mon, const QDict *qdict);
>> +void hmp_dump(Monitor *mon, const QDict *qdict);
>>  
>>  #endif
>> diff --git a/qapi-schema.json b/qapi-schema.json
>> index 04fa84f..81b8c7c 100644
>> --- a/qapi-schema.json
>> +++ b/qapi-schema.json
>> @@ -1663,3 +1663,17 @@
>>  { 'command': 'qom-list-types',
>>    'data': { '*implements': 'str', '*abstract': 'bool' },
>>    'returns': [ 'ObjectTypeInfo' ] }
>> +
>> +##
>> +# @dump
> 
> 'dump' is too generic, please call this dump-guest-memory-vmcore or something
> more descriptive.
> 
>> +#
>> +# Dump guest's memory to vmcore.
>> +#
>> +# @paging: if true, do paging to get guest's memory mapping
>> +# @file: the filename or file descriptor of the vmcore.
> 
> 'file' is not a good name because it can also dump to an fd, maybe 'protocol'?
> 
>> +#
>> +# Returns: nothing on success
>> +#
>> +# Since: 1.1
>> +##
>> +{ 'command': 'dump', 'data': { 'paging': 'bool', 'file': 'str' } }
>> diff --git a/qmp-commands.hx b/qmp-commands.hx
>> index dfe8a5b..9e39bd9 100644
>> --- a/qmp-commands.hx
>> +++ b/qmp-commands.hx
>> @@ -586,6 +586,40 @@ Example:
>>  
>>  EQMP
>>  
>> +#if defined(CONFIG_HAVE_CORE_DUMP)
>> +    {
>> +        .name       = "dump",
>> +        .args_type  = "paging:-p,file:s",
>> +        .params     = "[-p] file",
>> +        .help       = "dump to file",
>> +        .user_print = monitor_user_noop,
>> +        .mhandler.cmd_new = qmp_marshal_input_dump,
>> +    },
>> +
>> +SQMP
>> +dump
>> +
>> +
>> +Dump to file. The file can be processed with crash or gdb.
>> +
>> +Arguments:
>> +
>> +- "paging": do paging to get guest's memory mapping (json-bool)
>> +- "file": destination file(started with "file:") or destination file descriptor
>> +          (started with "fd:") (json-string)
>> +
>> +Example:
>> +
>> +-> { "execute": "dump", "arguments": { "file": "fd:dump" } }
>> +<- { "return": {} }
>> +
>> +Notes:
>> +
>> +(1) All boolean arguments default to false
>> +
>> +EQMP
>> +#endif
>> +
>>      {
>>          .name       = "netdev_add",
>>          .args_type  = "netdev:O",
> 
>
Wen Congyang March 19, 2012, 2:28 a.m. UTC | #7
At 03/15/2012 01:18 AM, Luiz Capitulino Wrote:
> On Wed, 14 Mar 2012 10:11:35 +0800
> Wen Congyang <wency@cn.fujitsu.com> wrote:
> 
>> The command's usage:
>>    dump [-p] file
>> file should be start with "file:"(the file's path) or "fd:"(the fd's name).
>>
>> Note:
>>   1. If you want to use gdb to analyse the core, please specify -p option.
>>   2. This command doesn't support the fd that is is associated with a pipe,
>>      socket, or FIFO(lseek will fail with such fd).
>>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>

<cut>

>> +
>> +static DumpState *dump_init(bool paging, Error **errp)
>> +{
>> +    CPUState *env;
>> +    DumpState *s = dump_get_current();
>> +    int ret;
>> +
>> +    if (runstate_is_running()) {
>> +        vm_stop(RUN_STATE_PAUSED);
>> +        s->resume = true;
> 
> Hmm, you actually stop the VM. Seems obvious now, but when people talked about
> making this asynchronous I automatically assumed that what we didn't want was
> having the global mutex held for too much time (ie. while this command was
> running).

Yes, In the earlier version, I add a vm state change handler. If the vm is resumed
by the user, qemu dump will be auto cancelled.

> 
> The only disadvantage of having this as a synchronous command is that libvirt
> won't be able to cancel it and won't be able to run other commands in parallel.
> Doesn't seem that serious to me.
> 
> Btw, RUN_STATE_PAUSED is not a good one. Doesn't matter that much, as this
> is unlikely to be visible, but you should use RUN_STATE_SAVE_VM or
> RUN_STATE_DEBUG.

OK, I will use RUN_STATE_SAVE_VM.

> 
>> +    } else {

<cut>

>> +    ret = cpu_get_dump_info(&s->dump_info);
>> +    if (ret < 0) {
>> +        error_set(errp, QERR_UNSUPPORTED);
> 
> This will let the VM paused.

Hmm, in which function the vm is paused?

> 
>> +        return NULL;

<cut>

>> +    ret = write(fd, buf, size);
>> +    if (ret != size) {
>> +        return -1;
>> +    }
> 
> I think you should use send_all() instead of plain write().

OK, I will use qemu_write_full() you mentioned in anohter mail.

> 
>> +
>> +    return 0;
>> +}

<cut>

>> +
>> +    s->f = fd_write_vmcore;
>> +    s->cleanup = fd_cleanup;
>> +    s->opaque = (void *)(intptr_t)fd;
> 
> Do we really need all these indirections?

At 02/15/2012 01:31 AM, Jan Kiszka Wrote:
> Is writing to file descriptor generic enough? What if we want to dump
> via QMP, letting the receiver side decide about where to write it?

So I use these indirections.

> 
>> +
>> +    return s;
>> +}
>> +
>> +void qmp_dump(bool paging, const char *file, Error **errp)
>> +{
>> +    const char *p;
>> +    int fd = -1;
>> +    DumpState *s;
>> +
>> +#if !defined(WIN32)
>> +    if (strstart(file, "fd:", &p)) {
>> +        fd = qemu_get_fd(p);
> 
> qemu_get_fd() won't be merged, you should use monitor_get_fd(cur_mon, p);

OK

> 
>> +        if (fd == -1) {
>> +            error_set(errp, QERR_FD_NOT_FOUND, p);
>> +            return;
>> +        }
>> +    }
>> +#endif
>> +
>> +    if  (strstart(file, "file:", &p)) {
>> +        fd = open(p, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR);
> 
> This is minor, but I'd use qemu_open() here.

OK

> 
>> +        if (fd < 0) {

<cut>

>> +
>> +    qmp_dump(!!paging, file, &errp);
> 
> Why the double negation on 'paging'?

OK, I will remove double negation.

> 
>> +    hmp_handle_error(mon, &errp);

<cut>

>> +
>> +##
>> +# @dump
> 
> 'dump' is too generic, please call this dump-guest-memory-vmcore or something
> more descriptive.

Hmm, dump-guest-memory-vmcore is too long. What about dump-guest-memory or
dump-memory?

> 
>> +#
>> +# Dump guest's memory to vmcore.
>> +#
>> +# @paging: if true, do paging to get guest's memory mapping
>> +# @file: the filename or file descriptor of the vmcore.
> 
> 'file' is not a good name because it can also dump to an fd, maybe 'protocol'?

OK

Thanks for you reviewing
Wen Congyang
Wen Congyang March 19, 2012, 8:31 a.m. UTC | #8
At 03/19/2012 10:28 AM, Wen Congyang Wrote:
> At 03/15/2012 01:18 AM, Luiz Capitulino Wrote:
>> On Wed, 14 Mar 2012 10:11:35 +0800
>> Wen Congyang <wency@cn.fujitsu.com> wrote:
>>
>>> The command's usage:
>>>    dump [-p] file
>>> file should be start with "file:"(the file's path) or "fd:"(the fd's name).
>>>
>>> Note:
>>>   1. If you want to use gdb to analyse the core, please specify -p option.
>>>   2. This command doesn't support the fd that is is associated with a pipe,
>>>      socket, or FIFO(lseek will fail with such fd).
>>>
>>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> 
> <cut>
> 
>>> +
>>> +static DumpState *dump_init(bool paging, Error **errp)
>>> +{
>>> +    CPUState *env;
>>> +    DumpState *s = dump_get_current();
>>> +    int ret;
>>> +
>>> +    if (runstate_is_running()) {
>>> +        vm_stop(RUN_STATE_PAUSED);
>>> +        s->resume = true;
>>
>> Hmm, you actually stop the VM. Seems obvious now, but when people talked about
>> making this asynchronous I automatically assumed that what we didn't want was
>> having the global mutex held for too much time (ie. while this command was
>> running).
> 
> Yes, In the earlier version, I add a vm state change handler. If the vm is resumed
> by the user, qemu dump will be auto cancelled.
> 
>>
>> The only disadvantage of having this as a synchronous command is that libvirt
>> won't be able to cancel it and won't be able to run other commands in parallel.
>> Doesn't seem that serious to me.
>>
>> Btw, RUN_STATE_PAUSED is not a good one. Doesn't matter that much, as this
>> is unlikely to be visible, but you should use RUN_STATE_SAVE_VM or
>> RUN_STATE_DEBUG.
> 
> OK, I will use RUN_STATE_SAVE_VM.
> 
>>
>>> +    } else {
> 
> <cut>
> 
>>> +    ret = cpu_get_dump_info(&s->dump_info);
>>> +    if (ret < 0) {
>>> +        error_set(errp, QERR_UNSUPPORTED);
>>
>> This will let the VM paused.
> 
> Hmm, in which function the vm is paused?

Sorry for my misundestand. I forgot to resume vm before dump_init() returns.

Thanks
Wen Congyang

> 
>>
>>> +        return NULL;
> 
> <cut>
> 
>>> +    ret = write(fd, buf, size);
>>> +    if (ret != size) {
>>> +        return -1;
>>> +    }
>>
>> I think you should use send_all() instead of plain write().
> 
> OK, I will use qemu_write_full() you mentioned in anohter mail.
> 
>>
>>> +
>>> +    return 0;
>>> +}
> 
> <cut>
> 
>>> +
>>> +    s->f = fd_write_vmcore;
>>> +    s->cleanup = fd_cleanup;
>>> +    s->opaque = (void *)(intptr_t)fd;
>>
>> Do we really need all these indirections?
> 
> At 02/15/2012 01:31 AM, Jan Kiszka Wrote:
>> Is writing to file descriptor generic enough? What if we want to dump
>> via QMP, letting the receiver side decide about where to write it?
> 
> So I use these indirections.
> 
>>
>>> +
>>> +    return s;
>>> +}
>>> +
>>> +void qmp_dump(bool paging, const char *file, Error **errp)
>>> +{
>>> +    const char *p;
>>> +    int fd = -1;
>>> +    DumpState *s;
>>> +
>>> +#if !defined(WIN32)
>>> +    if (strstart(file, "fd:", &p)) {
>>> +        fd = qemu_get_fd(p);
>>
>> qemu_get_fd() won't be merged, you should use monitor_get_fd(cur_mon, p);
> 
> OK
> 
>>
>>> +        if (fd == -1) {
>>> +            error_set(errp, QERR_FD_NOT_FOUND, p);
>>> +            return;
>>> +        }
>>> +    }
>>> +#endif
>>> +
>>> +    if  (strstart(file, "file:", &p)) {
>>> +        fd = open(p, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR);
>>
>> This is minor, but I'd use qemu_open() here.
> 
> OK
> 
>>
>>> +        if (fd < 0) {
> 
> <cut>
> 
>>> +
>>> +    qmp_dump(!!paging, file, &errp);
>>
>> Why the double negation on 'paging'?
> 
> OK, I will remove double negation.
> 
>>
>>> +    hmp_handle_error(mon, &errp);
> 
> <cut>
> 
>>> +
>>> +##
>>> +# @dump
>>
>> 'dump' is too generic, please call this dump-guest-memory-vmcore or something
>> more descriptive.
> 
> Hmm, dump-guest-memory-vmcore is too long. What about dump-guest-memory or
> dump-memory?
> 
>>
>>> +#
>>> +# Dump guest's memory to vmcore.
>>> +#
>>> +# @paging: if true, do paging to get guest's memory mapping
>>> +# @file: the filename or file descriptor of the vmcore.
>>
>> 'file' is not a good name because it can also dump to an fd, maybe 'protocol'?
> 
> OK
> 
> Thanks for you reviewing
> Wen Congyang
> 
>
Luiz Capitulino March 19, 2012, 1:16 p.m. UTC | #9
On Mon, 19 Mar 2012 10:28:17 +0800
Wen Congyang <wency@cn.fujitsu.com> wrote:

> > 'dump' is too generic, please call this dump-guest-memory-vmcore or something
> > more descriptive.
> 
> Hmm, dump-guest-memory-vmcore is too long. What about dump-guest-memory or
> dump-memory?

dump-guest-memory is acceptable, but we don't have problems with long names.
We prefer a long, but descriptive name, rather than a short & ambiguous one.
diff mbox

Patch

diff --git a/Makefile.target b/Makefile.target
index c81c4fa..287fbe7 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -213,7 +213,7 @@  obj-$(CONFIG_NO_KVM) += kvm-stub.o
 obj-$(CONFIG_VGA) += vga.o
 obj-y += memory.o savevm.o
 obj-y += memory_mapping.o
-obj-$(CONFIG_HAVE_CORE_DUMP) += arch_dump.o
+obj-$(CONFIG_HAVE_CORE_DUMP) += arch_dump.o dump.o
 LIBS+=-lz
 
 obj-i386-$(CONFIG_KVM) += hyperv.o
diff --git a/dump.c b/dump.c
new file mode 100644
index 0000000..42e1681
--- /dev/null
+++ b/dump.c
@@ -0,0 +1,714 @@ 
+/*
+ * QEMU dump
+ *
+ * Copyright Fujitsu, Corp. 2011
+ *
+ * Authors:
+ *     Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include <unistd.h>
+#include "elf.h"
+#include <sys/procfs.h>
+#include <glib.h>
+#include "cpu.h"
+#include "cpu-all.h"
+#include "targphys.h"
+#include "monitor.h"
+#include "kvm.h"
+#include "dump.h"
+#include "sysemu.h"
+#include "bswap.h"
+#include "memory_mapping.h"
+#include "error.h"
+#include "qmp-commands.h"
+#include "gdbstub.h"
+
+static inline uint16_t cpu_convert_to_target16(uint16_t val, int endian)
+{
+    if (endian == ELFDATA2LSB) {
+        val = cpu_to_le16(val);
+    } else {
+        val = cpu_to_be16(val);
+    }
+
+    return val;
+}
+
+static inline uint32_t cpu_convert_to_target32(uint32_t val, int endian)
+{
+    if (endian == ELFDATA2LSB) {
+        val = cpu_to_le32(val);
+    } else {
+        val = cpu_to_be32(val);
+    }
+
+    return val;
+}
+
+static inline uint64_t cpu_convert_to_target64(uint64_t val, int endian)
+{
+    if (endian == ELFDATA2LSB) {
+        val = cpu_to_le64(val);
+    } else {
+        val = cpu_to_be64(val);
+    }
+
+    return val;
+}
+
+enum {
+    DUMP_STATE_ERROR,
+    DUMP_STATE_SETUP,
+    DUMP_STATE_CANCELLED,
+    DUMP_STATE_ACTIVE,
+    DUMP_STATE_COMPLETED,
+};
+
+typedef struct DumpState {
+    ArchDumpInfo dump_info;
+    MemoryMappingList list;
+    uint16_t phdr_num;
+    uint32_t sh_info;
+    bool have_section;
+    int state;
+    bool resume;
+    char *error;
+    target_phys_addr_t memory_offset;
+    write_core_dump_function f;
+    void (*cleanup)(void *opaque);
+    void *opaque;
+} DumpState;
+
+static DumpState *dump_get_current(void)
+{
+    static DumpState current_dump = {
+        .state = DUMP_STATE_SETUP,
+    };
+
+    return &current_dump;
+}
+
+static int dump_cleanup(DumpState *s)
+{
+    int ret = 0;
+
+    memory_mapping_list_free(&s->list);
+    s->cleanup(s->opaque);
+    if (s->resume) {
+        vm_start();
+    }
+
+    return ret;
+}
+
+static void dump_error(DumpState *s, const char *reason)
+{
+    s->state = DUMP_STATE_ERROR;
+    s->error = g_strdup(reason);
+    dump_cleanup(s);
+}
+
+static int write_elf64_header(DumpState *s)
+{
+    Elf64_Ehdr elf_header;
+    int ret;
+    int endian = s->dump_info.d_endian;
+
+    memset(&elf_header, 0, sizeof(Elf64_Ehdr));
+    memcpy(&elf_header, ELFMAG, 4);
+    elf_header.e_ident[EI_CLASS] = ELFCLASS64;
+    elf_header.e_ident[EI_DATA] = s->dump_info.d_endian;
+    elf_header.e_ident[EI_VERSION] = EV_CURRENT;
+    elf_header.e_type = cpu_convert_to_target16(ET_CORE, endian);
+    elf_header.e_machine = cpu_convert_to_target16(s->dump_info.d_machine,
+                                                   endian);
+    elf_header.e_version = cpu_convert_to_target32(EV_CURRENT, endian);
+    elf_header.e_ehsize = cpu_convert_to_target16(sizeof(elf_header), endian);
+    elf_header.e_phoff = cpu_convert_to_target64(sizeof(Elf64_Ehdr), endian);
+    elf_header.e_phentsize = cpu_convert_to_target16(sizeof(Elf64_Phdr),
+                                                     endian);
+    elf_header.e_phnum = cpu_convert_to_target16(s->phdr_num, endian);
+    if (s->have_section) {
+        uint64_t shoff = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr) * s->sh_info;
+
+        elf_header.e_shoff = cpu_convert_to_target64(shoff, endian);
+        elf_header.e_shentsize = cpu_convert_to_target16(sizeof(Elf64_Shdr),
+                                                         endian);
+        elf_header.e_shnum = cpu_convert_to_target16(1, endian);
+    }
+
+    ret = s->f(0, &elf_header, sizeof(elf_header), s->opaque);
+    if (ret < 0) {
+        dump_error(s, "dump: failed to write elf header.\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int write_elf32_header(DumpState *s)
+{
+    Elf32_Ehdr elf_header;
+    int ret;
+    int endian = s->dump_info.d_endian;
+
+    memset(&elf_header, 0, sizeof(Elf32_Ehdr));
+    memcpy(&elf_header, ELFMAG, 4);
+    elf_header.e_ident[EI_CLASS] = ELFCLASS32;
+    elf_header.e_ident[EI_DATA] = endian;
+    elf_header.e_ident[EI_VERSION] = EV_CURRENT;
+    elf_header.e_type = cpu_convert_to_target16(ET_CORE, endian);
+    elf_header.e_machine = cpu_convert_to_target16(s->dump_info.d_machine,
+                                                   endian);
+    elf_header.e_version = cpu_convert_to_target32(EV_CURRENT, endian);
+    elf_header.e_ehsize = cpu_convert_to_target16(sizeof(elf_header), endian);
+    elf_header.e_phoff = cpu_convert_to_target32(sizeof(Elf32_Ehdr), endian);
+    elf_header.e_phentsize = cpu_convert_to_target16(sizeof(Elf32_Phdr),
+                                                     endian);
+    elf_header.e_phnum = cpu_convert_to_target16(s->phdr_num, endian);
+    if (s->have_section) {
+        uint32_t shoff = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr) * s->sh_info;
+
+        elf_header.e_shoff = cpu_convert_to_target32(shoff, endian);
+        elf_header.e_shentsize = cpu_convert_to_target16(sizeof(Elf32_Shdr),
+                                                         endian);
+        elf_header.e_shnum = cpu_convert_to_target16(1, endian);
+    }
+
+    ret = s->f(0, &elf_header, sizeof(elf_header), s->opaque);
+    if (ret < 0) {
+        dump_error(s, "dump: failed to write elf header.\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int write_elf64_load(DumpState *s, MemoryMapping *memory_mapping,
+                            int phdr_index, target_phys_addr_t offset)
+{
+    Elf64_Phdr phdr;
+    off_t phdr_offset;
+    int ret;
+    int endian = s->dump_info.d_endian;
+
+    memset(&phdr, 0, sizeof(Elf64_Phdr));
+    phdr.p_type = cpu_convert_to_target32(PT_LOAD, endian);
+    phdr.p_offset = cpu_convert_to_target64(offset, endian);
+    phdr.p_paddr = cpu_convert_to_target64(memory_mapping->phys_addr, endian);
+    if (offset == -1) {
+        phdr.p_filesz = 0;
+    } else {
+        phdr.p_filesz = cpu_convert_to_target64(memory_mapping->length, endian);
+    }
+    phdr.p_memsz = cpu_convert_to_target64(memory_mapping->length, endian);
+    phdr.p_vaddr = cpu_convert_to_target64(memory_mapping->virt_addr, endian);
+
+    phdr_offset = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr)*phdr_index;
+    ret = s->f(phdr_offset, &phdr, sizeof(Elf64_Phdr), s->opaque);
+    if (ret < 0) {
+        dump_error(s, "dump: failed to write program header table.\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int write_elf32_load(DumpState *s, MemoryMapping *memory_mapping,
+                            int phdr_index, target_phys_addr_t offset)
+{
+    Elf32_Phdr phdr;
+    off_t phdr_offset;
+    int ret;
+    int endian = s->dump_info.d_endian;
+
+    memset(&phdr, 0, sizeof(Elf32_Phdr));
+    phdr.p_type = cpu_convert_to_target32(PT_LOAD, endian);
+    phdr.p_offset = cpu_convert_to_target32(offset, endian);
+    phdr.p_paddr = cpu_convert_to_target32(memory_mapping->phys_addr, endian);
+    if (offset == -1) {
+        phdr.p_filesz = 0;
+    } else {
+        phdr.p_filesz = cpu_convert_to_target32(memory_mapping->length, endian);
+    }
+    phdr.p_memsz = cpu_convert_to_target32(memory_mapping->length, endian);
+    phdr.p_vaddr = cpu_convert_to_target32(memory_mapping->virt_addr, endian);
+
+    phdr_offset = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr)*phdr_index;
+    ret = s->f(phdr_offset, &phdr, sizeof(Elf32_Phdr), s->opaque);
+    if (ret < 0) {
+        dump_error(s, "dump: failed to write program header table.\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int write_elf64_notes(DumpState *s, int phdr_index,
+                             target_phys_addr_t *offset)
+{
+    CPUState *env;
+    int ret;
+    target_phys_addr_t begin = *offset;
+    Elf64_Phdr phdr;
+    off_t phdr_offset;
+    int id;
+    int endian = s->dump_info.d_endian;
+
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        id = gdb_id(env);
+        ret = cpu_write_elf64_note(s->f, env, id, offset, s->opaque);
+        if (ret < 0) {
+            dump_error(s, "dump: failed to write elf notes.\n");
+            return -1;
+        }
+    }
+
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        ret = cpu_write_elf64_qemunote(s->f, env, offset, s->opaque);
+        if (ret < 0) {
+            dump_error(s, "dump: failed to write CPU status.\n");
+            return -1;
+        }
+    }
+
+    memset(&phdr, 0, sizeof(Elf64_Phdr));
+    phdr.p_type = cpu_convert_to_target32(PT_NOTE, endian);
+    phdr.p_offset = cpu_convert_to_target64(begin, endian);
+    phdr.p_paddr = 0;
+    phdr.p_filesz = cpu_convert_to_target64(*offset - begin, endian);
+    phdr.p_memsz = cpu_convert_to_target64(*offset - begin, endian);
+    phdr.p_vaddr = 0;
+
+    phdr_offset = sizeof(Elf64_Ehdr);
+    ret = s->f(phdr_offset, &phdr, sizeof(Elf64_Phdr), s->opaque);
+    if (ret < 0) {
+        dump_error(s, "dump: failed to write program header table.\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int write_elf32_notes(DumpState *s, int phdr_index,
+                             target_phys_addr_t *offset)
+{
+    CPUState *env;
+    int ret;
+    target_phys_addr_t begin = *offset;
+    Elf32_Phdr phdr;
+    off_t phdr_offset;
+    int id;
+    int endian = s->dump_info.d_endian;
+
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        id = gdb_id(env);
+        ret = cpu_write_elf32_note(s->f, env, id, offset, s->opaque);
+        if (ret < 0) {
+            dump_error(s, "dump: failed to write elf notes.\n");
+            return -1;
+        }
+    }
+
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        ret = cpu_write_elf32_qemunote(s->f, env, offset, s->opaque);
+        if (ret < 0) {
+            dump_error(s, "dump: failed to write CPU status.\n");
+            return -1;
+        }
+    }
+
+    memset(&phdr, 0, sizeof(Elf32_Phdr));
+    phdr.p_type = cpu_convert_to_target32(PT_NOTE, endian);
+    phdr.p_offset = cpu_convert_to_target32(begin, endian);
+    phdr.p_paddr = 0;
+    phdr.p_filesz = cpu_convert_to_target32(*offset - begin, endian);
+    phdr.p_memsz = cpu_convert_to_target32(*offset - begin, endian);
+    phdr.p_vaddr = 0;
+
+    phdr_offset = sizeof(Elf32_Ehdr);
+    ret = s->f(phdr_offset, &phdr, sizeof(Elf32_Phdr), s->opaque);
+    if (ret < 0) {
+        dump_error(s, "dump: failed to write program header table.\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int write_elf_section(DumpState *s, target_phys_addr_t *offset, int type)
+{
+    Elf32_Shdr shdr32;
+    Elf64_Shdr shdr64;
+    int endian = s->dump_info.d_endian;
+    int shdr_size;
+    void *shdr;
+    int ret;
+
+    if (type == 0) {
+        shdr_size = sizeof(Elf32_Shdr);
+        memset(&shdr32, 0, shdr_size);
+        shdr32.sh_info = cpu_convert_to_target32(s->sh_info, endian);
+        shdr = &shdr32;
+    } else {
+        shdr_size = sizeof(Elf64_Shdr);
+        memset(&shdr64, 0, shdr_size);
+        shdr64.sh_info = cpu_convert_to_target32(s->sh_info, endian);
+        shdr = &shdr64;
+    }
+
+    ret = s->f(*offset, &shdr, shdr_size, s->opaque);
+    if (ret < 0) {
+        dump_error(s, "dump: failed to write section header table.\n");
+        return -1;
+    }
+
+    *offset += shdr_size;
+    return 0;
+}
+
+static int write_data(DumpState *s, void *buf, int length,
+                      target_phys_addr_t *offset)
+{
+    int ret;
+
+    ret = s->f(*offset, buf, length, s->opaque);
+    if (ret < 0) {
+        dump_error(s, "dump: failed to save memory.\n");
+        return -1;
+    }
+
+    *offset += length;
+    return 0;
+}
+
+/* write the memroy to vmcore. 1 page per I/O. */
+static int write_memory(DumpState *s, RAMBlock *block,
+                        target_phys_addr_t *offset)
+{
+    int i, ret;
+
+    for (i = 0; i < block->length / TARGET_PAGE_SIZE; i++) {
+        ret = write_data(s, block->host + i * TARGET_PAGE_SIZE,
+                         TARGET_PAGE_SIZE, offset);
+        if (ret < 0) {
+            return -1;
+        }
+    }
+
+    if ((block->length % TARGET_PAGE_SIZE) != 0) {
+        ret = write_data(s, block->host + i * TARGET_PAGE_SIZE,
+                         block->length % TARGET_PAGE_SIZE, offset);
+        if (ret < 0) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/* get the memory's offset in the vmcore */
+static target_phys_addr_t get_offset(target_phys_addr_t phys_addr,
+                                     target_phys_addr_t memory_offset)
+{
+    RAMBlock *block;
+    target_phys_addr_t offset = memory_offset;
+
+    QLIST_FOREACH(block, &ram_list.blocks, next) {
+        if (phys_addr >= block->offset &&
+            phys_addr < block->offset + block->length) {
+            return phys_addr - block->offset + offset;
+        }
+        offset += block->length;
+    }
+
+    return -1;
+}
+
+/* write elf header, PT_NOTE and elf note to vmcore. */
+static int dump_begin(DumpState *s)
+{
+    target_phys_addr_t offset;
+    int ret;
+
+    s->state = DUMP_STATE_ACTIVE;
+
+    /*
+     * the vmcore's format is:
+     *   --------------
+     *   |  elf header |
+     *   --------------
+     *   |  PT_NOTE    |
+     *   --------------
+     *   |  PT_LOAD    |
+     *   --------------
+     *   |  ......     |
+     *   --------------
+     *   |  PT_LOAD    |
+     *   --------------
+     *   |  sec_hdr    |
+     *   --------------
+     *   |  elf note   |
+     *   --------------
+     *   |  memory     |
+     *   --------------
+     *
+     * we only know where the memory is saved after we write elf note into
+     * vmcore.
+     */
+
+    /* write elf header to vmcore */
+    if (s->dump_info.d_class == ELFCLASS64) {
+        ret = write_elf64_header(s);
+    } else {
+        ret = write_elf32_header(s);
+    }
+    if (ret < 0) {
+        return -1;
+    }
+
+    /* write elf section and notes to vmcore */
+    if (s->dump_info.d_class == ELFCLASS64) {
+        if (s->have_section) {
+            offset = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr)*s->sh_info;
+            if (write_elf_section(s, &offset, 1) < 0) {
+                return -1;
+            }
+        } else {
+            offset = sizeof(Elf64_Ehdr) + sizeof(Elf64_Phdr)*s->phdr_num;
+        }
+        ret = write_elf64_notes(s, 0, &offset);
+    } else {
+        if (s->have_section) {
+            offset = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr)*s->sh_info;
+            if (write_elf_section(s, &offset, 0) < 0) {
+                return -1;
+            }
+        } else {
+            offset = sizeof(Elf32_Ehdr) + sizeof(Elf32_Phdr)*s->phdr_num;
+        }
+        ret = write_elf32_notes(s, 0, &offset);
+    }
+
+    if (ret < 0) {
+        return -1;
+    }
+
+    s->memory_offset = offset;
+    return 0;
+}
+
+/* write PT_LOAD to vmcore */
+static int dump_completed(DumpState *s)
+{
+    target_phys_addr_t offset;
+    MemoryMapping *memory_mapping;
+    int phdr_index = 1, ret;
+
+    QTAILQ_FOREACH(memory_mapping, &s->list.head, next) {
+        offset = get_offset(memory_mapping->phys_addr, s->memory_offset);
+        if (s->dump_info.d_class == ELFCLASS64) {
+            ret = write_elf64_load(s, memory_mapping, phdr_index++, offset);
+        } else {
+            ret = write_elf32_load(s, memory_mapping, phdr_index++, offset);
+        }
+        if (ret < 0) {
+            return -1;
+        }
+    }
+
+    s->state = DUMP_STATE_COMPLETED;
+    dump_cleanup(s);
+    return 0;
+}
+
+/* write all memory to vmcore */
+static int dump_iterate(DumpState *s)
+{
+    RAMBlock *block;
+    target_phys_addr_t offset = s->memory_offset;
+    int ret;
+
+    /* write all memory to vmcore */
+    QLIST_FOREACH(block, &ram_list.blocks, next) {
+        ret = write_memory(s, block, &offset);
+        if (ret < 0) {
+            return -1;
+        }
+    }
+
+    return dump_completed(s);
+}
+
+static int create_vmcore(DumpState *s)
+{
+    int ret;
+
+    ret = dump_begin(s);
+    if (ret < 0) {
+        return -1;
+    }
+
+    ret = dump_iterate(s);
+    if (ret < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+static DumpState *dump_init(bool paging, Error **errp)
+{
+    CPUState *env;
+    DumpState *s = dump_get_current();
+    int ret;
+
+    if (runstate_is_running()) {
+        vm_stop(RUN_STATE_PAUSED);
+        s->resume = true;
+    } else {
+        s->resume = false;
+    }
+    s->state = DUMP_STATE_SETUP;
+    if (s->error) {
+        g_free(s->error);
+        s->error = NULL;
+    }
+
+    /*
+     * get dump info: endian, class and architecture.
+     * If the target architecture is not supported, cpu_get_dump_info() will
+     * return -1.
+     *
+     * if we use kvm, we should synchronize the register before we get dump
+     * info.
+     */
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        cpu_synchronize_state(env);
+    }
+
+    ret = cpu_get_dump_info(&s->dump_info);
+    if (ret < 0) {
+        error_set(errp, QERR_UNSUPPORTED);
+        return NULL;
+    }
+
+    /* get memory mapping */
+    memory_mapping_list_init(&s->list);
+    if (paging) {
+        qemu_get_guest_memory_mapping(&s->list);
+    } else {
+        qemu_get_guest_simple_memory_mapping(&s->list);
+    }
+
+    /*
+     * calculate phdr_num
+     *
+     * the type of phdr->num is uint16_t, so we should avoid overflow
+     */
+    s->phdr_num = 1; /* PT_NOTE */
+    if (s->list.num < (1 << 16) - 2) {
+        s->phdr_num += s->list.num;
+        s->have_section = false;
+    } else {
+        s->have_section = true;
+        s->phdr_num = PN_XNUM;
+
+        /* the type of shdr->sh_info is uint32_t, so we should avoid overflow */
+        if (s->list.num > (1ULL << 32) - 2) {
+            s->sh_info = 0xffffffff;
+        } else {
+            s->sh_info += s->list.num;
+        }
+    }
+
+    return s;
+}
+
+static int fd_write_vmcore(target_phys_addr_t offset, void *buf, size_t size,
+                           void *opaque)
+{
+    int fd = (int)(intptr_t)opaque;
+    int ret;
+
+    ret = lseek(fd, offset, SEEK_SET);
+    if (ret < 0) {
+        return -1;
+    }
+
+    ret = write(fd, buf, size);
+    if (ret != size) {
+        return -1;
+    }
+
+    return 0;
+}
+
+static void fd_cleanup(void *opaque)
+{
+    int fd = (int)(intptr_t)opaque;
+
+    if (fd != -1) {
+        close(fd);
+    }
+}
+
+static DumpState *dump_init_fd(int fd, bool paging, Error **errp)
+{
+    DumpState *s = dump_init(paging, errp);
+
+    if (s == NULL) {
+        return NULL;
+    }
+
+    s->f = fd_write_vmcore;
+    s->cleanup = fd_cleanup;
+    s->opaque = (void *)(intptr_t)fd;
+
+    return s;
+}
+
+void qmp_dump(bool paging, const char *file, Error **errp)
+{
+    const char *p;
+    int fd = -1;
+    DumpState *s;
+
+#if !defined(WIN32)
+    if (strstart(file, "fd:", &p)) {
+        fd = qemu_get_fd(p);
+        if (fd == -1) {
+            error_set(errp, QERR_FD_NOT_FOUND, p);
+            return;
+        }
+    }
+#endif
+
+    if  (strstart(file, "file:", &p)) {
+        fd = open(p, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR);
+        if (fd < 0) {
+            error_set(errp, QERR_OPEN_FILE_FAILED, p);
+            return;
+        }
+    }
+
+    if (fd == -1) {
+        error_set(errp, QERR_INVALID_PARAMETER, "file");
+        return;
+    }
+
+    s = dump_init_fd(fd, paging, errp);
+    if (!s) {
+        return;
+    }
+
+    if (create_vmcore(s) < 0) {
+        error_set(errp, QERR_IO_ERROR);
+    }
+}
diff --git a/elf.h b/elf.h
index 2e05d34..6a10657 100644
--- a/elf.h
+++ b/elf.h
@@ -1000,6 +1000,11 @@  typedef struct elf64_sym {
 
 #define EI_NIDENT	16
 
+/* Special value for e_phnum.  This indicates that the real number of
+   program headers is too large to fit into e_phnum.  Instead the real
+   value is in the field sh_info of section 0.  */
+#define PN_XNUM         0xffff
+
 typedef struct elf32_hdr{
   unsigned char	e_ident[EI_NIDENT];
   Elf32_Half	e_type;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 6980214..d4cf2e5 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -880,6 +880,27 @@  server will ask the spice/vnc client to automatically reconnect using the
 new parameters (if specified) once the vm migration finished successfully.
 ETEXI
 
+#if defined(CONFIG_HAVE_CORE_DUMP)
+    {
+        .name       = "dump",
+        .args_type  = "paging:-p,file:s",
+        .params     = "[-p] file",
+        .help       = "dump to file",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd = hmp_dump,
+    },
+
+
+STEXI
+@item dump [-p] @var{file}
+@findex dump
+Dump to @var{file}. The file can be processed with crash or gdb.
+    file: destination file(started with "file:") or destination file descriptor
+          (started with "fd:")
+  paging: do paging to get guest's memory mapping
+ETEXI
+#endif
+
     {
         .name       = "snapshot_blkdev",
         .args_type  = "reuse:-n,device:B,snapshot-file:s?,format:s?",
diff --git a/hmp.c b/hmp.c
index 290c43d..e13b793 100644
--- a/hmp.c
+++ b/hmp.c
@@ -860,3 +860,13 @@  void hmp_block_job_cancel(Monitor *mon, const QDict *qdict)
 
     hmp_handle_error(mon, &error);
 }
+
+void hmp_dump(Monitor *mon, const QDict *qdict)
+{
+    Error *errp = NULL;
+    int paging = qdict_get_try_bool(qdict, "paging", 0);
+    const char *file = qdict_get_str(qdict, "file");
+
+    qmp_dump(!!paging, file, &errp);
+    hmp_handle_error(mon, &errp);
+}
diff --git a/hmp.h b/hmp.h
index 5409464..b055e50 100644
--- a/hmp.h
+++ b/hmp.h
@@ -59,5 +59,6 @@  void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict);
 void hmp_block_stream(Monitor *mon, const QDict *qdict);
 void hmp_block_job_set_speed(Monitor *mon, const QDict *qdict);
 void hmp_block_job_cancel(Monitor *mon, const QDict *qdict);
+void hmp_dump(Monitor *mon, const QDict *qdict);
 
 #endif
diff --git a/qapi-schema.json b/qapi-schema.json
index 04fa84f..81b8c7c 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1663,3 +1663,17 @@ 
 { 'command': 'qom-list-types',
   'data': { '*implements': 'str', '*abstract': 'bool' },
   'returns': [ 'ObjectTypeInfo' ] }
+
+##
+# @dump
+#
+# Dump guest's memory to vmcore.
+#
+# @paging: if true, do paging to get guest's memory mapping
+# @file: the filename or file descriptor of the vmcore.
+#
+# Returns: nothing on success
+#
+# Since: 1.1
+##
+{ 'command': 'dump', 'data': { 'paging': 'bool', 'file': 'str' } }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index dfe8a5b..9e39bd9 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -586,6 +586,40 @@  Example:
 
 EQMP
 
+#if defined(CONFIG_HAVE_CORE_DUMP)
+    {
+        .name       = "dump",
+        .args_type  = "paging:-p,file:s",
+        .params     = "[-p] file",
+        .help       = "dump to file",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = qmp_marshal_input_dump,
+    },
+
+SQMP
+dump
+
+
+Dump to file. The file can be processed with crash or gdb.
+
+Arguments:
+
+- "paging": do paging to get guest's memory mapping (json-bool)
+- "file": destination file(started with "file:") or destination file descriptor
+          (started with "fd:") (json-string)
+
+Example:
+
+-> { "execute": "dump", "arguments": { "file": "fd:dump" } }
+<- { "return": {} }
+
+Notes:
+
+(1) All boolean arguments default to false
+
+EQMP
+#endif
+
     {
         .name       = "netdev_add",
         .args_type  = "netdev:O",