diff mbox

[v4] block:add-cow file format

Message ID 1319705553-23707-1-git-send-email-wdongxu@linux.vnet.ibm.com
State New
Headers show

Commit Message

Robert Wang Oct. 27, 2011, 8:52 a.m. UTC
Provide a new file format: add-cow. The usage can be found in add-cow.txt of
this patch.

Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
---
 Makefile.objs          |    1 +
 block.c                |    2 +-
 block.h                |    1 +
 block/add-cow.c        |  405 ++++++++++++++++++++++++++++++++++++++++++++++++
 block_int.h            |    1 +
 docs/specs/add-cow.txt |   45 ++++++
 6 files changed, 454 insertions(+), 1 deletions(-)
 create mode 100644 block/add-cow.c
 create mode 100644 docs/specs/add-cow.txt

Comments

Robert Wang Nov. 6, 2011, 6:35 a.m. UTC | #1
Do you have any comments?
Thanks.

2011/10/27 Dong Xu Wang <wdongxu@linux.vnet.ibm.com>:
> Provide a new file format: add-cow. The usage can be found in add-cow.txt of
> this patch.
>
> Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> ---
>  Makefile.objs          |    1 +
>  block.c                |    2 +-
>  block.h                |    1 +
>  block/add-cow.c        |  405 ++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h            |    1 +
>  docs/specs/add-cow.txt |   45 ++++++
>  6 files changed, 454 insertions(+), 1 deletions(-)
>  create mode 100644 block/add-cow.c
>  create mode 100644 docs/specs/add-cow.txt
>
> diff --git a/Makefile.objs b/Makefile.objs
> index 01587c8..208c12c 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>
>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
> +block-nested-y += add-cow.o
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-nested-y += qed-check.o
>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> diff --git a/block.c b/block.c
> index 70aab63..e343995 100644
> --- a/block.c
> +++ b/block.c
> @@ -105,7 +105,7 @@ int is_windows_drive(const char *filename)
>  #endif
>
>  /* check if the path starts with "<protocol>:" */
> -static int path_has_protocol(const char *path)
> +int path_has_protocol(const char *path)
>  {
>  #ifdef _WIN32
>     if (is_windows_drive(path) ||
> diff --git a/block.h b/block.h
> index 5a042c9..dff5197 100644
> --- a/block.h
> +++ b/block.h
> @@ -271,6 +271,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
>
>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>  int path_is_absolute(const char *path);
> +int path_has_protocol(const char *path);
>  void path_combine(char *dest, int dest_size,
>                   const char *base_path,
>                   const char *filename);
> diff --git a/block/add-cow.c b/block/add-cow.c
> new file mode 100644
> index 0000000..93d5b13
> --- /dev/null
> +++ b/block/add-cow.c
> @@ -0,0 +1,405 @@
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "module.h"
> +
> +#define ADD_COW_MAGIC  (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
> +                        ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
> +                        ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
> +                        ((uint64_t)'W' << 8) | 0xFF)
> +#define ADD_COW_VERSION 1
> +#define ADD_COW_FILE_LEN 1024
> +
> +typedef struct AddCowHeader {
> +    uint64_t magic;
> +    uint32_t version;
> +    char backing_file[ADD_COW_FILE_LEN];
> +    char image_file[ADD_COW_FILE_LEN];
> +    uint64_t size;
> +} QEMU_PACKED AddCowHeader;
> +
> +typedef struct BDRVAddCowState {
> +    char image_file[ADD_COW_FILE_LEN];
> +    BlockDriverState *image_hd;
> +    uint8_t *bitmap;
> +    uint64_t bitmap_size;
> +    CoMutex lock;
> +} BDRVAddCowState;
> +
> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
> +{
> +    const AddCowHeader *header = (const void *)buf;
> +
> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
> +        return 100;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static int add_cow_open(BlockDriverState *bs, int flags)
> +{
> +    AddCowHeader header;
> +    int64_t size;
> +    char image_filename[ADD_COW_FILE_LEN];
> +    int image_flags;
> +    BlockDriver *image_drv = NULL;
> +    int ret;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +
> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
> +    if (ret != sizeof(header)) {
> +        goto fail;
> +    }
> +
> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
> +        ret = -1;
> +        goto fail;
> +    }
> +
> +    size = be64_to_cpu(header.size);
> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
> +
> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) != sizeof(header.image_file));
> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
> +            header.backing_file);
> +    pstrcpy(state->image_file, sizeof(state->image_file),
> +            header.image_file);
> +
> +    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
> +    state->bitmap = g_malloc0(state->bitmap_size);
> +
> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
> +            state->bitmap_size);
> +    if (ret != state->bitmap_size) {
> +        goto fail;
> +    }
> +   /* If there is a image_file, must be together with backing_file */
> +    if (state->image_file[0] != '\0') {
> +        state->image_hd = bdrv_new("");
> +        /* Relative to image or working dir, need discussion */
> +        if (path_has_protocol(state->image_file)) {
> +            pstrcpy(image_filename, sizeof(image_filename),
> +                    state->image_file);
> +        } else {
> +            path_combine(image_filename, sizeof(image_filename),
> +                         bs->filename, state->image_file);
> +        }
> +
> +        image_drv = bdrv_find_format("raw");
> +        image_flags =
> +             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) | BDRV_O_RDWR;
> +        state->image_hd->keep_read_only = 0;
> +
> +        ret = bdrv_open(state->image_hd, image_filename, image_flags,
> +                image_drv);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +    } else {
> +        ret = -ENOENT;
> +        goto fail;
> +    }
> +    qemu_co_mutex_init(&state->lock);
> +    return 0;
> + fail:
> +    if (state->bitmap) {
> +        g_free(state->bitmap);
> +        state->bitmap = NULL;
> +    }
> +    return ret;
> +}
> +
> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
> +{
> +    uint64_t offset = bitnum / 8;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    state->bitmap[offset] |= (1 << (bitnum % 8));
> +}
> +
> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
> +{
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    uint64_t offset = bitnum / 8;
> +    return !!(state->bitmap[offset] & (1 << (bitnum % 8)));
> +}
> +
> +static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors, int *num_same)
> +{
> +    int changed;
> +    uint64_t bitmap_size = ((BDRVAddCowState *)(bs->opaque))->bitmap_size;
> +
> +    /* Beyond the end of bitmap, return error or read from backing_file? */
> +    if (((sector_num + nb_sectors + 7) / 8) > bitmap_size) {
> +        return 0;
> +    }
> +
> +    if (nb_sectors == 0) {
> +        *num_same = nb_sectors;
> +        return 0;
> +    }
> +
> +    changed = is_bit_set(bs, sector_num);
> +    if (changed < 0) {
> +        return 0;
> +    }
> +
> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
> +            break;
> +        }
> +    }
> +
> +    return changed;
> +}
> +
> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors)
> +{
> +    int i, ret = 0;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    uint64_t start_pos = sector_num / 8;
> +    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
> +
> +    if (start_pos > state->bitmap_size) {
> +        return -1;
> +    }
> +
> +    for (i = 0; i < nb_sectors; i++) {
> +        add_cow_set_bit(bs, sector_num + i);
> +    }
> +    ret = bdrv_pwrite_sync(bs->file,
> +        sizeof(AddCowHeader) + start_pos,
> +        state->bitmap + start_pos,
> +        MIN(((end_pos - start_pos) & (~512)) + 512, state->bitmap_size - start_pos));
> +    return ret;
> +}
> +
> +static void add_cow_close(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    if (state->bitmap) {
> +        g_free(state->bitmap);
> +        state->bitmap = NULL;
> +    }
> +}
> +
> +static int add_cow_create(const char *filename, QEMUOptionParameter *options)
> +{
> +    AddCowHeader header;
> +    int64_t image_sectors = 0;
> +    const char *backing_filename = NULL;
> +    const char *image_filename = NULL;
> +    int ret;
> +    BlockDriverState *bs, *image_bs = NULL;
> +
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
> +            backing_filename = options->value.s;
> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
> +            image_filename = options->value.s;
> +        }
> +        options++;
> +    }
> +
> +    if (!backing_filename || !image_filename) {
> +        return -EINVAL;
> +    }
> +    /* Make sure image file exists */
> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
> +            | BDRV_O_CACHE_WB);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    bdrv_close(image_bs);
> +    ret = bdrv_create_file(filename, NULL);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    memset(&header, 0, sizeof(header));
> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
> +    header.version = cpu_to_be32(ADD_COW_VERSION);
> +    pstrcpy(header.backing_file, sizeof(header.backing_file), backing_filename);
> +    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
> +    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
> +
> +    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
> +    if (ret < 0) {
> +        bdrv_close(bs);
> +        return ret;
> +    }
> +    bdrv_close(bs);
> +
> +    BlockDriver *drv = bdrv_find_format("add-cow");
> +    assert(drv != NULL);
> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
> +    bdrv_close(bs);
> +    return ret;
> +}
> +
> +static int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
> +                         int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int cur_nr_sectors;
> +    uint64_t bytes_done = 0;
> +    QEMUIOVector hd_qiov;
> +    int n, ret = 0;
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    while (remaining_sectors != 0) {
> +        cur_nr_sectors = remaining_sectors;
> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
> +            cur_nr_sectors = n;
> +            qemu_iovec_reset(&hd_qiov);
> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
> +            if (ret < 0) {
> +                goto fail;
> +            }
> +        } else {
> +            cur_nr_sectors = n;
> +            if (bs->backing_hd) {
> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
> +                                    n, &hd_qiov);
> +                if (ret < 0) {
> +                    goto fail;
> +                }
> +            } else {
> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_memset(&hd_qiov, 0,
> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);
> +            }
> +        }
> +        remaining_sectors -= cur_nr_sectors;
> +        sector_num += cur_nr_sectors;
> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
> +                          int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = 0;
> +    QEMUIOVector hd_qiov;
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    qemu_iovec_reset(&hd_qiov);
> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
> +    ret = bdrv_co_writev(s->image_hd,
> +                     sector_num,
> +                     remaining_sectors, &hd_qiov);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
> +{
> +    int ret = 0;
> +    int64_t image_sectors = offset/BDRV_SECTOR_SIZE;
> +    int64_t be_offset = cpu_to_be64(offset);
> +    BDRVAddCowState *state = bs->opaque;
> +    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
> +        + sizeof(AddCowHeader));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
> +        &be_offset, sizeof(uint64_t));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(state->image_hd, offset);
> +    return ret;
> +}
> +
> +static int add_cow_co_flush(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *state = bs->opaque;
> +    int ret = bdrv_co_flush(state->image_hd);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return bdrv_co_flush(bs->file);
> +}
> +
> +static QEMUOptionParameter add_cow_create_options[] = {
> +    {
> +        .name = BLOCK_OPT_SIZE,
> +        .type = OPT_SIZE,
> +        .help = "Virtual disk size"
> +    },
> +    {
> +        .name = BLOCK_OPT_BACKING_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a base image"
> +    },
> +    {
> +        .name = BLOCK_OPT_IMAGE_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a image file"
> +    },
> +    { NULL }
> +};
> +
> +static BlockDriver bdrv_add_cow = {
> +    .format_name        = "add-cow",
> +    .instance_size      = sizeof(BDRVAddCowState),
> +    .bdrv_probe         = add_cow_probe,
> +    .bdrv_open          = add_cow_open,
> +    .bdrv_close         = add_cow_close,
> +    .bdrv_create        = add_cow_create,
> +    .bdrv_is_allocated  = add_cow_is_allocated,
> +
> +    .bdrv_co_readv      = add_cow_co_readv,
> +    .bdrv_co_writev     = add_cow_co_writev,
> +    .bdrv_truncate      = bdrv_add_cow_truncate,
> +
> +    .create_options     = add_cow_create_options,
> +    .bdrv_co_flush      = add_cow_co_flush,
> +};
> +
> +static void bdrv_add_cow_init(void)
> +{
> +    bdrv_register(&bdrv_add_cow);
> +}
> +
> +block_init(bdrv_add_cow_init);
> diff --git a/block_int.h b/block_int.h
> index dac00f5..7409c19 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -42,6 +42,7 @@
>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>  #define BLOCK_OPT_PREALLOC      "preallocation"
>  #define BLOCK_OPT_SUBFMT        "subformat"
> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>
>  typedef struct AIOPool {
>     void (*cancel)(BlockDriverAIOCB *acb);
> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
> new file mode 100644
> index 0000000..0443c6c
> --- /dev/null
> +++ b/docs/specs/add-cow.txt
> @@ -0,0 +1,45 @@
> +== General ==
> +
> +Raw file format does not support backing_file and copy on write feature. Then
> +you can use add-cow file realize these features.
> +
> +When using add-cow, procedures may like this:
> +(ubuntu.img is a disk image which has been installed OS.)
> +    1)  Create a raw image with the same size of ubuntu.img
> +            qemu-img create -f raw test.raw 8G
> +    2)  Create a add-cow image which will store dirty bitmap
> +            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
> +    3)  Run qemu with add-cow image
> +            qemu -drive if=virtio,file=test.add-cow
> +
> +=Specification=
> +
> +The file format looks like this:
> +
> + +----------+----------+----------+-----+
> + |  Header  |   Data   |   Data   | ... |
> + +----------+----------+----------+-----+
> +
> + All numbers in add-cow are stored in Big Endian byte order.
> +
> +
> +== Header ==
> +
> +The Header is included in the first bytes:
> +
> +    Byte  0 -  7:       magic
> +                        add-cow magic string ("ADD_COW\xff")
> +
> +          8 -  11:      version
> +                        Version number (only valid value is 1 now)
> +
> +          12 - 1035:    backing_file
> +                        backing_file file name related to add-cow file. While
> +                        using backing_file, must together with image_file.
> +
> +         1036 - 2059:   image_file
> +                        image_file is a raw file, While using image_file, must
> +                        together with image_file.
> +
> +         2060 - 2067:   size
> +                        Virtual disk size of image_file in bytes.
> --
> 1.7.5.4
>
>
>
Kevin Wolf Nov. 9, 2011, 2:13 p.m. UTC | #2
Am 27.10.2011 10:52, schrieb Dong Xu Wang:
> Provide a new file format: add-cow. The usage can be found in add-cow.txt of
> this patch.
> 
> Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> ---
>  Makefile.objs          |    1 +
>  block.c                |    2 +-
>  block.h                |    1 +
>  block/add-cow.c        |  405 ++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h            |    1 +
>  docs/specs/add-cow.txt |   45 ++++++
>  6 files changed, 454 insertions(+), 1 deletions(-)
>  create mode 100644 block/add-cow.c
>  create mode 100644 docs/specs/add-cow.txt
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 01587c8..208c12c 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>  
>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
> +block-nested-y += add-cow.o
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-nested-y += qed-check.o
>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> diff --git a/block.c b/block.c
> index 70aab63..e343995 100644
> --- a/block.c
> +++ b/block.c
> @@ -105,7 +105,7 @@ int is_windows_drive(const char *filename)
>  #endif
>  
>  /* check if the path starts with "<protocol>:" */
> -static int path_has_protocol(const char *path)
> +int path_has_protocol(const char *path)
>  {
>  #ifdef _WIN32
>      if (is_windows_drive(path) ||
> diff --git a/block.h b/block.h
> index 5a042c9..dff5197 100644
> --- a/block.h
> +++ b/block.h
> @@ -271,6 +271,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
>  
>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>  int path_is_absolute(const char *path);
> +int path_has_protocol(const char *path);
>  void path_combine(char *dest, int dest_size,
>                    const char *base_path,
>                    const char *filename);
> diff --git a/block/add-cow.c b/block/add-cow.c
> new file mode 100644
> index 0000000..93d5b13
> --- /dev/null
> +++ b/block/add-cow.c
> @@ -0,0 +1,405 @@
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "module.h"
> +
> +#define ADD_COW_MAGIC  (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
> +                        ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
> +                        ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
> +                        ((uint64_t)'W' << 8) | 0xFF)
> +#define ADD_COW_VERSION 1
> +#define ADD_COW_FILE_LEN 1024
> +
> +typedef struct AddCowHeader {
> +    uint64_t magic;
> +    uint32_t version;
> +    char backing_file[ADD_COW_FILE_LEN];
> +    char image_file[ADD_COW_FILE_LEN];
> +    uint64_t size;
> +} QEMU_PACKED AddCowHeader;
> +
> +typedef struct BDRVAddCowState {
> +    char image_file[ADD_COW_FILE_LEN];
> +    BlockDriverState *image_hd;
> +    uint8_t *bitmap;
> +    uint64_t bitmap_size;
> +    CoMutex lock;
> +} BDRVAddCowState;

You could align the field names, that would make it a bit more readable.

> +
> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
> +{
> +    const AddCowHeader *header = (const void *)buf;
> +
> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
> +        return 100;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static int add_cow_open(BlockDriverState *bs, int flags)
> +{
> +    AddCowHeader header;
> +    int64_t size;
> +    char image_filename[ADD_COW_FILE_LEN];
> +    int image_flags;
> +    BlockDriver *image_drv = NULL;
> +    int ret;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +
> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
> +    if (ret != sizeof(header)) {
> +        goto fail;
> +    }
> +
> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
> +        ret = -1;

The return value is interpreted as -errno, so -1 typically means EPERM,
which is misleading. I think -ENOTSUP or -EINVAL would be a good choice.
For newer versions of the file format, have a look at how qcow2 handles it.

> +        goto fail;
> +    }
> +
> +    size = be64_to_cpu(header.size);
> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
> +
> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) != sizeof(header.image_file));
> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
> +            header.backing_file);
> +    pstrcpy(state->image_file, sizeof(state->image_file),
> +            header.image_file);
> +
> +    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
> +    state->bitmap = g_malloc0(state->bitmap_size);
> +
> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
> +            state->bitmap_size);
> +    if (ret != state->bitmap_size) {
> +        goto fail;
> +    }
> +   /* If there is a image_file, must be together with backing_file */
> +    if (state->image_file[0] != '\0') {

The other checks check for an error. This one checks for success and has
a large 'then' block. Any reason for not staying consistent and doing if
(... == '\0') { ret = -ENOENT; goto fail; } here?

> +        state->image_hd = bdrv_new("");
> +        /* Relative to image or working dir, need discussion */

You implemented it as relative to the image, which I believe is the best
way to handle it. It is consistent with how backing files are handled.

So is there anything left to discuss or should just the comment be updated?

> +        if (path_has_protocol(state->image_file)) {
> +            pstrcpy(image_filename, sizeof(image_filename),
> +                    state->image_file);
> +        } else {
> +            path_combine(image_filename, sizeof(image_filename),
> +                         bs->filename, state->image_file);
> +        }
> +
> +        image_drv = bdrv_find_format("raw");
> +        image_flags =
> +             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) | BDRV_O_RDWR;
> +        state->image_hd->keep_read_only = 0;
> +
> +        ret = bdrv_open(state->image_hd, image_filename, image_flags,
> +                image_drv);
> +        if (ret < 0) {
> +            goto fail;

This leaks state->image_hd.

> +        }
> +    } else {
> +        ret = -ENOENT;
> +        goto fail;
> +    }
> +    qemu_co_mutex_init(&state->lock);
> +    return 0;
> + fail:
> +    if (state->bitmap) {
> +        g_free(state->bitmap);
> +        state->bitmap = NULL;
> +    }

The check isn't required, g_free(NULL) works fine.

> +    return ret;
> +}
> +
> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
> +{
> +    uint64_t offset = bitnum / 8;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    state->bitmap[offset] |= (1 << (bitnum % 8));
> +}
> +
> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
> +{
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    uint64_t offset = bitnum / 8;
> +    return !!(state->bitmap[offset] & (1 << (bitnum % 8)));
> +}
> +
> +static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors, int *num_same)
> +{
> +    int changed;
> +    uint64_t bitmap_size = ((BDRVAddCowState *)(bs->opaque))->bitmap_size;
> +
> +    /* Beyond the end of bitmap, return error or read from backing_file? */
> +    if (((sector_num + nb_sectors + 7) / 8) > bitmap_size) {
> +        return 0;
> +    }
> +
> +    if (nb_sectors == 0) {
> +        *num_same = nb_sectors;
> +        return 0;
> +    }
> +
> +    changed = is_bit_set(bs, sector_num);
> +    if (changed < 0) {
> +        return 0;
> +    }
> +
> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
> +            break;
> +        }
> +    }
> +
> +    return changed;
> +}
> +
> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors)
> +{
> +    int i, ret = 0;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    uint64_t start_pos = sector_num / 8;
> +    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
> +
> +    if (start_pos > state->bitmap_size) {
> +        return -1;
> +    }
> +
> +    for (i = 0; i < nb_sectors; i++) {
> +        add_cow_set_bit(bs, sector_num + i);
> +    }
> +    ret = bdrv_pwrite_sync(bs->file,
> +        sizeof(AddCowHeader) + start_pos,
> +        state->bitmap + start_pos,
> +        MIN(((end_pos - start_pos) & (~512)) + 512, state->bitmap_size - start_pos));

This turn effectively everything into cache=writethrough. Which is not
wrong, but very conservative and will cost a lot of performance.

One optimisation is easy: Check if the bitmap is really changed or if
you only set bits that were already set before the update. If they were
already set, you can skip the bdrv_pwrite_sync().

I think it would also be safe to use bdrv_pwrite without an explicit
sync, this would save some more flushes.

> +    return ret;
> +}
> +
> +static void add_cow_close(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    if (state->bitmap) {
> +        g_free(state->bitmap);
> +        state->bitmap = NULL;
> +    }
> +}

Can state->bitmap ever be NULL for an open image?

> +
> +static int add_cow_create(const char *filename, QEMUOptionParameter *options)
> +{
> +    AddCowHeader header;
> +    int64_t image_sectors = 0;
> +    const char *backing_filename = NULL;
> +    const char *image_filename = NULL;
> +    int ret;
> +    BlockDriverState *bs, *image_bs = NULL;
> +
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
> +            backing_filename = options->value.s;
> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
> +            image_filename = options->value.s;
> +        }
> +        options++;
> +    }
> +
> +    if (!backing_filename || !image_filename) {
> +        return -EINVAL;
> +    }

Maybe an error_report() that tells the user what's going wrong?

> +    /* Make sure image file exists */
> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
> +            | BDRV_O_CACHE_WB);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    bdrv_close(image_bs);
> +    ret = bdrv_create_file(filename, NULL);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    memset(&header, 0, sizeof(header));
> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
> +    header.version = cpu_to_be32(ADD_COW_VERSION);
> +    pstrcpy(header.backing_file, sizeof(header.backing_file), backing_filename);
> +    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
> +    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
> +
> +    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
> +    if (ret < 0) {
> +        bdrv_close(bs);
> +        return ret;
> +    }
> +    bdrv_close(bs);
> +
> +    BlockDriver *drv = bdrv_find_format("add-cow");
> +    assert(drv != NULL);
> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
> +    bdrv_close(bs);
> +    return ret;
> +}

bdrv_close should be replaced by bdrv_delete in all cases.

> +
> +static int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
> +                         int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int cur_nr_sectors;
> +    uint64_t bytes_done = 0;
> +    QEMUIOVector hd_qiov;
> +    int n, ret = 0;
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    while (remaining_sectors != 0) {
> +        cur_nr_sectors = remaining_sectors;
> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
> +            cur_nr_sectors = n;
> +            qemu_iovec_reset(&hd_qiov);
> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
> +            if (ret < 0) {
> +                goto fail;
> +            }
> +        } else {
> +            cur_nr_sectors = n;
> +            if (bs->backing_hd) {
> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
> +                                    n, &hd_qiov);
> +                if (ret < 0) {
> +                    goto fail;
> +                }
> +            } else {
> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_memset(&hd_qiov, 0,
> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);
> +            }
> +        }
> +        remaining_sectors -= cur_nr_sectors;
> +        sector_num += cur_nr_sectors;
> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
> +                          int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = 0;
> +    QEMUIOVector hd_qiov;
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    qemu_iovec_reset(&hd_qiov);
> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
> +    ret = bdrv_co_writev(s->image_hd,
> +                     sector_num,
> +                     remaining_sectors, &hd_qiov);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
> +{
> +    int ret = 0;
> +    int64_t image_sectors = offset/BDRV_SECTOR_SIZE;
> +    int64_t be_offset = cpu_to_be64(offset);
> +    BDRVAddCowState *state = bs->opaque;
> +    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
> +        + sizeof(AddCowHeader));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
> +        &be_offset, sizeof(uint64_t));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(state->image_hd, offset);
> +    return ret;
> +}

When an error occurs, you may end up with an add_cow image whose size
disagrees with the raw image's size.

I think we can do a bit better here. First thing would be to try
truncating the base image first (many formats don't support truncate at
all, so this is the most likely failure case). The other thing worth
doing would be undoing the header update if a truncate fails afterwards.

> +
> +static int add_cow_co_flush(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *state = bs->opaque;
> +    int ret = bdrv_co_flush(state->image_hd);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return bdrv_co_flush(bs->file);
> +}
> +
> +static QEMUOptionParameter add_cow_create_options[] = {
> +    {
> +        .name = BLOCK_OPT_SIZE,
> +        .type = OPT_SIZE,
> +        .help = "Virtual disk size"
> +    },
> +    {
> +        .name = BLOCK_OPT_BACKING_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a base image"
> +    },
> +    {
> +        .name = BLOCK_OPT_IMAGE_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a image file"
> +    },
> +    { NULL }
> +};
> +
> +static BlockDriver bdrv_add_cow = {
> +    .format_name        = "add-cow",
> +    .instance_size      = sizeof(BDRVAddCowState),
> +    .bdrv_probe         = add_cow_probe,
> +    .bdrv_open          = add_cow_open,
> +    .bdrv_close         = add_cow_close,
> +    .bdrv_create        = add_cow_create,
> +    .bdrv_is_allocated  = add_cow_is_allocated,
> +
> +    .bdrv_co_readv      = add_cow_co_readv,
> +    .bdrv_co_writev     = add_cow_co_writev,
> +    .bdrv_truncate      = bdrv_add_cow_truncate,
> +
> +    .create_options     = add_cow_create_options,
> +    .bdrv_co_flush      = add_cow_co_flush,
> +};
> +
> +static void bdrv_add_cow_init(void)
> +{
> +    bdrv_register(&bdrv_add_cow);
> +}
> +
> +block_init(bdrv_add_cow_init);
> diff --git a/block_int.h b/block_int.h
> index dac00f5..7409c19 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -42,6 +42,7 @@
>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>  #define BLOCK_OPT_PREALLOC      "preallocation"
>  #define BLOCK_OPT_SUBFMT        "subformat"
> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>  
>  typedef struct AIOPool {
>      void (*cancel)(BlockDriverAIOCB *acb);
> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
> new file mode 100644
> index 0000000..0443c6c
> --- /dev/null
> +++ b/docs/specs/add-cow.txt
> @@ -0,0 +1,45 @@
> +== General ==
> +
> +Raw file format does not support backing_file and copy on write feature. Then
> +you can use add-cow file realize these features.
> +
> +When using add-cow, procedures may like this:
> +(ubuntu.img is a disk image which has been installed OS.)
> +    1)  Create a raw image with the same size of ubuntu.img
> +            qemu-img create -f raw test.raw 8G
> +    2)  Create a add-cow image which will store dirty bitmap
> +            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
> +    3)  Run qemu with add-cow image
> +            qemu -drive if=virtio,file=test.add-cow
> +
> +=Specification=
> +
> +The file format looks like this:
> +
> + +----------+----------+----------+-----+
> + |  Header  |   Data   |   Data   | ... |
> + +----------+----------+----------+-----+
> +
> + All numbers in add-cow are stored in Big Endian byte order.

What the data blocks mean isn't explained in the spec.

> +
> +
> +== Header ==
> +
> +The Header is included in the first bytes:
> +
> +    Byte  0 -  7:       magic
> +                        add-cow magic string ("ADD_COW\xff")
> +
> +          8 -  11:      version
> +                        Version number (only valid value is 1 now)
> +
> +          12 - 1035:    backing_file
> +                        backing_file file name related to add-cow file. While
> +                        using backing_file, must together with image_file.
> +
> +         1036 - 2059:   image_file
> +                        image_file is a raw file, While using image_file, must
> +                        together with image_file.
> +
> +         2060 - 2067:   size
> +                        Virtual disk size of image_file in bytes.

Kevin
Robert Wang Nov. 10, 2011, 2:11 a.m. UTC | #3
2011/11/9 Kevin Wolf <kwolf@redhat.com>:
> Am 27.10.2011 10:52, schrieb Dong Xu Wang:
>> Provide a new file format: add-cow. The usage can be found in add-cow.txt of
>> this patch.
>>
>> Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
>> ---
>>  Makefile.objs          |    1 +
>>  block.c                |    2 +-
>>  block.h                |    1 +
>>  block/add-cow.c        |  405 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  block_int.h            |    1 +
>>  docs/specs/add-cow.txt |   45 ++++++
>>  6 files changed, 454 insertions(+), 1 deletions(-)
>>  create mode 100644 block/add-cow.c
>>  create mode 100644 docs/specs/add-cow.txt
>>
>> diff --git a/Makefile.objs b/Makefile.objs
>> index 01587c8..208c12c 100644
>> --- a/Makefile.objs
>> +++ b/Makefile.objs
>> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>
>>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>> +block-nested-y += add-cow.o
>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>  block-nested-y += qed-check.o
>>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>> diff --git a/block.c b/block.c
>> index 70aab63..e343995 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -105,7 +105,7 @@ int is_windows_drive(const char *filename)
>>  #endif
>>
>>  /* check if the path starts with "<protocol>:" */
>> -static int path_has_protocol(const char *path)
>> +int path_has_protocol(const char *path)
>>  {
>>  #ifdef _WIN32
>>      if (is_windows_drive(path) ||
>> diff --git a/block.h b/block.h
>> index 5a042c9..dff5197 100644
>> --- a/block.h
>> +++ b/block.h
>> @@ -271,6 +271,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
>>
>>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>>  int path_is_absolute(const char *path);
>> +int path_has_protocol(const char *path);
>>  void path_combine(char *dest, int dest_size,
>>                    const char *base_path,
>>                    const char *filename);
>> diff --git a/block/add-cow.c b/block/add-cow.c
>> new file mode 100644
>> index 0000000..93d5b13
>> --- /dev/null
>> +++ b/block/add-cow.c
>> @@ -0,0 +1,405 @@
>> +#include "qemu-common.h"
>> +#include "block_int.h"
>> +#include "module.h"
>> +
>> +#define ADD_COW_MAGIC  (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
>> +                        ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
>> +                        ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
>> +                        ((uint64_t)'W' << 8) | 0xFF)
>> +#define ADD_COW_VERSION 1
>> +#define ADD_COW_FILE_LEN 1024
>> +
>> +typedef struct AddCowHeader {
>> +    uint64_t magic;
>> +    uint32_t version;
>> +    char backing_file[ADD_COW_FILE_LEN];
>> +    char image_file[ADD_COW_FILE_LEN];
>> +    uint64_t size;
>> +} QEMU_PACKED AddCowHeader;
>> +
>> +typedef struct BDRVAddCowState {
>> +    char image_file[ADD_COW_FILE_LEN];
>> +    BlockDriverState *image_hd;
>> +    uint8_t *bitmap;
>> +    uint64_t bitmap_size;
>> +    CoMutex lock;
>> +} BDRVAddCowState;
>
> You could align the field names, that would make it a bit more readable.
>
>> +
>> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
>> +{
>> +    const AddCowHeader *header = (const void *)buf;
>> +
>> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
>> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
>> +        return 100;
>> +    } else {
>> +        return 0;
>> +    }
>> +}
>> +
>> +static int add_cow_open(BlockDriverState *bs, int flags)
>> +{
>> +    AddCowHeader header;
>> +    int64_t size;
>> +    char image_filename[ADD_COW_FILE_LEN];
>> +    int image_flags;
>> +    BlockDriver *image_drv = NULL;
>> +    int ret;
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +
>> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
>> +    if (ret != sizeof(header)) {
>> +        goto fail;
>> +    }
>> +
>> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
>> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
>> +        ret = -1;
>
> The return value is interpreted as -errno, so -1 typically means EPERM,
> which is misleading. I think -ENOTSUP or -EINVAL would be a good choice.
> For newer versions of the file format, have a look at how qcow2 handles it.
Okay.
>
>> +        goto fail;
>> +    }
>> +
>> +    size = be64_to_cpu(header.size);
>> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
>> +
>> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) != sizeof(header.image_file));
>> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
>> +            header.backing_file);
>> +    pstrcpy(state->image_file, sizeof(state->image_file),
>> +            header.image_file);
>> +
>> +    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
>> +    state->bitmap = g_malloc0(state->bitmap_size);
>> +
>> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
>> +            state->bitmap_size);
>> +    if (ret != state->bitmap_size) {
>> +        goto fail;
>> +    }
>> +   /* If there is a image_file, must be together with backing_file */
>> +    if (state->image_file[0] != '\0') {
>
> The other checks check for an error. This one checks for success and has
> a large 'then' block. Any reason for not staying consistent and doing if
> (... == '\0') { ret = -ENOENT; goto fail; } here?
>
Okay.
>> +        state->image_hd = bdrv_new("");
>> +        /* Relative to image or working dir, need discussion */
>
> You implemented it as relative to the image, which I believe is the best
> way to handle it. It is consistent with how backing files are handled.
>
> So is there anything left to discuss or should just the comment be updated?
>

Okay. In the previous emails, you said relative to the image instead
of relative
to the working directory might confuse users.

>> +        if (path_has_protocol(state->image_file)) {
>> +            pstrcpy(image_filename, sizeof(image_filename),
>> +                    state->image_file);
>> +        } else {
>> +            path_combine(image_filename, sizeof(image_filename),
>> +                         bs->filename, state->image_file);
>> +        }
>> +
>> +        image_drv = bdrv_find_format("raw");
>> +        image_flags =
>> +             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) | BDRV_O_RDWR;
>> +        state->image_hd->keep_read_only = 0;
>> +
>> +        ret = bdrv_open(state->image_hd, image_filename, image_flags,
>> +                image_drv);
>> +        if (ret < 0) {
>> +            goto fail;
>
> This leaks state->image_hd.
>
>> +        }
>> +    } else {
>> +        ret = -ENOENT;
>> +        goto fail;
>> +    }
>> +    qemu_co_mutex_init(&state->lock);
>> +    return 0;
>> + fail:
>> +    if (state->bitmap) {
>> +        g_free(state->bitmap);
>> +        state->bitmap = NULL;
>> +    }
>
> The check isn't required, g_free(NULL) works fine.
>
>> +    return ret;
>> +}
>> +
>> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
>> +{
>> +    uint64_t offset = bitnum / 8;
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    state->bitmap[offset] |= (1 << (bitnum % 8));
>> +}
>> +
>> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
>> +{
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    uint64_t offset = bitnum / 8;
>> +    return !!(state->bitmap[offset] & (1 << (bitnum % 8)));
>> +}
>> +
>> +static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
>> +        int nb_sectors, int *num_same)
>> +{
>> +    int changed;
>> +    uint64_t bitmap_size = ((BDRVAddCowState *)(bs->opaque))->bitmap_size;
>> +
>> +    /* Beyond the end of bitmap, return error or read from backing_file? */
>> +    if (((sector_num + nb_sectors + 7) / 8) > bitmap_size) {
>> +        return 0;
>> +    }
>> +
>> +    if (nb_sectors == 0) {
>> +        *num_same = nb_sectors;
>> +        return 0;
>> +    }
>> +
>> +    changed = is_bit_set(bs, sector_num);
>> +    if (changed < 0) {
>> +        return 0;
>> +    }
>> +
>> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
>> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
>> +            break;
>> +        }
>> +    }
>> +
>> +    return changed;
>> +}
>> +
>> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
>> +        int nb_sectors)
>> +{
>> +    int i, ret = 0;
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    uint64_t start_pos = sector_num / 8;
>> +    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
>> +
>> +    if (start_pos > state->bitmap_size) {
>> +        return -1;
>> +    }
>> +
>> +    for (i = 0; i < nb_sectors; i++) {
>> +        add_cow_set_bit(bs, sector_num + i);
>> +    }
>> +    ret = bdrv_pwrite_sync(bs->file,
>> +        sizeof(AddCowHeader) + start_pos,
>> +        state->bitmap + start_pos,
>> +        MIN(((end_pos - start_pos) & (~512)) + 512, state->bitmap_size - start_pos));
>
> This turn effectively everything into cache=writethrough. Which is not
> wrong, but very conservative and will cost a lot of performance.
>
> One optimisation is easy: Check if the bitmap is really changed or if
> you only set bits that were already set before the update. If they were
> already set, you can skip the bdrv_pwrite_sync().
>
> I think it would also be safe to use bdrv_pwrite without an explicit
> sync, this would save some more flushes.
>
Okay.
>> +    return ret;
>> +}
>> +
>> +static void add_cow_close(BlockDriverState *bs)
>> +{
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    if (state->bitmap) {
>> +        g_free(state->bitmap);
>> +        state->bitmap = NULL;
>> +    }
>> +}
>
> Can state->bitmap ever be NULL for an open image?
>
state->bitmap can not be NULL, yes, check is useless.
>> +
>> +static int add_cow_create(const char *filename, QEMUOptionParameter *options)
>> +{
>> +    AddCowHeader header;
>> +    int64_t image_sectors = 0;
>> +    const char *backing_filename = NULL;
>> +    const char *image_filename = NULL;
>> +    int ret;
>> +    BlockDriverState *bs, *image_bs = NULL;
>> +
>> +    while (options && options->name) {
>> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
>> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
>> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
>> +            backing_filename = options->value.s;
>> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
>> +            image_filename = options->value.s;
>> +        }
>> +        options++;
>> +    }
>> +
>> +    if (!backing_filename || !image_filename) {
>> +        return -EINVAL;
>> +    }
>
> Maybe an error_report() that tells the user what's going wrong?
>
Okay.
>> +    /* Make sure image file exists */
>> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
>> +            | BDRV_O_CACHE_WB);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +    bdrv_close(image_bs);
>> +    ret = bdrv_create_file(filename, NULL);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    memset(&header, 0, sizeof(header));
>> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
>> +    header.version = cpu_to_be32(ADD_COW_VERSION);
>> +    pstrcpy(header.backing_file, sizeof(header.backing_file), backing_filename);
>> +    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
>> +    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
>> +
>> +    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
>> +    if (ret < 0) {
>> +        bdrv_close(bs);
>> +        return ret;
>> +    }
>> +    bdrv_close(bs);
>> +
>> +    BlockDriver *drv = bdrv_find_format("add-cow");
>> +    assert(drv != NULL);
>> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
>> +    bdrv_close(bs);
>> +    return ret;
>> +}
>
> bdrv_close should be replaced by bdrv_delete in all cases.
>
>> +
>> +static int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
>> +                         int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int cur_nr_sectors;
>> +    uint64_t bytes_done = 0;
>> +    QEMUIOVector hd_qiov;
>> +    int n, ret = 0;
>> +
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +    qemu_co_mutex_lock(&s->lock);
>> +    while (remaining_sectors != 0) {
>> +        cur_nr_sectors = remaining_sectors;
>> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
>> +            cur_nr_sectors = n;
>> +            qemu_iovec_reset(&hd_qiov);
>> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>> +            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
>> +            if (ret < 0) {
>> +                goto fail;
>> +            }
>> +        } else {
>> +            cur_nr_sectors = n;
>> +            if (bs->backing_hd) {
>> +                qemu_iovec_reset(&hd_qiov);
>> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
>> +                                    n, &hd_qiov);
>> +                if (ret < 0) {
>> +                    goto fail;
>> +                }
>> +            } else {
>> +                qemu_iovec_reset(&hd_qiov);
>> +                qemu_iovec_memset(&hd_qiov, 0,
>> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);
>> +            }
>> +        }
>> +        remaining_sectors -= cur_nr_sectors;
>> +        sector_num += cur_nr_sectors;
>> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
>> +    }
>> +fail:
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
>> +                          int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int ret = 0;
>> +    QEMUIOVector hd_qiov;
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +    qemu_co_mutex_lock(&s->lock);
>> +    qemu_iovec_reset(&hd_qiov);
>> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
>> +    ret = bdrv_co_writev(s->image_hd,
>> +                     sector_num,
>> +                     remaining_sectors, &hd_qiov);
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +
>> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +fail:
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
>> +{
>> +    int ret = 0;
>> +    int64_t image_sectors = offset/BDRV_SECTOR_SIZE;
>> +    int64_t be_offset = cpu_to_be64(offset);
>> +    BDRVAddCowState *state = bs->opaque;
>> +    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
>> +        + sizeof(AddCowHeader));
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
>> +        &be_offset, sizeof(uint64_t));
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_truncate(state->image_hd, offset);
>> +    return ret;
>> +}
>
> When an error occurs, you may end up with an add_cow image whose size
> disagrees with the raw image's size.
>
> I think we can do a bit better here. First thing would be to try
> truncating the base image first (many formats don't support truncate at
> all, so this is the most likely failure case). The other thing worth
> doing would be undoing the header update if a truncate fails afterwards.
>
Okay.
>> +
>> +static int add_cow_co_flush(BlockDriverState *bs)
>> +{
>> +    BDRVAddCowState *state = bs->opaque;
>> +    int ret = bdrv_co_flush(state->image_hd);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    return bdrv_co_flush(bs->file);
>> +}
>> +
>> +static QEMUOptionParameter add_cow_create_options[] = {
>> +    {
>> +        .name = BLOCK_OPT_SIZE,
>> +        .type = OPT_SIZE,
>> +        .help = "Virtual disk size"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_BACKING_FILE,
>> +        .type = OPT_STRING,
>> +        .help = "File name of a base image"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_IMAGE_FILE,
>> +        .type = OPT_STRING,
>> +        .help = "File name of a image file"
>> +    },
>> +    { NULL }
>> +};
>> +
>> +static BlockDriver bdrv_add_cow = {
>> +    .format_name        = "add-cow",
>> +    .instance_size      = sizeof(BDRVAddCowState),
>> +    .bdrv_probe         = add_cow_probe,
>> +    .bdrv_open          = add_cow_open,
>> +    .bdrv_close         = add_cow_close,
>> +    .bdrv_create        = add_cow_create,
>> +    .bdrv_is_allocated  = add_cow_is_allocated,
>> +
>> +    .bdrv_co_readv      = add_cow_co_readv,
>> +    .bdrv_co_writev     = add_cow_co_writev,
>> +    .bdrv_truncate      = bdrv_add_cow_truncate,
>> +
>> +    .create_options     = add_cow_create_options,
>> +    .bdrv_co_flush      = add_cow_co_flush,
>> +};
>> +
>> +static void bdrv_add_cow_init(void)
>> +{
>> +    bdrv_register(&bdrv_add_cow);
>> +}
>> +
>> +block_init(bdrv_add_cow_init);
>> diff --git a/block_int.h b/block_int.h
>> index dac00f5..7409c19 100644
>> --- a/block_int.h
>> +++ b/block_int.h
>> @@ -42,6 +42,7 @@
>>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>>  #define BLOCK_OPT_PREALLOC      "preallocation"
>>  #define BLOCK_OPT_SUBFMT        "subformat"
>> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>>
>>  typedef struct AIOPool {
>>      void (*cancel)(BlockDriverAIOCB *acb);
>> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
>> new file mode 100644
>> index 0000000..0443c6c
>> --- /dev/null
>> +++ b/docs/specs/add-cow.txt
>> @@ -0,0 +1,45 @@
>> +== General ==
>> +
>> +Raw file format does not support backing_file and copy on write feature. Then
>> +you can use add-cow file realize these features.
>> +
>> +When using add-cow, procedures may like this:
>> +(ubuntu.img is a disk image which has been installed OS.)
>> +    1)  Create a raw image with the same size of ubuntu.img
>> +            qemu-img create -f raw test.raw 8G
>> +    2)  Create a add-cow image which will store dirty bitmap
>> +            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
>> +    3)  Run qemu with add-cow image
>> +            qemu -drive if=virtio,file=test.add-cow
>> +
>> +=Specification=
>> +
>> +The file format looks like this:
>> +
>> + +----------+----------+----------+-----+
>> + |  Header  |   Data   |   Data   | ... |
>> + +----------+----------+----------+-----+
>> +
>> + All numbers in add-cow are stored in Big Endian byte order.
>
> What the data blocks mean isn't explained in the spec.
>

>> +
>> +
>> +== Header ==
>> +
>> +The Header is included in the first bytes:
>> +
>> +    Byte  0 -  7:       magic
>> +                        add-cow magic string ("ADD_COW\xff")
>> +
>> +          8 -  11:      version
>> +                        Version number (only valid value is 1 now)
>> +
>> +          12 - 1035:    backing_file
>> +                        backing_file file name related to add-cow file. While
>> +                        using backing_file, must together with image_file.
>> +
>> +         1036 - 2059:   image_file
>> +                        image_file is a raw file, While using image_file, must
>> +                        together with image_file.
>> +
>> +         2060 - 2067:   size
>> +                        Virtual disk size of image_file in bytes.
>
> Kevin
>
>

Thank you, Kevin, I will send V5 soon.
diff mbox

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 01587c8..208c12c 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -31,6 +31,7 @@  block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-nested-y += add-cow.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
diff --git a/block.c b/block.c
index 70aab63..e343995 100644
--- a/block.c
+++ b/block.c
@@ -105,7 +105,7 @@  int is_windows_drive(const char *filename)
 #endif
 
 /* check if the path starts with "<protocol>:" */
-static int path_has_protocol(const char *path)
+int path_has_protocol(const char *path)
 {
 #ifdef _WIN32
     if (is_windows_drive(path) ||
diff --git a/block.h b/block.h
index 5a042c9..dff5197 100644
--- a/block.h
+++ b/block.h
@@ -271,6 +271,7 @@  char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
 
 char *get_human_readable_size(char *buf, int buf_size, int64_t size);
 int path_is_absolute(const char *path);
+int path_has_protocol(const char *path);
 void path_combine(char *dest, int dest_size,
                   const char *base_path,
                   const char *filename);
diff --git a/block/add-cow.c b/block/add-cow.c
new file mode 100644
index 0000000..93d5b13
--- /dev/null
+++ b/block/add-cow.c
@@ -0,0 +1,405 @@ 
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+#define ADD_COW_MAGIC  (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
+                        ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
+                        ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
+                        ((uint64_t)'W' << 8) | 0xFF)
+#define ADD_COW_VERSION 1
+#define ADD_COW_FILE_LEN 1024
+
+typedef struct AddCowHeader {
+    uint64_t magic;
+    uint32_t version;
+    char backing_file[ADD_COW_FILE_LEN];
+    char image_file[ADD_COW_FILE_LEN];
+    uint64_t size;
+} QEMU_PACKED AddCowHeader;
+
+typedef struct BDRVAddCowState {
+    char image_file[ADD_COW_FILE_LEN];
+    BlockDriverState *image_hd;
+    uint8_t *bitmap;
+    uint64_t bitmap_size;
+    CoMutex lock;
+} BDRVAddCowState;
+
+static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const AddCowHeader *header = (const void *)buf;
+
+    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
+        be32_to_cpu(header->version) == ADD_COW_VERSION) {
+        return 100;
+    } else {
+        return 0;
+    }
+}
+
+static int add_cow_open(BlockDriverState *bs, int flags)
+{
+    AddCowHeader header;
+    int64_t size;
+    char image_filename[ADD_COW_FILE_LEN];
+    int image_flags;
+    BlockDriver *image_drv = NULL;
+    int ret;
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+
+    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+    if (ret != sizeof(header)) {
+        goto fail;
+    }
+
+    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
+        be32_to_cpu(header.version) != ADD_COW_VERSION) {
+        ret = -1;
+        goto fail;
+    }
+
+    size = be64_to_cpu(header.size);
+    bs->total_sectors = size / BDRV_SECTOR_SIZE;
+
+    QEMU_BUILD_BUG_ON(sizeof(state->image_file) != sizeof(header.image_file));
+    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
+            header.backing_file);
+    pstrcpy(state->image_file, sizeof(state->image_file),
+            header.image_file);
+
+    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
+    state->bitmap = g_malloc0(state->bitmap_size);
+
+    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
+            state->bitmap_size);
+    if (ret != state->bitmap_size) {
+        goto fail;
+    }
+   /* If there is a image_file, must be together with backing_file */
+    if (state->image_file[0] != '\0') {
+        state->image_hd = bdrv_new("");
+        /* Relative to image or working dir, need discussion */
+        if (path_has_protocol(state->image_file)) {
+            pstrcpy(image_filename, sizeof(image_filename),
+                    state->image_file);
+        } else {
+            path_combine(image_filename, sizeof(image_filename),
+                         bs->filename, state->image_file);
+        }
+
+        image_drv = bdrv_find_format("raw");
+        image_flags =
+             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) | BDRV_O_RDWR;
+        state->image_hd->keep_read_only = 0;
+
+        ret = bdrv_open(state->image_hd, image_filename, image_flags,
+                image_drv);
+        if (ret < 0) {
+            goto fail;
+        }
+    } else {
+        ret = -ENOENT;
+        goto fail;
+    }
+    qemu_co_mutex_init(&state->lock);
+    return 0;
+ fail:
+    if (state->bitmap) {
+        g_free(state->bitmap);
+        state->bitmap = NULL;
+    }
+    return ret;
+}
+
+static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
+{
+    uint64_t offset = bitnum / 8;
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    state->bitmap[offset] |= (1 << (bitnum % 8));
+}
+
+static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
+{
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    uint64_t offset = bitnum / 8;
+    return !!(state->bitmap[offset] & (1 << (bitnum % 8)));
+}
+
+static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+        int nb_sectors, int *num_same)
+{
+    int changed;
+    uint64_t bitmap_size = ((BDRVAddCowState *)(bs->opaque))->bitmap_size;
+
+    /* Beyond the end of bitmap, return error or read from backing_file? */
+    if (((sector_num + nb_sectors + 7) / 8) > bitmap_size) {
+        return 0;
+    }
+
+    if (nb_sectors == 0) {
+        *num_same = nb_sectors;
+        return 0;
+    }
+
+    changed = is_bit_set(bs, sector_num);
+    if (changed < 0) {
+        return 0;
+    }
+
+    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
+        if (is_bit_set(bs, sector_num + *num_same) != changed) {
+            break;
+        }
+    }
+
+    return changed;
+}
+
+static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
+        int nb_sectors)
+{
+    int i, ret = 0;
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    uint64_t start_pos = sector_num / 8;
+    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
+
+    if (start_pos > state->bitmap_size) {
+        return -1;
+    }
+
+    for (i = 0; i < nb_sectors; i++) {
+        add_cow_set_bit(bs, sector_num + i);
+    }
+    ret = bdrv_pwrite_sync(bs->file,
+        sizeof(AddCowHeader) + start_pos,
+        state->bitmap + start_pos,
+        MIN(((end_pos - start_pos) & (~512)) + 512, state->bitmap_size - start_pos));
+    return ret;
+}
+
+static void add_cow_close(BlockDriverState *bs)
+{
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    if (state->bitmap) {
+        g_free(state->bitmap);
+        state->bitmap = NULL;
+    }
+}
+
+static int add_cow_create(const char *filename, QEMUOptionParameter *options)
+{
+    AddCowHeader header;
+    int64_t image_sectors = 0;
+    const char *backing_filename = NULL;
+    const char *image_filename = NULL;
+    int ret;
+    BlockDriverState *bs, *image_bs = NULL;
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_filename = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
+            image_filename = options->value.s;
+        }
+        options++;
+    }
+
+    if (!backing_filename || !image_filename) {
+        return -EINVAL;
+    }
+    /* Make sure image file exists */
+    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
+            | BDRV_O_CACHE_WB);
+    if (ret < 0) {
+        return ret;
+    }
+    bdrv_close(image_bs);
+    ret = bdrv_create_file(filename, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+    if (ret < 0) {
+        return ret;
+    }
+
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be64(ADD_COW_MAGIC);
+    header.version = cpu_to_be32(ADD_COW_VERSION);
+    pstrcpy(header.backing_file, sizeof(header.backing_file), backing_filename);
+    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
+    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
+
+    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
+    if (ret < 0) {
+        bdrv_close(bs);
+        return ret;
+    }
+    bdrv_close(bs);
+
+    BlockDriver *drv = bdrv_find_format("add-cow");
+    assert(drv != NULL);
+    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
+    bdrv_close(bs);
+    return ret;
+}
+
+static int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
+                         int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int cur_nr_sectors;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+    int n, ret = 0;
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    while (remaining_sectors != 0) {
+        cur_nr_sectors = remaining_sectors;
+        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
+            cur_nr_sectors = n;
+            qemu_iovec_reset(&hd_qiov);
+            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
+            if (ret < 0) {
+                goto fail;
+            }
+        } else {
+            cur_nr_sectors = n;
+            if (bs->backing_hd) {
+                qemu_iovec_reset(&hd_qiov);
+                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+                ret = bdrv_co_readv(bs->backing_hd, sector_num,
+                                    n, &hd_qiov);
+                if (ret < 0) {
+                    goto fail;
+                }
+            } else {
+                qemu_iovec_reset(&hd_qiov);
+                qemu_iovec_memset(&hd_qiov, 0,
+                    BDRV_SECTOR_SIZE * cur_nr_sectors);
+            }
+        }
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
+                          int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret = 0;
+    QEMUIOVector hd_qiov;
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    qemu_iovec_reset(&hd_qiov);
+    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
+    ret = bdrv_co_writev(s->image_hd,
+                     sector_num,
+                     remaining_sectors, &hd_qiov);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
+    if (ret < 0) {
+        goto fail;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
+{
+    int ret = 0;
+    int64_t image_sectors = offset/BDRV_SECTOR_SIZE;
+    int64_t be_offset = cpu_to_be64(offset);
+    BDRVAddCowState *state = bs->opaque;
+    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
+        + sizeof(AddCowHeader));
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
+        &be_offset, sizeof(uint64_t));
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_truncate(state->image_hd, offset);
+    return ret;
+}
+
+static int add_cow_co_flush(BlockDriverState *bs)
+{
+    BDRVAddCowState *state = bs->opaque;
+    int ret = bdrv_co_flush(state->image_hd);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return bdrv_co_flush(bs->file);
+}
+
+static QEMUOptionParameter add_cow_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    },
+    {
+        .name = BLOCK_OPT_IMAGE_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a image file"
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_add_cow = {
+    .format_name        = "add-cow",
+    .instance_size      = sizeof(BDRVAddCowState),
+    .bdrv_probe         = add_cow_probe,
+    .bdrv_open          = add_cow_open,
+    .bdrv_close         = add_cow_close,
+    .bdrv_create        = add_cow_create,
+    .bdrv_is_allocated  = add_cow_is_allocated,
+
+    .bdrv_co_readv      = add_cow_co_readv,
+    .bdrv_co_writev     = add_cow_co_writev,
+    .bdrv_truncate      = bdrv_add_cow_truncate,
+
+    .create_options     = add_cow_create_options,
+    .bdrv_co_flush      = add_cow_co_flush,
+};
+
+static void bdrv_add_cow_init(void)
+{
+    bdrv_register(&bdrv_add_cow);
+}
+
+block_init(bdrv_add_cow_init);
diff --git a/block_int.h b/block_int.h
index dac00f5..7409c19 100644
--- a/block_int.h
+++ b/block_int.h
@@ -42,6 +42,7 @@ 
 #define BLOCK_OPT_TABLE_SIZE    "table_size"
 #define BLOCK_OPT_PREALLOC      "preallocation"
 #define BLOCK_OPT_SUBFMT        "subformat"
+#define BLOCK_OPT_IMAGE_FILE    "image_file"
 
 typedef struct AIOPool {
     void (*cancel)(BlockDriverAIOCB *acb);
diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
new file mode 100644
index 0000000..0443c6c
--- /dev/null
+++ b/docs/specs/add-cow.txt
@@ -0,0 +1,45 @@ 
+== General ==
+
+Raw file format does not support backing_file and copy on write feature. Then
+you can use add-cow file realize these features.
+
+When using add-cow, procedures may like this:
+(ubuntu.img is a disk image which has been installed OS.)
+    1)  Create a raw image with the same size of ubuntu.img
+            qemu-img create -f raw test.raw 8G
+    2)  Create a add-cow image which will store dirty bitmap
+            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
+    3)  Run qemu with add-cow image
+            qemu -drive if=virtio,file=test.add-cow
+
+=Specification=
+
+The file format looks like this:
+
+ +----------+----------+----------+-----+
+ |  Header  |   Data   |   Data   | ... |
+ +----------+----------+----------+-----+
+
+ All numbers in add-cow are stored in Big Endian byte order.
+
+
+== Header ==
+
+The Header is included in the first bytes:
+
+    Byte  0 -  7:       magic
+                        add-cow magic string ("ADD_COW\xff")
+
+          8 -  11:      version
+                        Version number (only valid value is 1 now)
+
+          12 - 1035:    backing_file
+                        backing_file file name related to add-cow file. While
+                        using backing_file, must together with image_file.
+
+         1036 - 2059:   image_file
+                        image_file is a raw file, While using image_file, must
+                        together with image_file.
+
+         2060 - 2067:   size
+                        Virtual disk size of image_file in bytes.