Patchwork [v5] block:add-cow file format

login
register
mail settings
Submitter Robert Wang
Date Nov. 15, 2011, 5:28 a.m.
Message ID <1321334931-26087-1-git-send-email-wdongxu@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/125672/
State New
Headers show

Comments

Robert Wang - Nov. 15, 2011, 5:28 a.m.
From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>

Provide a new file format: add-cow. The usage can be found in add-cow.txt of
this patch.

Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
---
 Makefile.objs          |    1 +
 block.c                |    2 +-
 block.h                |    1 +
 block/add-cow.c        |  417 ++++++++++++++++++++++++++++++++++++++++++++++++
 block_int.h            |    1 +
 docs/specs/add-cow.txt |   57 +++++++
 6 files changed, 478 insertions(+), 1 deletions(-)
 create mode 100644 block/add-cow.c
 create mode 100644 docs/specs/add-cow.txt
Robert Wang - Nov. 28, 2011, 2:11 a.m.
Any comment?
Thanks.

2011/11/15 Dong Xu Wang <wdongxu@linux.vnet.ibm.com>

> From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
>
> Provide a new file format: add-cow. The usage can be found in add-cow.txt
> of
> this patch.
>
> Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> ---
>  Makefile.objs          |    1 +
>  block.c                |    2 +-
>  block.h                |    1 +
>  block/add-cow.c        |  417
> ++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h            |    1 +
>  docs/specs/add-cow.txt |   57 +++++++
>  6 files changed, 478 insertions(+), 1 deletions(-)
>  create mode 100644 block/add-cow.c
>  create mode 100644 docs/specs/add-cow.txt
>
> diff --git a/Makefile.objs b/Makefile.objs
> index d7a6539..ad99243 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>
>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o
> vpc.o vvfat.o
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o
> qcow2-snapshot.o qcow2-cache.o
> +block-nested-y += add-cow.o
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o
> qed-cluster.o
>  block-nested-y += qed-check.o
>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> diff --git a/block.c b/block.c
> index 86910b0..a2be27b 100644
> --- a/block.c
> +++ b/block.c
> @@ -106,7 +106,7 @@ int is_windows_drive(const char *filename)
>  #endif
>
>  /* check if the path starts with "<protocol>:" */
> -static int path_has_protocol(const char *path)
> +int path_has_protocol(const char *path)
>  {
>  #ifdef _WIN32
>     if (is_windows_drive(path) ||
> diff --git a/block.h b/block.h
> index 051a25d..836284f 100644
> --- a/block.h
> +++ b/block.h
> @@ -276,6 +276,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size,
> QEMUSnapshotInfo *sn);
>
>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>  int path_is_absolute(const char *path);
> +int path_has_protocol(const char *path);
>  void path_combine(char *dest, int dest_size,
>                   const char *base_path,
>                   const char *filename);
> diff --git a/block/add-cow.c b/block/add-cow.c
> new file mode 100644
> index 0000000..54d30a9
> --- /dev/null
> +++ b/block/add-cow.c
> @@ -0,0 +1,417 @@
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "module.h"
> +
> +#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' <<
> 48) | \
> +                            ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32)
> | \
> +                            ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16)
> | \
> +                            ((uint64_t)'W' << 8) | 0xFF)
> +#define ADD_COW_VERSION     1
> +#define ADD_COW_FILE_LEN    1024
> +
> +typedef struct AddCowHeader {
> +    uint64_t        magic;
> +    uint32_t        version;
> +    char            backing_file[ADD_COW_FILE_LEN];
> +    char            image_file[ADD_COW_FILE_LEN];
> +    uint64_t        size;
> +} QEMU_PACKED AddCowHeader;
> +
> +typedef struct BDRVAddCowState {
> +    char                image_file[ADD_COW_FILE_LEN];
> +    BlockDriverState    *image_hd;
> +    uint8_t             *bitmap;
> +    uint64_t            bitmap_size;
> +    CoMutex             lock;
> +} BDRVAddCowState;
> +
> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char
> *filename)
> +{
> +    const AddCowHeader *header = (const void *)buf;
> +
> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
> +        return 100;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static int add_cow_open(BlockDriverState *bs, int flags)
> +{
> +    AddCowHeader    header;
> +    int64_t         size;
> +    char            image_filename[ADD_COW_FILE_LEN];
> +    int             image_flags;
> +    BlockDriver     *image_drv = NULL;
> +    int             ret;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +
> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
> +    if (ret != sizeof(header)) {
> +        goto fail;
> +    }
> +
> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
> +    size = be64_to_cpu(header.size);
> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
> +
> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) !=
> sizeof(header.image_file));
> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
> +            header.backing_file);
> +    pstrcpy(state->image_file, sizeof(state->image_file),
> +            header.image_file);
> +
> +    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
> +    state->bitmap = g_malloc0(state->bitmap_size);
> +
> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
> +            state->bitmap_size);
> +    if (ret != state->bitmap_size) {
> +        goto fail;
> +    }
> +   /* If there is a image_file, must be together with backing_file */
> +    if (state->image_file[0] != '\0') {
> +        state->image_hd = bdrv_new("");
> +
> +        if (path_has_protocol(state->image_file)) {
> +            pstrcpy(image_filename, sizeof(image_filename),
> +                    state->image_file);
> +        } else {
> +            path_combine(image_filename, sizeof(image_filename),
> +                         bs->filename, state->image_file);
> +        }
> +
> +        image_drv = bdrv_find_format("raw");
> +        image_flags =
> +             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) |
> BDRV_O_RDWR;
> +        state->image_hd->keep_read_only = 0;
> +
> +        ret = bdrv_open(state->image_hd, image_filename, image_flags,
> +                image_drv);
> +        if (ret < 0) {
> +            bdrv_delete(state->image_hd);
> +            state->image_hd = NULL;
> +            goto fail;
> +        }
> +    }
> +    if (state->image_file[0] == '\0') {
> +        ret = -ENOENT;
> +        goto fail;
> +    }
> +
> +    qemu_co_mutex_init(&state->lock);
> +    return 0;
> + fail:
> +    g_free(state->bitmap);
> +    state->bitmap = NULL;
> +    return ret;
> +}
> +
> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
> +{
> +    uint64_t offset = bitnum / 8;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    state->bitmap[offset] |= (1 << (bitnum % 8));
> +}
> +
> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
> +{
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    uint64_t offset = bitnum / 8;
> +    return !!(state->bitmap[offset] & (1 << (bitnum % 8)));
> +}
> +
> +static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors, int *num_same)
> +{
> +    int changed;
> +    uint64_t bitmap_size = ((BDRVAddCowState *)(bs->opaque))->bitmap_size;
> +
> +    /* Beyond the end of bitmap, return error or read from backing_file?
> */
> +    if (((sector_num + nb_sectors + 7) / 8) > bitmap_size) {
> +        return 0;
> +    }
> +
> +    if (nb_sectors == 0) {
> +        *num_same = nb_sectors;
> +        return 0;
> +    }
> +
> +    changed = is_bit_set(bs, sector_num);
> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
> +            break;
> +        }
> +    }
> +
> +    return changed;
> +}
> +
> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors)
> +{
> +    int i, ret = 0;
> +    bool changed = false;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    uint64_t start_pos = sector_num / 8;
> +    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
> +
> +    if (start_pos > state->bitmap_size) {
> +        return -1;
> +    }
> +
> +    for (i = 0; i < nb_sectors; i++) {
> +        if (changed || !is_bit_set(bs, sector_num + i)) {
> +            changed = true;
> +        }
> +        add_cow_set_bit(bs, sector_num + i);
> +    }
> +
> +    if (changed) {
> +        ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + start_pos,
> +            state->bitmap + start_pos,
> +            MIN(((end_pos - start_pos) & (~512)) + 512,
> +                state->bitmap_size - start_pos));
> +    }
> +    return ret;
> +}
> +
> +static void add_cow_close(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    g_free(state->bitmap);
> +    state->bitmap = NULL;
> +}
> +
> +static int add_cow_create(const char *filename, QEMUOptionParameter
> *options)
> +{
> +    AddCowHeader header;
> +    int64_t image_sectors = 0;
> +    const char *backing_filename = NULL;
> +    const char *image_filename = NULL;
> +    int ret;
> +    BlockDriverState *bs, *image_bs = NULL;
> +
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
> +            backing_filename = options->value.s;
> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
> +            image_filename = options->value.s;
> +        }
> +        options++;
> +    }
> +
> +    if (!backing_filename || !image_filename) {
> +        error_report("Both backing_file and image_file should be given.");
> +        return -EINVAL;
> +    }
> +    /* Make sure image file exists */
> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
> +            | BDRV_O_CACHE_WB);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    bdrv_delete(image_bs);
> +
> +    ret = bdrv_create_file(filename, NULL);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    memset(&header, 0, sizeof(header));
> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
> +    header.version = cpu_to_be32(ADD_COW_VERSION);
> +    pstrcpy(header.backing_file, sizeof(header.backing_file),
> backing_filename);
> +    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
> +    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
> +
> +    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
> +    if (ret < 0) {
> +        bdrv_delete(bs);
> +        return ret;
> +    }
> +
> +    BlockDriver *drv = bdrv_find_format("add-cow");
> +    assert(drv != NULL);
> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
> +    if (ret < 0) {
> +        bdrv_delete(bs);
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
> +    bdrv_delete(bs);
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_readv(BlockDriverState *bs, int64_t
> sector_num,
> +                         int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int cur_nr_sectors;
> +    uint64_t bytes_done = 0;
> +    QEMUIOVector hd_qiov;
> +    int n, ret = 0;
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    while (remaining_sectors != 0) {
> +        cur_nr_sectors = remaining_sectors;
> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
> +            cur_nr_sectors = n;
> +            qemu_iovec_reset(&hd_qiov);
> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
> +            if (ret < 0) {
> +                goto fail;
> +            }
> +        } else {
> +            cur_nr_sectors = n;
> +            if (bs->backing_hd) {
> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
> +                                    n, &hd_qiov);
> +                if (ret < 0) {
> +                    goto fail;
> +                }
> +            } else {
> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_memset(&hd_qiov, 0,
> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);
> +            }
> +        }
> +        remaining_sectors -= cur_nr_sectors;
> +        sector_num += cur_nr_sectors;
> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, int64_t
> sector_num,
> +                          int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = 0;
> +    QEMUIOVector hd_qiov;
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    qemu_iovec_reset(&hd_qiov);
> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors *
> BDRV_SECTOR_SIZE);
> +    ret = bdrv_co_writev(s->image_hd,
> +                     sector_num,
> +                     remaining_sectors, &hd_qiov);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
> +{
> +    int ret = 0;
> +    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
> +    int64_t be_offset = cpu_to_be64(offset);
> +    BDRVAddCowState *state = bs->opaque;
> +    int64_t old_image_sector = state->image_hd->total_sectors;
> +
> +    ret = bdrv_truncate(state->image_hd, offset);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
> +            + sizeof(AddCowHeader));
> +    if (ret < 0) {
> +        bdrv_truncate(state->image_hd, old_image_sector *
> BDRV_SECTOR_SIZE);
> +        return ret;
> +    }
> +
> +    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
> +        &be_offset, sizeof(uint64_t));
> +    if (ret < 0) {
> +        bdrv_truncate(state->image_hd, old_image_sector *
> BDRV_SECTOR_SIZE);
> +    }
> +
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *state = bs->opaque;
> +    int ret = bdrv_co_flush(state->image_hd);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return bdrv_co_flush(bs->file);
> +}
> +
> +static QEMUOptionParameter add_cow_create_options[] = {
> +    {
> +        .name = BLOCK_OPT_SIZE,
> +        .type = OPT_SIZE,
> +        .help = "Virtual disk size"
> +    },
> +    {
> +        .name = BLOCK_OPT_BACKING_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a base image"
> +    },
> +    {
> +        .name = BLOCK_OPT_IMAGE_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a image file"
> +    },
> +    { NULL }
> +};
> +
> +static BlockDriver bdrv_add_cow = {
> +    .format_name                = "add-cow",
> +    .instance_size              = sizeof(BDRVAddCowState),
> +    .bdrv_probe                 = add_cow_probe,
> +    .bdrv_open                  = add_cow_open,
> +    .bdrv_close                 = add_cow_close,
> +    .bdrv_create                = add_cow_create,
> +    .bdrv_is_allocated          = add_cow_is_allocated,
> +
> +    .bdrv_co_readv              = add_cow_co_readv,
> +    .bdrv_co_writev             = add_cow_co_writev,
> +    .bdrv_truncate              = bdrv_add_cow_truncate,
> +
> +    .create_options             = add_cow_create_options,
> +    .bdrv_co_flush_to_disk      = add_cow_co_flush,
> +};
> +
> +static void bdrv_add_cow_init(void)
> +{
> +    bdrv_register(&bdrv_add_cow);
> +}
> +
> +block_init(bdrv_add_cow_init);
> diff --git a/block_int.h b/block_int.h
> index 1ec4921..d6e8337 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -43,6 +43,7 @@
>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>  #define BLOCK_OPT_PREALLOC      "preallocation"
>  #define BLOCK_OPT_SUBFMT        "subformat"
> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>
>  typedef struct AIOPool {
>     void (*cancel)(BlockDriverAIOCB *acb);
> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
> new file mode 100644
> index 0000000..e403c84
> --- /dev/null
> +++ b/docs/specs/add-cow.txt
> @@ -0,0 +1,57 @@
> +== General ==
> +
> +The raw file format does not support backing files or copy-on-write.
> +An add-cow file can be used to add these features to a raw image.
> +
> +When using add-cow, the procedure looks like this:
> +(ubuntu.img is a disk image on which an OS has already been installed.)
> +    1)  Create a raw image with the same size of ubuntu.img
> +            qemu-img create -f raw test.raw 8G
> +    2)  Create an add-cow image, which will store the dirty bitmap
> +            qemu-img create -f add-cow test.add-cow -o
> backing_file=ubuntu.img,image_file=test.raw
> +    3)  Run qemu with add-cow image
> +            qemu -drive if=virtio,file=test.add-cow
> +
> +While QEMU is running, the virtual sizes of image_file and backing_file must
> +be the same. If image_file does not have the same virtual size as
> +backing_file in step 2), qemu-img will truncate it.
> +
> +=Specification=
> +
> +The file format looks like this:
> +
> + +----------+----------+----------+-----+
> + |  Header  |   Data   |   Data   | ... |
> + +----------+----------+----------+-----+
> +
> + All numbers in add-cow are stored in Big Endian byte order.
> +
> +
> +== Header ==
> +
> +The Header is included in the first bytes:
> +
> +    Byte  0 -  7:       magic
> +                        add-cow magic string ("ADD_COW\xff")
> +
> +          8 -  11:      version
> +                        Version number (only valid value is 1 now)
> +
> +          12 - 1035:    backing_file
> +                        File name of the backing file for this add-cow file.
> +                        backing_file must be used together with image_file.
> +
> +         1036 - 2059:   image_file
> +                        image_file is a raw file. image_file must be used
> +                        together with backing_file.
> +
> +         2060 - 2067:   size
> +                        Virtual disk size of image_file in bytes.
> +
> +== Data ==
> +
> +The Data field stores a bitmap related to backing_file and image_file. The
> +bitmap tracks whether each cluster of backing_file is dirty or not.
> +
> +Each bit in the bitmap represents one cluster, so the size of the bitmap is
> +calculated from the virtual size of backing_file.
> --
> 1.7.5.4
>
>
>
Robert Wang - Dec. 5, 2011, 1:38 a.m.
Ping...

2011/11/28 Dong Xu Wang <wdongxu@linux.vnet.ibm.com>

> Any comment?
> Thanks.
>
>
> 2011/11/15 Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
>
>> From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
>>
>> Provide a new file format: add-cow. The usage can be found in add-cow.txt
>> of
>> this patch.
>>
>> Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
>> ---
>>  Makefile.objs          |    1 +
>>  block.c                |    2 +-
>>  block.h                |    1 +
>>  block/add-cow.c        |  417
>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>  block_int.h            |    1 +
>>  docs/specs/add-cow.txt |   57 +++++++
>>  6 files changed, 478 insertions(+), 1 deletions(-)
>>  create mode 100644 block/add-cow.c
>>  create mode 100644 docs/specs/add-cow.txt
>>
>> diff --git a/Makefile.objs b/Makefile.objs
>> index d7a6539..ad99243 100644
>> --- a/Makefile.objs
>> +++ b/Makefile.objs
>> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>
>>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o
>> vpc.o vvfat.o
>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o
>> qcow2-snapshot.o qcow2-cache.o
>> +block-nested-y += add-cow.o
>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o
>> qed-cluster.o
>>  block-nested-y += qed-check.o
>>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>> diff --git a/block.c b/block.c
>> index 86910b0..a2be27b 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -106,7 +106,7 @@ int is_windows_drive(const char *filename)
>>  #endif
>>
>>  /* check if the path starts with "<protocol>:" */
>> -static int path_has_protocol(const char *path)
>> +int path_has_protocol(const char *path)
>>  {
>>  #ifdef _WIN32
>>     if (is_windows_drive(path) ||
>> diff --git a/block.h b/block.h
>> index 051a25d..836284f 100644
>> --- a/block.h
>> +++ b/block.h
>> @@ -276,6 +276,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size,
>> QEMUSnapshotInfo *sn);
>>
>>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>>  int path_is_absolute(const char *path);
>> +int path_has_protocol(const char *path);
>>  void path_combine(char *dest, int dest_size,
>>                   const char *base_path,
>>                   const char *filename);
>> diff --git a/block/add-cow.c b/block/add-cow.c
>> new file mode 100644
>> index 0000000..54d30a9
>> --- /dev/null
>> +++ b/block/add-cow.c
>> @@ -0,0 +1,417 @@
>> +#include "qemu-common.h"
>> +#include "block_int.h"
>> +#include "module.h"
>> +
>> +#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' <<
>> 48) | \
>> +                            ((uint64_t)'D' << 40) | ((uint64_t)'_' <<
>> 32) | \
>> +                            ((uint64_t)'C' << 24) | ((uint64_t)'O' <<
>> 16) | \
>> +                            ((uint64_t)'W' << 8) | 0xFF)
>> +#define ADD_COW_VERSION     1
>> +#define ADD_COW_FILE_LEN    1024
>> +
>> +typedef struct AddCowHeader {
>> +    uint64_t        magic;
>> +    uint32_t        version;
>> +    char            backing_file[ADD_COW_FILE_LEN];
>> +    char            image_file[ADD_COW_FILE_LEN];
>> +    uint64_t        size;
>> +} QEMU_PACKED AddCowHeader;
>> +
>> +typedef struct BDRVAddCowState {
>> +    char                image_file[ADD_COW_FILE_LEN];
>> +    BlockDriverState    *image_hd;
>> +    uint8_t             *bitmap;
>> +    uint64_t            bitmap_size;
>> +    CoMutex             lock;
>> +} BDRVAddCowState;
>> +
>> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char
>> *filename)
>> +{
>> +    const AddCowHeader *header = (const void *)buf;
>> +
>> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
>> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
>> +        return 100;
>> +    } else {
>> +        return 0;
>> +    }
>> +}
>> +
>> +static int add_cow_open(BlockDriverState *bs, int flags)
>> +{
>> +    AddCowHeader    header;
>> +    int64_t         size;
>> +    char            image_filename[ADD_COW_FILE_LEN];
>> +    int             image_flags;
>> +    BlockDriver     *image_drv = NULL;
>> +    int             ret;
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +
>> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
>> +    if (ret != sizeof(header)) {
>> +        goto fail;
>> +    }
>> +
>> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
>> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
>> +        ret = -EINVAL;
>> +        goto fail;
>> +    }
>> +
>> +    size = be64_to_cpu(header.size);
>> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
>> +
>> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) !=
>> sizeof(header.image_file));
>> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
>> +            header.backing_file);
>> +    pstrcpy(state->image_file, sizeof(state->image_file),
>> +            header.image_file);
>> +
>> +    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
>> +    state->bitmap = g_malloc0(state->bitmap_size);
>> +
>> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
>> +            state->bitmap_size);
>> +    if (ret != state->bitmap_size) {
>> +        goto fail;
>> +    }
>> +   /* If there is a image_file, must be together with backing_file */
>> +    if (state->image_file[0] != '\0') {
>> +        state->image_hd = bdrv_new("");
>> +
>> +        if (path_has_protocol(state->image_file)) {
>> +            pstrcpy(image_filename, sizeof(image_filename),
>> +                    state->image_file);
>> +        } else {
>> +            path_combine(image_filename, sizeof(image_filename),
>> +                         bs->filename, state->image_file);
>> +        }
>> +
>> +        image_drv = bdrv_find_format("raw");
>> +        image_flags =
>> +             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) |
>> BDRV_O_RDWR;
>> +        state->image_hd->keep_read_only = 0;
>> +
>> +        ret = bdrv_open(state->image_hd, image_filename, image_flags,
>> +                image_drv);
>> +        if (ret < 0) {
>> +            bdrv_delete(state->image_hd);
>> +            state->image_hd = NULL;
>> +            goto fail;
>> +        }
>> +    }
>> +    if (state->image_file[0] == '\0') {
>> +        ret = -ENOENT;
>> +        goto fail;
>> +    }
>> +
>> +    qemu_co_mutex_init(&state->lock);
>> +    return 0;
>> + fail:
>> +    g_free(state->bitmap);
>> +    state->bitmap = NULL;
>> +    return ret;
>> +}
>> +
>> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
>> +{
>> +    uint64_t offset = bitnum / 8;
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    state->bitmap[offset] |= (1 << (bitnum % 8));
>> +}
>> +
>> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
>> +{
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    uint64_t offset = bitnum / 8;
>> +    return !!(state->bitmap[offset] & (1 << (bitnum % 8)));
>> +}
>> +
>> +static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
>> +        int nb_sectors, int *num_same)
>> +{
>> +    int changed;
>> +    uint64_t bitmap_size = ((BDRVAddCowState
>> *)(bs->opaque))->bitmap_size;
>> +
>> +    /* Beyond the end of bitmap, return error or read from backing_file?
>> */
>> +    if (((sector_num + nb_sectors + 7) / 8) > bitmap_size) {
>> +        return 0;
>> +    }
>> +
>> +    if (nb_sectors == 0) {
>> +        *num_same = nb_sectors;
>> +        return 0;
>> +    }
>> +
>> +    changed = is_bit_set(bs, sector_num);
>> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
>> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
>> +            break;
>> +        }
>> +    }
>> +
>> +    return changed;
>> +}
>> +
>> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t
>> sector_num,
>> +        int nb_sectors)
>> +{
>> +    int i, ret = 0;
>> +    bool changed = false;
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    uint64_t start_pos = sector_num / 8;
>> +    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
>> +
>> +    if (start_pos > state->bitmap_size) {
>> +        return -1;
>> +    }
>> +
>> +    for (i = 0; i < nb_sectors; i++) {
>> +        if (changed || !is_bit_set(bs, sector_num + i)) {
>> +            changed = true;
>> +        }
>> +        add_cow_set_bit(bs, sector_num + i);
>> +    }
>> +
>> +    if (changed) {
>> +        ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + start_pos,
>> +            state->bitmap + start_pos,
>> +            MIN(((end_pos - start_pos) & (~512)) + 512,
>> +                state->bitmap_size - start_pos));
>> +    }
>> +    return ret;
>> +}
>> +
>> +static void add_cow_close(BlockDriverState *bs)
>> +{
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    g_free(state->bitmap);
>> +    state->bitmap = NULL;
>> +}
>> +
>> +static int add_cow_create(const char *filename, QEMUOptionParameter
>> *options)
>> +{
>> +    AddCowHeader header;
>> +    int64_t image_sectors = 0;
>> +    const char *backing_filename = NULL;
>> +    const char *image_filename = NULL;
>> +    int ret;
>> +    BlockDriverState *bs, *image_bs = NULL;
>> +
>> +    while (options && options->name) {
>> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
>> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
>> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
>> +            backing_filename = options->value.s;
>> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
>> +            image_filename = options->value.s;
>> +        }
>> +        options++;
>> +    }
>> +
>> +    if (!backing_filename || !image_filename) {
>> +        error_report("Both backing_file and image_file should be
>> given.");
>> +        return -EINVAL;
>> +    }
>> +    /* Make sure image file exists */
>> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
>> +            | BDRV_O_CACHE_WB);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +    bdrv_delete(image_bs);
>> +
>> +    ret = bdrv_create_file(filename, NULL);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    memset(&header, 0, sizeof(header));
>> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
>> +    header.version = cpu_to_be32(ADD_COW_VERSION);
>> +    pstrcpy(header.backing_file, sizeof(header.backing_file),
>> backing_filename);
>> +    pstrcpy(header.image_file, sizeof(header.image_file),
>> image_filename);
>> +    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
>> +
>> +    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
>> +    if (ret < 0) {
>> +        bdrv_delete(bs);
>> +        return ret;
>> +    }
>> +
>> +    BlockDriver *drv = bdrv_find_format("add-cow");
>> +    assert(drv != NULL);
>> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
>> +    if (ret < 0) {
>> +        bdrv_delete(bs);
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
>> +    bdrv_delete(bs);
>> +    return ret;
>> +}
>> +
>> +static coroutine_fn int add_cow_co_readv(BlockDriverState *bs, int64_t
>> sector_num,
>> +                         int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int cur_nr_sectors;
>> +    uint64_t bytes_done = 0;
>> +    QEMUIOVector hd_qiov;
>> +    int n, ret = 0;
>> +
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +    qemu_co_mutex_lock(&s->lock);
>> +    while (remaining_sectors != 0) {
>> +        cur_nr_sectors = remaining_sectors;
>> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
>> +            cur_nr_sectors = n;
>> +            qemu_iovec_reset(&hd_qiov);
>> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>> +            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
>> +            if (ret < 0) {
>> +                goto fail;
>> +            }
>> +        } else {
>> +            cur_nr_sectors = n;
>> +            if (bs->backing_hd) {
>> +                qemu_iovec_reset(&hd_qiov);
>> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
>> +                                    n, &hd_qiov);
>> +                if (ret < 0) {
>> +                    goto fail;
>> +                }
>> +            } else {
>> +                qemu_iovec_reset(&hd_qiov);
>> +                qemu_iovec_memset(&hd_qiov, 0,
>> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);
>> +            }
>> +        }
>> +        remaining_sectors -= cur_nr_sectors;
>> +        sector_num += cur_nr_sectors;
>> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
>> +    }
>> +fail:
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, int64_t
>> sector_num,
>> +                          int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int ret = 0;
>> +    QEMUIOVector hd_qiov;
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +    qemu_co_mutex_lock(&s->lock);
>> +    qemu_iovec_reset(&hd_qiov);
>> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors *
>> BDRV_SECTOR_SIZE);
>> +    ret = bdrv_co_writev(s->image_hd,
>> +                     sector_num,
>> +                     remaining_sectors, &hd_qiov);
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +
>> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +fail:
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
>> +{
>> +    int ret = 0;
>> +    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
>> +    int64_t be_offset = cpu_to_be64(offset);
>> +    BDRVAddCowState *state = bs->opaque;
>> +    int64_t old_image_sector = state->image_hd->total_sectors;
>> +
>> +    ret = bdrv_truncate(state->image_hd, offset);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
>> +            + sizeof(AddCowHeader));
>> +    if (ret < 0) {
>> +        bdrv_truncate(state->image_hd, old_image_sector *
>> BDRV_SECTOR_SIZE);
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
>> +        &be_offset, sizeof(uint64_t));
>> +    if (ret < 0) {
>> +        bdrv_truncate(state->image_hd, old_image_sector *
>> BDRV_SECTOR_SIZE);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
>> +{
>> +    BDRVAddCowState *state = bs->opaque;
>> +    int ret = bdrv_co_flush(state->image_hd);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    return bdrv_co_flush(bs->file);
>> +}
>> +
>> +static QEMUOptionParameter add_cow_create_options[] = {
>> +    {
>> +        .name = BLOCK_OPT_SIZE,
>> +        .type = OPT_SIZE,
>> +        .help = "Virtual disk size"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_BACKING_FILE,
>> +        .type = OPT_STRING,
>> +        .help = "File name of a base image"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_IMAGE_FILE,
>> +        .type = OPT_STRING,
>> +        .help = "File name of a image file"
>> +    },
>> +    { NULL }
>> +};
>> +
>> +static BlockDriver bdrv_add_cow = {
>> +    .format_name                = "add-cow",
>> +    .instance_size              = sizeof(BDRVAddCowState),
>> +    .bdrv_probe                 = add_cow_probe,
>> +    .bdrv_open                  = add_cow_open,
>> +    .bdrv_close                 = add_cow_close,
>> +    .bdrv_create                = add_cow_create,
>> +    .bdrv_is_allocated          = add_cow_is_allocated,
>> +
>> +    .bdrv_co_readv              = add_cow_co_readv,
>> +    .bdrv_co_writev             = add_cow_co_writev,
>> +    .bdrv_truncate              = bdrv_add_cow_truncate,
>> +
>> +    .create_options             = add_cow_create_options,
>> +    .bdrv_co_flush_to_disk      = add_cow_co_flush,
>> +};
>> +
>> +static void bdrv_add_cow_init(void)
>> +{
>> +    bdrv_register(&bdrv_add_cow);
>> +}
>> +
>> +block_init(bdrv_add_cow_init);
>> diff --git a/block_int.h b/block_int.h
>> index 1ec4921..d6e8337 100644
>> --- a/block_int.h
>> +++ b/block_int.h
>> @@ -43,6 +43,7 @@
>>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>>  #define BLOCK_OPT_PREALLOC      "preallocation"
>>  #define BLOCK_OPT_SUBFMT        "subformat"
>> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>>
>>  typedef struct AIOPool {
>>     void (*cancel)(BlockDriverAIOCB *acb);
>> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
>> new file mode 100644
>> index 0000000..e403c84
>> --- /dev/null
>> +++ b/docs/specs/add-cow.txt
>> @@ -0,0 +1,57 @@
>> +== General ==
>> +
>> +The raw file format does not support backing files or copy-on-write. An
>> +add-cow file can be used to implement these features.
>> +
>> +When using add-cow, the procedure may look like this:
>> +(ubuntu.img is a disk image on which an OS has been installed.)
>> +    1)  Create a raw image with the same size as ubuntu.img
>> +            qemu-img create -f raw test.raw 8G
>> +    2)  Create an add-cow image, which will store the dirty bitmap
>> +            qemu-img create -f add-cow test.add-cow -o
>> backing_file=ubuntu.img,image_file=test.raw
>> +    3)  Run qemu with add-cow image
>> +            qemu -drive if=virtio,file=test.add-cow
>> +
>> +While QEMU is running, the virtual sizes of image_file and backing_file must
>> +be the same. So if image_file does not have the same virtual size as
>> +backing_file in step 2), qemu-img will truncate it.
>> +
>> +== Specification ==
>> +
>> +The file format looks like this:
>> +
>> + +----------+----------+----------+-----+
>> + |  Header  |   Data   |   Data   | ... |
>> + +----------+----------+----------+-----+
>> +
>> + All numbers in add-cow are stored in Big Endian byte order.
>> +
>> +
>> +== Header ==
>> +
>> +The Header is included in the first bytes:
>> +
>> +    Byte  0 -  7:       magic
>> +                        add-cow magic string ("ADD_COW\xff")
>> +
>> +          8 -  11:      version
>> +                        Version number (only valid value is 1 now)
>> +
>> +          12 - 1035:    backing_file
>> +                        backing_file file name related to the add-cow file.
>> +                        When backing_file is used, it must be given together
>> +                        with image_file.
>> +
>> +         1036 - 2059:   image_file
>> +                        image_file is a raw file. When image_file is used, it
>> +                        must be given together with backing_file.
>> +
>> +         2060 - 2067:   size
>> +                        Virtual disk size of image_file in bytes.
>> +
>> +== Data ==
>> +
>> +The Data field stores a bitmap related to backing_file and image_file. The
>> +bitmap tracks whether each cluster in backing_file is dirty or not.
>> +
>> +Each bit in the bitmap represents one cluster, so the size of the bitmap is
>> +calculated from the virtual size of backing_file.
>> --
>> 1.7.5.4
>>
>>
>>
>
Marcelo Tosatti - Dec. 6, 2011, 12:48 p.m.
On Tue, Nov 15, 2011 at 01:28:51PM +0800, Dong Xu Wang wrote:
> From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> 
> Provide a new file format: add-cow. The usage can be found in add-cow.txt of
> this patch.
> 
> Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> ---
>  Makefile.objs          |    1 +
>  block.c                |    2 +-
>  block.h                |    1 +
>  block/add-cow.c        |  417 ++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h            |    1 +
>  docs/specs/add-cow.txt |   57 +++++++
>  6 files changed, 478 insertions(+), 1 deletions(-)
>  create mode 100644 block/add-cow.c
>  create mode 100644 docs/specs/add-cow.txt
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index d7a6539..ad99243 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>  
>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
> +block-nested-y += add-cow.o
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-nested-y += qed-check.o
>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> diff --git a/block.c b/block.c
> index 86910b0..a2be27b 100644
> --- a/block.c
> +++ b/block.c
> @@ -106,7 +106,7 @@ int is_windows_drive(const char *filename)
>  #endif
>  
>  /* check if the path starts with "<protocol>:" */
> -static int path_has_protocol(const char *path)
> +int path_has_protocol(const char *path)
>  {
>  #ifdef _WIN32
>      if (is_windows_drive(path) ||
> diff --git a/block.h b/block.h
> index 051a25d..836284f 100644
> --- a/block.h
> +++ b/block.h
> @@ -276,6 +276,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
>  
>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>  int path_is_absolute(const char *path);
> +int path_has_protocol(const char *path);
>  void path_combine(char *dest, int dest_size,
>                    const char *base_path,
>                    const char *filename);
> diff --git a/block/add-cow.c b/block/add-cow.c
> new file mode 100644
> index 0000000..54d30a9
> --- /dev/null
> +++ b/block/add-cow.c
> @@ -0,0 +1,417 @@
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "module.h"
> +
> +#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
> +                            ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
> +                            ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
> +                            ((uint64_t)'W' << 8) | 0xFF)
> +#define ADD_COW_VERSION     1
> +#define ADD_COW_FILE_LEN    1024
> +
> +typedef struct AddCowHeader {
> +    uint64_t        magic;
> +    uint32_t        version;
> +    char            backing_file[ADD_COW_FILE_LEN];
> +    char            image_file[ADD_COW_FILE_LEN];
> +    uint64_t        size;
> +} QEMU_PACKED AddCowHeader;
> +
> +typedef struct BDRVAddCowState {
> +    char                image_file[ADD_COW_FILE_LEN];
> +    BlockDriverState    *image_hd;
> +    uint8_t             *bitmap;
> +    uint64_t            bitmap_size;
> +    CoMutex             lock;
> +} BDRVAddCowState;
> +
> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
> +{
> +    const AddCowHeader *header = (const void *)buf;
> +
> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
> +        return 100;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static int add_cow_open(BlockDriverState *bs, int flags)
> +{
> +    AddCowHeader    header;
> +    int64_t         size;
> +    char            image_filename[ADD_COW_FILE_LEN];
> +    int             image_flags;
> +    BlockDriver     *image_drv = NULL;
> +    int             ret;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +
> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
> +    if (ret != sizeof(header)) {
> +        goto fail;
> +    }
> +
> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +
> +    size = be64_to_cpu(header.size);
> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
> +
> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) != sizeof(header.image_file));
> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
> +            header.backing_file);
> +    pstrcpy(state->image_file, sizeof(state->image_file),
> +            header.image_file);
> +
> +    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
> +    state->bitmap = g_malloc0(state->bitmap_size);
> +
> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
> +            state->bitmap_size);
> +    if (ret != state->bitmap_size) {
> +        goto fail;
> +    }

Reading the entire bitmap in memory is not acceptable, it may be huge.
Better mmap it and use msync(MS_SYNC) when writing it back. This way the
host can free memory easily upon pressure.

> +   /* If there is a image_file, must be together with backing_file */
> +    if (state->image_file[0] != '\0') {
> +        state->image_hd = bdrv_new("");
> +
> +        if (path_has_protocol(state->image_file)) {
> +            pstrcpy(image_filename, sizeof(image_filename),
> +                    state->image_file);
> +        } else {
> +            path_combine(image_filename, sizeof(image_filename),
> +                         bs->filename, state->image_file);
> +        }
> +
> +        image_drv = bdrv_find_format("raw");
> +        image_flags =
> +             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) | BDRV_O_RDWR;

Why are you modifying flags here?
Kevin Wolf - Dec. 6, 2011, 12:59 p.m.
Am 06.12.2011 13:48, schrieb Marcelo Tosatti:
> On Tue, Nov 15, 2011 at 01:28:51PM +0800, Dong Xu Wang wrote:
>> From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
>>
>> Provide a new file format: add-cow. The usage can be found in add-cow.txt of
>> this patch.
>>
>> Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
>> ---
>>  Makefile.objs          |    1 +
>>  block.c                |    2 +-
>>  block.h                |    1 +
>>  block/add-cow.c        |  417 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  block_int.h            |    1 +
>>  docs/specs/add-cow.txt |   57 +++++++
>>  6 files changed, 478 insertions(+), 1 deletions(-)
>>  create mode 100644 block/add-cow.c
>>  create mode 100644 docs/specs/add-cow.txt
>>
>> diff --git a/Makefile.objs b/Makefile.objs
>> index d7a6539..ad99243 100644
>> --- a/Makefile.objs
>> +++ b/Makefile.objs
>> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>  
>>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>> +block-nested-y += add-cow.o
>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>  block-nested-y += qed-check.o
>>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>> diff --git a/block.c b/block.c
>> index 86910b0..a2be27b 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -106,7 +106,7 @@ int is_windows_drive(const char *filename)
>>  #endif
>>  
>>  /* check if the path starts with "<protocol>:" */
>> -static int path_has_protocol(const char *path)
>> +int path_has_protocol(const char *path)
>>  {
>>  #ifdef _WIN32
>>      if (is_windows_drive(path) ||
>> diff --git a/block.h b/block.h
>> index 051a25d..836284f 100644
>> --- a/block.h
>> +++ b/block.h
>> @@ -276,6 +276,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
>>  
>>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>>  int path_is_absolute(const char *path);
>> +int path_has_protocol(const char *path);
>>  void path_combine(char *dest, int dest_size,
>>                    const char *base_path,
>>                    const char *filename);
>> diff --git a/block/add-cow.c b/block/add-cow.c
>> new file mode 100644
>> index 0000000..54d30a9
>> --- /dev/null
>> +++ b/block/add-cow.c
>> @@ -0,0 +1,417 @@
>> +#include "qemu-common.h"
>> +#include "block_int.h"
>> +#include "module.h"
>> +
>> +#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
>> +                            ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
>> +                            ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
>> +                            ((uint64_t)'W' << 8) | 0xFF)
>> +#define ADD_COW_VERSION     1
>> +#define ADD_COW_FILE_LEN    1024
>> +
>> +typedef struct AddCowHeader {
>> +    uint64_t        magic;
>> +    uint32_t        version;
>> +    char            backing_file[ADD_COW_FILE_LEN];
>> +    char            image_file[ADD_COW_FILE_LEN];
>> +    uint64_t        size;
>> +} QEMU_PACKED AddCowHeader;
>> +
>> +typedef struct BDRVAddCowState {
>> +    char                image_file[ADD_COW_FILE_LEN];
>> +    BlockDriverState    *image_hd;
>> +    uint8_t             *bitmap;
>> +    uint64_t            bitmap_size;
>> +    CoMutex             lock;
>> +} BDRVAddCowState;
>> +
>> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
>> +{
>> +    const AddCowHeader *header = (const void *)buf;
>> +
>> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
>> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
>> +        return 100;
>> +    } else {
>> +        return 0;
>> +    }
>> +}
>> +
>> +static int add_cow_open(BlockDriverState *bs, int flags)
>> +{
>> +    AddCowHeader    header;
>> +    int64_t         size;
>> +    char            image_filename[ADD_COW_FILE_LEN];
>> +    int             image_flags;
>> +    BlockDriver     *image_drv = NULL;
>> +    int             ret;
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +
>> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
>> +    if (ret != sizeof(header)) {
>> +        goto fail;
>> +    }
>> +
>> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
>> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
>> +        ret = -EINVAL;
>> +        goto fail;
>> +    }
>> +
>> +    size = be64_to_cpu(header.size);
>> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
>> +
>> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) != sizeof(header.image_file));
>> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
>> +            header.backing_file);
>> +    pstrcpy(state->image_file, sizeof(state->image_file),
>> +            header.image_file);
>> +
>> +    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
>> +    state->bitmap = g_malloc0(state->bitmap_size);
>> +
>> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
>> +            state->bitmap_size);
>> +    if (ret != state->bitmap_size) {
>> +        goto fail;
>> +    }
> 
> Reading the entire bitmap in memory is not acceptable, it may be huge.
> Better mmap it and use msync(MS_SYNC) when writing it back. This way the
> host can free memory easily upon pressure.

You can't use mmap in block drivers. It would only work with raw-posix
backends, if at all.

Kevin
Marcelo Tosatti - Dec. 6, 2011, 2:53 p.m.
On Tue, Dec 06, 2011 at 01:59:48PM +0100, Kevin Wolf wrote:
> >> +
> >> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
> >> +            state->bitmap_size);
> >> +    if (ret != state->bitmap_size) {
> >> +        goto fail;
> >> +    }
> > 
> > Reading the entire bitmap in memory is not acceptable, it may be huge.
> > Better mmap it and use msync(MS_SYNC) when writing it back. This way the
> > host can free memory easily upon pressure.
> 
> You can't use mmap in block drivers. It would only work with raw-posix
> backends, if at all.
> 
> Kevin

This is just the bitmap, a plain file. Why would you want to use
anything other than a plain file as storage for the bitmap?
Kevin Wolf - Dec. 6, 2011, 2:56 p.m.
Am 15.11.2011 06:28, schrieb Dong Xu Wang:
> From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> 
> Provide a new file format: add-cow. The usage can be found in add-cow.txt of
> this patch.
> 
> Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> ---
>  Makefile.objs          |    1 +
>  block.c                |    2 +-
>  block.h                |    1 +
>  block/add-cow.c        |  417 ++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h            |    1 +
>  docs/specs/add-cow.txt |   57 +++++++
>  6 files changed, 478 insertions(+), 1 deletions(-)
>  create mode 100644 block/add-cow.c
>  create mode 100644 docs/specs/add-cow.txt
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index d7a6539..ad99243 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>  
>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
> +block-nested-y += add-cow.o
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-nested-y += qed-check.o
>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> diff --git a/block.c b/block.c
> index 86910b0..a2be27b 100644
> --- a/block.c
> +++ b/block.c
> @@ -106,7 +106,7 @@ int is_windows_drive(const char *filename)
>  #endif
>  
>  /* check if the path starts with "<protocol>:" */
> -static int path_has_protocol(const char *path)
> +int path_has_protocol(const char *path)
>  {
>  #ifdef _WIN32
>      if (is_windows_drive(path) ||
> diff --git a/block.h b/block.h
> index 051a25d..836284f 100644
> --- a/block.h
> +++ b/block.h
> @@ -276,6 +276,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
>  
>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>  int path_is_absolute(const char *path);
> +int path_has_protocol(const char *path);
>  void path_combine(char *dest, int dest_size,
>                    const char *base_path,
>                    const char *filename);
> diff --git a/block/add-cow.c b/block/add-cow.c
> new file mode 100644
> index 0000000..54d30a9
> --- /dev/null
> +++ b/block/add-cow.c
> @@ -0,0 +1,417 @@
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "module.h"
> +
> +#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
> +                            ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
> +                            ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
> +                            ((uint64_t)'W' << 8) | 0xFF)
> +#define ADD_COW_VERSION     1
> +#define ADD_COW_FILE_LEN    1024
> +
> +typedef struct AddCowHeader {
> +    uint64_t        magic;
> +    uint32_t        version;
> +    char            backing_file[ADD_COW_FILE_LEN];
> +    char            image_file[ADD_COW_FILE_LEN];
> +    uint64_t        size;
> +} QEMU_PACKED AddCowHeader;
> +
> +typedef struct BDRVAddCowState {
> +    char                image_file[ADD_COW_FILE_LEN];
> +    BlockDriverState    *image_hd;
> +    uint8_t             *bitmap;
> +    uint64_t            bitmap_size;
> +    CoMutex             lock;
> +} BDRVAddCowState;
> +
> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
> +{
> +    const AddCowHeader *header = (const void *)buf;
> +
> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
> +        return 100;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static int add_cow_open(BlockDriverState *bs, int flags)
> +{
> +    AddCowHeader    header;
> +    int64_t         size;
> +    char            image_filename[ADD_COW_FILE_LEN];
> +    int             image_flags;
> +    BlockDriver     *image_drv = NULL;
> +    int             ret;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +
> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
> +    if (ret != sizeof(header)) {
> +        goto fail;
> +    }
> +
> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }

Please have a look at qcow2 for better handling of newer version
numbers. We should try to give a good error message for this case.

> +
> +    size = be64_to_cpu(header.size);
> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
> +
> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) != sizeof(header.image_file));
> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
> +            header.backing_file);
> +    pstrcpy(state->image_file, sizeof(state->image_file),
> +            header.image_file);

You need the same QEMU_BUILD_BUG_ON for the backing file, or you can't
assume that header.image_file is large enough that it doesn't matter
that it isn't necessarily correctly terminated.

> +
> +    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
> +    state->bitmap = g_malloc0(state->bitmap_size);

qemu_blockalign is better if you're using it as a buffer for I/O requests.

> +
> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
> +            state->bitmap_size);
> +    if (ret != state->bitmap_size) {
> +        goto fail;
> +    }
> +   /* If there is a image_file, must be together with backing_file */

Indentation is off.

> +    if (state->image_file[0] != '\0') {
> +        state->image_hd = bdrv_new("");
> +
> +        if (path_has_protocol(state->image_file)) {
> +            pstrcpy(image_filename, sizeof(image_filename),
> +                    state->image_file);
> +        } else {
> +            path_combine(image_filename, sizeof(image_filename),
> +                         bs->filename, state->image_file);
> +        }
> +
> +        image_drv = bdrv_find_format("raw");
> +        image_flags =
> +             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) | BDRV_O_RDWR;

As Marcelo said, why play with the flags? BDRV_O_SNAPSHOT and
BDRV_O_NO_BACKING should already be cleared by bdrv_open_common, and I
can't see a reason why you need to open the image r/w when the add-cow
image is opened read-only.

> +        state->image_hd->keep_read_only = 0;
> +
> +        ret = bdrv_open(state->image_hd, image_filename, image_flags,
> +                image_drv);
> +        if (ret < 0) {
> +            bdrv_delete(state->image_hd);
> +            state->image_hd = NULL;
> +            goto fail;
> +        }
> +    }
> +    if (state->image_file[0] == '\0') {

You can move this check up, then the above if block can become
unconditional.

> +        ret = -ENOENT;
> +        goto fail;
> +    }
> +
> +    qemu_co_mutex_init(&state->lock);
> +    return 0;
> + fail:
> +    g_free(state->bitmap);
> +    state->bitmap = NULL;

Resetting it to NULL is not required, the memory will be freed anyway.

> +    return ret;
> +}
> +
> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
> +{
> +    uint64_t offset = bitnum / 8;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);

Unnecessary cast.

Also, to keep things consistent with other format drivers, call it s
instead of state.

> +    state->bitmap[offset] |= (1 << (bitnum % 8));
> +}
> +
> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
> +{
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    uint64_t offset = bitnum / 8;
> +    return !!(state->bitmap[offset] & (1 << (bitnum % 8)));
> +}
> +
> +static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors, int *num_same)
> +{
> +    int changed;
> +    uint64_t bitmap_size = ((BDRVAddCowState *)(bs->opaque))->bitmap_size;

BDRVAddCowState *s = bs->opaque; and then use s->bitmap_size instead of
copying it to a local variable.

> +
> +    /* Beyond the end of bitmap, return error or read from backing_file? */
> +    if (((sector_num + nb_sectors + 7) / 8) > bitmap_size) {
> +        return 0;
> +    }
> +
> +    if (nb_sectors == 0) {
> +        *num_same = nb_sectors;
> +        return 0;
> +    }
> +
> +    changed = is_bit_set(bs, sector_num);
> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
> +            break;
> +        }
> +    }
> +
> +    return changed;
> +}
> +
> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors)
> +{
> +    int i, ret = 0;
> +    bool changed = false;
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    uint64_t start_pos = sector_num / 8;
> +    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
> +
> +    if (start_pos > state->bitmap_size) {
> +        return -1;
> +    }
> +
> +    for (i = 0; i < nb_sectors; i++) {
> +        if (changed || !is_bit_set(bs, sector_num + i)) {
> +            changed = true;
> +        }

Wait... if (changed == true) changed = true? What is this good for?

> +        add_cow_set_bit(bs, sector_num + i);
> +    }
> +
> +    if (changed) {
> +        ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + start_pos,
> +            state->bitmap + start_pos,
> +            MIN(((end_pos - start_pos) & (~512)) + 512,
> +                state->bitmap_size - start_pos));

-EMAGIC

Please calculate that MIN(...) separately and give the variable a
meaningful name. Trying to guess what you're doing here:

(end_pos - start_pos) & (~512)) + 512

This is the size of the updated area in the bitmap, in bytes. It is
rounded up to the next sector; if it's already on a sector boundary,
make it the next sector boundary (why?)

state->bitmap_size - start_pos

Ok, makes sense, you're trying to avoid writing after the end of the
array if the caller asked for too many sectors.


Please make sure to keep the write request sector aligned, so that
bdrv_pwrite doesn't have to perform a read-modify-write operation.

> +    }
> +    return ret;
> +}
> +
> +static void add_cow_close(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
> +    g_free(state->bitmap);
> +    state->bitmap = NULL;

Resetting to NULL is unnecessary.

> +}
> +
> +static int add_cow_create(const char *filename, QEMUOptionParameter *options)
> +{
> +    AddCowHeader header;
> +    int64_t image_sectors = 0;
> +    const char *backing_filename = NULL;
> +    const char *image_filename = NULL;
> +    int ret;
> +    BlockDriverState *bs, *image_bs = NULL;
> +
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
> +            backing_filename = options->value.s;
> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
> +            image_filename = options->value.s;
> +        }
> +        options++;
> +    }
> +
> +    if (!backing_filename || !image_filename) {
> +        error_report("Both backing_file and image_file should be given.");
> +        return -EINVAL;
> +    }
> +    /* Make sure image file exists */
> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
> +            | BDRV_O_CACHE_WB);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    bdrv_delete(image_bs);
> +
> +    ret = bdrv_create_file(filename, NULL);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    memset(&header, 0, sizeof(header));
> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
> +    header.version = cpu_to_be32(ADD_COW_VERSION);
> +    pstrcpy(header.backing_file, sizeof(header.backing_file), backing_filename);
> +    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
> +    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
> +
> +    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
> +    if (ret < 0) {
> +        bdrv_delete(bs);
> +        return ret;
> +    }
> +
> +    BlockDriver *drv = bdrv_find_format("add-cow");
> +    assert(drv != NULL);
> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
> +    if (ret < 0) {
> +        bdrv_delete(bs);
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
> +    bdrv_delete(bs);
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
> +                         int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int cur_nr_sectors;
> +    uint64_t bytes_done = 0;
> +    QEMUIOVector hd_qiov;
> +    int n, ret = 0;
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    while (remaining_sectors != 0) {
> +        cur_nr_sectors = remaining_sectors;
> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
> +            cur_nr_sectors = n;
> +            qemu_iovec_reset(&hd_qiov);
> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
> +            if (ret < 0) {
> +                goto fail;
> +            }
> +        } else {
> +            cur_nr_sectors = n;
> +            if (bs->backing_hd) {

I thought there are no add-cow images without backing file? (It isn't
checked in open, but in create)

> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
> +                                    n, &hd_qiov);
> +                if (ret < 0) {
> +                    goto fail;
> +                }
> +            } else {
> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_memset(&hd_qiov, 0,
> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);

There's nothing to memset in a qiov that you have just reset and that
has a size of 0 now.

> +            }
> +        }
> +        remaining_sectors -= cur_nr_sectors;
> +        sector_num += cur_nr_sectors;
> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
> +                          int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = 0;
> +    QEMUIOVector hd_qiov;
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    qemu_iovec_reset(&hd_qiov);
> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
> +    ret = bdrv_co_writev(s->image_hd,
> +                     sector_num,
> +                     remaining_sectors, &hd_qiov);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
> +{
> +    int ret = 0;
> +    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
> +    int64_t be_offset = cpu_to_be64(offset);
> +    BDRVAddCowState *state = bs->opaque;
> +    int64_t old_image_sector = state->image_hd->total_sectors;
> +
> +    ret = bdrv_truncate(state->image_hd, offset);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
> +            + sizeof(AddCowHeader));
> +    if (ret < 0) {
> +        bdrv_truncate(state->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
> +        return ret;
> +    }
> +
> +    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
> +        &be_offset, sizeof(uint64_t));
> +    if (ret < 0) {
> +        bdrv_truncate(state->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
> +    }
> +
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *state = bs->opaque;
> +    int ret = bdrv_co_flush(state->image_hd);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return bdrv_co_flush(bs->file);
> +}
> +
> +static QEMUOptionParameter add_cow_create_options[] = {
> +    {
> +        .name = BLOCK_OPT_SIZE,
> +        .type = OPT_SIZE,
> +        .help = "Virtual disk size"
> +    },
> +    {
> +        .name = BLOCK_OPT_BACKING_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a base image"
> +    },
> +    {
> +        .name = BLOCK_OPT_IMAGE_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a image file"
> +    },
> +    { NULL }
> +};
> +
> +static BlockDriver bdrv_add_cow = {
> +    .format_name                = "add-cow",
> +    .instance_size              = sizeof(BDRVAddCowState),
> +    .bdrv_probe                 = add_cow_probe,
> +    .bdrv_open                  = add_cow_open,
> +    .bdrv_close                 = add_cow_close,
> +    .bdrv_create                = add_cow_create,
> +    .bdrv_is_allocated          = add_cow_is_allocated,
> +
> +    .bdrv_co_readv              = add_cow_co_readv,
> +    .bdrv_co_writev             = add_cow_co_writev,
> +    .bdrv_truncate              = bdrv_add_cow_truncate,
> +
> +    .create_options             = add_cow_create_options,
> +    .bdrv_co_flush_to_disk      = add_cow_co_flush,
> +};
> +
> +static void bdrv_add_cow_init(void)
> +{
> +    bdrv_register(&bdrv_add_cow);
> +}
> +
> +block_init(bdrv_add_cow_init);
> diff --git a/block_int.h b/block_int.h
> index 1ec4921..d6e8337 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -43,6 +43,7 @@
>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>  #define BLOCK_OPT_PREALLOC      "preallocation"
>  #define BLOCK_OPT_SUBFMT        "subformat"
> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>  
>  typedef struct AIOPool {
>      void (*cancel)(BlockDriverAIOCB *acb);
> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
> new file mode 100644
> index 0000000..e403c84
> --- /dev/null
> +++ b/docs/specs/add-cow.txt
> @@ -0,0 +1,57 @@
> +== General ==
> +
> +Raw file format does not support backing_file and copy on write feature. Then
> +you can use add-cow file to implement these features.
> +
> +When using add-cow, procedures may like this:
> +(ubuntu.img is a disk image which has been installed OS.)
> +    1)  Create a raw image with the same size of ubuntu.img
> +            qemu-img create -f raw test.raw 8G
> +    2)  Create a add-cow image which will store dirty bitmap
> +            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
> +    3)  Run qemu with add-cow image
> +            qemu -drive if=virtio,file=test.add-cow
> +
> +While QEMU is running, virtual size of image_file and backing_file must be the
> +same. So if image_file does not have the same virtual size as backing_file's in
> +step 2), qemu-img will truncate it.
> +
> +=Specification=
> +
> +The file format looks like this:
> +
> + +----------+----------+----------+-----+
> + |  Header  |   Data   |   Data   | ... |
> + +----------+----------+----------+-----+

This looks as if the file was divided in some kind of blocks of the same
size. This isn't really true.

One change that I would definitely suggest to make is that the bitmap
should start at byte 512 rather than directly after the header.
Unaligned requests with cache=none are expensive.

> +
> + All numbers in add-cow are stored in Big Endian byte order.

For the bitmap, you should additionally specify which order the bits in
the bitmap have.

> +== Header ==
> +
> +The Header is included in the first bytes:
> +
> +    Byte  0 -  7:       magic
> +                        add-cow magic string ("ADD_COW\xff")
> +
> +          8 -  11:      version
> +                        Version number (only valid value is 1 now)
> +
> +          12 - 1035:    backing_file
> +                        backing_file file name related to add-cow file. While
> +                        using backing_file, must together with image_file.

Pad unused bytes with zeros.

> +
> +         1036 - 2059:   image_file
> +                        image_file is a raw file, While using image_file, must
> +                        together with image_file.

Same here.

> +
> +         2060 - 2067:   size
> +                        Virtual disk size of image_file in bytes.
> +
> +== Data ==
> +
> +The Data field stores a bitmap related to backing_file and image_file. The bitmap
> +will track whether the cluster in backing_file is dirty or not.

So 0 = load from backing file, 1 = load from image? I think it's better
to be explicit on this.

> +
> +Each bit in the bitmap indicates one cluster. So the size of bitmap is calculated
> +according to virtual size of backing_file.

I think you mean s/cluster/sector/g

Kevin
Marcelo Tosatti - Dec. 6, 2011, 3:06 p.m.
On Tue, Dec 06, 2011 at 12:53:16PM -0200, Marcelo Tosatti wrote:
> On Tue, Dec 06, 2011 at 01:59:48PM +0100, Kevin Wolf wrote:
> > >> +
> > >> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
> > >> +            state->bitmap_size);
> > >> +    if (ret != state->bitmap_size) {
> > >> +        goto fail;
> > >> +    }
> > > 
> > > Reading the entire bitmap in memory is not acceptable, it may be huge.
> > > Better mmap it and use msync(MS_SYNC) when writing it back. This way the
> > > host can free memory easily upon pressure.
> > 
> > You can't use mmap in block drivers. It would only work with raw-posix
> > backends, if at all.
> > 
> > Kevin
> 
> This is just the bitmap, a plain file. Why would you want to use
> anything other than a plain file to use as storage for the bitmap?

Well, mmap'ing would make life much simpler, but it has limitations such 
as portability.

Then what is necessary is a cache similar to qcow2's metadata cache.
Kevin Wolf - Dec. 6, 2011, 3:11 p.m.
Am 06.12.2011 15:53, schrieb Marcelo Tosatti:
> On Tue, Dec 06, 2011 at 01:59:48PM +0100, Kevin Wolf wrote:
>>>> +
>>>> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
>>>> +            state->bitmap_size);
>>>> +    if (ret != state->bitmap_size) {
>>>> +        goto fail;
>>>> +    }
>>>
>>> Reading the entire bitmap in memory is not acceptable, it may be huge.
>>> Better mmap it and use msync(MS_SYNC) when writing it back. This way the
>>> host can free memory easily upon pressure.
>>
>> You can't use mmap in block drivers. It would only work with raw-posix
>> backends, if at all.
> 
> This is just the bitmap, a plain file. Why would you want to use
> anything other than a plain file to use as storage for the bitmap?

The obvious case is raw-win32. There are probably not so obvious, but
still valid use cases that involve things like NBD, iSCSI, blkdebug or
whatever.

Kevin
Kevin Wolf - Dec. 6, 2011, 3:20 p.m.
Am 06.12.2011 16:06, schrieb Marcelo Tosatti:
> On Tue, Dec 06, 2011 at 12:53:16PM -0200, Marcelo Tosatti wrote:
>> On Tue, Dec 06, 2011 at 01:59:48PM +0100, Kevin Wolf wrote:
>>>>> +
>>>>> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
>>>>> +            state->bitmap_size);
>>>>> +    if (ret != state->bitmap_size) {
>>>>> +        goto fail;
>>>>> +    }
>>>>
>>>> Reading the entire bitmap in memory is not acceptable, it may be huge.
>>>> Better mmap it and use msync(MS_SYNC) when writing it back. This way the
>>>> host can free memory easily upon pressure.
>>>
>>> You can't use mmap in block drivers. It would only work with raw-posix
>>> backends, if at all.
>>>
>>> Kevin
>>
>> This is just the bitmap, a plain file. Why would you want to use
>> anything other than a plain file to use as storage for the bitmap?
> 
> Well, mmap'ing would make life much simpler, but it has limitations such 
> as portability.
> 
> Then what is necessary is a cache similar to qcow2's metadata cache.

Right, we can probably generalise the qcow2 code and make it available
for other drivers as well.

Kevin
Marcelo Tosatti - Dec. 6, 2011, 4:35 p.m.
On Tue, Dec 06, 2011 at 04:20:55PM +0100, Kevin Wolf wrote:
> Am 06.12.2011 16:06, schrieb Marcelo Tosatti:
> > On Tue, Dec 06, 2011 at 12:53:16PM -0200, Marcelo Tosatti wrote:
> >> On Tue, Dec 06, 2011 at 01:59:48PM +0100, Kevin Wolf wrote:
> >>>>> +
> >>>>> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
> >>>>> +            state->bitmap_size);
> >>>>> +    if (ret != state->bitmap_size) {
> >>>>> +        goto fail;
> >>>>> +    }
> >>>>
> >>>> Reading the entire bitmap in memory is not acceptable, it may be huge.
> >>>> Better mmap it and use msync(MS_SYNC) when writing it back. This way the
> >>>> host can free memory easily upon pressure.
> >>>
> >>> You can't use mmap in block drivers. It would only work with raw-posix
> >>> backends, if at all.
> >>>
> >>> Kevin
> >>
> >> This is just the bitmap, a plain file. Why would you want to use
> >> anything other than a plain file to use as storage for the bitmap?
> > 
> > Well, mmap'ing would make life much simpler, but it has limitations such 
> > as portability.
> > 
> > Then what is necessary is a cache similar to qcow2's metadata cache.
> 
> Right, we can probably generalise the qcow2 code and make it available
> for other drivers as well.

Hum, generalising sounds overly complicated (and there is a time
constraint to this). IMHO a cache internal to add-cow.c just to avoid
reading the entire bitmap would do the trick.
Marcelo Tosatti - Dec. 6, 2011, 4:40 p.m.
On Tue, Dec 06, 2011 at 02:35:03PM -0200, Marcelo Tosatti wrote:
> > Right, we can probably generalise the qcow2 code and make it available
> > for other drivers as well.
> 
> Hum, generalising sounds overly complicated (and there is a time
> constraint to this). IMHO a cache internal to add-cow.c just to avoid
> reading the entire bitmap would do the trick. 

(and writes must go through the cache too, of course).
Kevin Wolf - Dec. 6, 2011, 4:43 p.m.
Am 06.12.2011 17:35, schrieb Marcelo Tosatti:
> On Tue, Dec 06, 2011 at 04:20:55PM +0100, Kevin Wolf wrote:
>> Am 06.12.2011 16:06, schrieb Marcelo Tosatti:
>>> On Tue, Dec 06, 2011 at 12:53:16PM -0200, Marcelo Tosatti wrote:
>>>> On Tue, Dec 06, 2011 at 01:59:48PM +0100, Kevin Wolf wrote:
>>>>>>> +
>>>>>>> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
>>>>>>> +            state->bitmap_size);
>>>>>>> +    if (ret != state->bitmap_size) {
>>>>>>> +        goto fail;
>>>>>>> +    }
>>>>>>
>>>>>> Reading the entire bitmap in memory is not acceptable, it may be huge.
>>>>>> Better mmap it and use msync(MS_SYNC) when writing it back. This way the
>>>>>> host can free memory easily upon pressure.
>>>>>
>>>>> You can't use mmap in block drivers. It would only work with raw-posix
>>>>> backends, if at all.
>>>>>
>>>>> Kevin
>>>>
>>>> This is just the bitmap, a plain file. Why would you want to use
>>>> anything other than a plain file to use as storage for the bitmap?
>>>
>>> Well, mmap'ing would make life much simpler, but it has limitations such 
>>> as portability.
>>>
>>> Then what is necessary is a cache similar to qcow2's metadata cache.
>>
>> Right, we can probably generalise the qcow2 code and make it available
>> for other drivers as well.
> 
> Hum, generalising sounds overly complicated (and there is a time
> constraint to this). IMHO a cache internal to add-cow.c just to avoid
> reading the entire bitmap would do the trick.

The cache is mostly self-contained. But maybe we should get the locking
right (instead of always locking the whole BlockDriverState) before
using it in more drivers. I think this might need some change to it.

Kevin

Patch

diff --git a/Makefile.objs b/Makefile.objs
index d7a6539..ad99243 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -31,6 +31,7 @@  block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-nested-y += add-cow.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
diff --git a/block.c b/block.c
index 86910b0..a2be27b 100644
--- a/block.c
+++ b/block.c
@@ -106,7 +106,7 @@  int is_windows_drive(const char *filename)
 #endif
 
 /* check if the path starts with "<protocol>:" */
-static int path_has_protocol(const char *path)
+int path_has_protocol(const char *path)
 {
 #ifdef _WIN32
     if (is_windows_drive(path) ||
diff --git a/block.h b/block.h
index 051a25d..836284f 100644
--- a/block.h
+++ b/block.h
@@ -276,6 +276,7 @@  char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
 
 char *get_human_readable_size(char *buf, int buf_size, int64_t size);
 int path_is_absolute(const char *path);
+int path_has_protocol(const char *path);
 void path_combine(char *dest, int dest_size,
                   const char *base_path,
                   const char *filename);
diff --git a/block/add-cow.c b/block/add-cow.c
new file mode 100644
index 0000000..54d30a9
--- /dev/null
+++ b/block/add-cow.c
@@ -0,0 +1,417 @@ 
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
+                            ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
+                            ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
+                            ((uint64_t)'W' << 8) | 0xFF)
+#define ADD_COW_VERSION     1
+#define ADD_COW_FILE_LEN    1024
+
+typedef struct AddCowHeader {
+    uint64_t        magic;
+    uint32_t        version;
+    char            backing_file[ADD_COW_FILE_LEN];
+    char            image_file[ADD_COW_FILE_LEN];
+    uint64_t        size;
+} QEMU_PACKED AddCowHeader;
+
+typedef struct BDRVAddCowState {
+    char                image_file[ADD_COW_FILE_LEN];
+    BlockDriverState    *image_hd;
+    uint8_t             *bitmap;
+    uint64_t            bitmap_size;
+    CoMutex             lock;
+} BDRVAddCowState;
+
+static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const AddCowHeader *header = (const void *)buf;
+
+    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
+        be32_to_cpu(header->version) == ADD_COW_VERSION) {
+        return 100;
+    } else {
+        return 0;
+    }
+}
+
+static int add_cow_open(BlockDriverState *bs, int flags)
+{
+    AddCowHeader    header;
+    int64_t         size;
+    char            image_filename[ADD_COW_FILE_LEN];
+    int             image_flags;
+    BlockDriver     *image_drv = NULL;
+    int             ret;
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+
+    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+    if (ret != sizeof(header)) {
+        goto fail;
+    }
+
+    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
+        be32_to_cpu(header.version) != ADD_COW_VERSION) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    size = be64_to_cpu(header.size);
+    bs->total_sectors = size / BDRV_SECTOR_SIZE;
+
+    QEMU_BUILD_BUG_ON(sizeof(state->image_file) != sizeof(header.image_file));
+    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
+            header.backing_file);
+    pstrcpy(state->image_file, sizeof(state->image_file),
+            header.image_file);
+
+    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
+    state->bitmap = g_malloc0(state->bitmap_size);
+
+    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
+            state->bitmap_size);
+    if (ret != state->bitmap_size) {
+        goto fail;
+    }
+   /* If there is a image_file, must be together with backing_file */
+    if (state->image_file[0] != '\0') {
+        state->image_hd = bdrv_new("");
+
+        if (path_has_protocol(state->image_file)) {
+            pstrcpy(image_filename, sizeof(image_filename),
+                    state->image_file);
+        } else {
+            path_combine(image_filename, sizeof(image_filename),
+                         bs->filename, state->image_file);
+        }
+
+        image_drv = bdrv_find_format("raw");
+        image_flags =
+             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) | BDRV_O_RDWR;
+        state->image_hd->keep_read_only = 0;
+
+        ret = bdrv_open(state->image_hd, image_filename, image_flags,
+                image_drv);
+        if (ret < 0) {
+            bdrv_delete(state->image_hd);
+            state->image_hd = NULL;
+            goto fail;
+        }
+    }
+    if (state->image_file[0] == '\0') {
+        ret = -ENOENT;
+        goto fail;
+    }
+
+    qemu_co_mutex_init(&state->lock);
+    return 0;
+ fail:
+    g_free(state->bitmap);
+    state->bitmap = NULL;
+    return ret;
+}
+
+static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
+{
+    uint64_t offset = bitnum / 8;
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    state->bitmap[offset] |= (1 << (bitnum % 8));
+}
+
+static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
+{
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    uint64_t offset = bitnum / 8;
+    return !!(state->bitmap[offset] & (1 << (bitnum % 8)));
+}
+
+static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+        int nb_sectors, int *num_same)
+{
+    int changed;
+    uint64_t bitmap_size = ((BDRVAddCowState *)(bs->opaque))->bitmap_size;
+
+    /* Beyond the end of bitmap, return error or read from backing_file? */
+    if (((sector_num + nb_sectors + 7) / 8) > bitmap_size) {
+        return 0;
+    }
+
+    if (nb_sectors == 0) {
+        *num_same = nb_sectors;
+        return 0;
+    }
+
+    changed = is_bit_set(bs, sector_num);
+    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
+        if (is_bit_set(bs, sector_num + *num_same) != changed) {
+            break;
+        }
+    }
+
+    return changed;
+}
+
+static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
+        int nb_sectors)
+{
+    int i, ret = 0;
+    bool changed = false;
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    uint64_t start_pos = sector_num / 8;
+    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
+
+    if (start_pos > state->bitmap_size) {
+        return -1;
+    }
+
+    for (i = 0; i < nb_sectors; i++) {
+        if (changed || !is_bit_set(bs, sector_num + i)) {
+            changed = true;
+        }
+        add_cow_set_bit(bs, sector_num + i);
+    }
+
+    if (changed) {
+        ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + start_pos,
+            state->bitmap + start_pos,
+            MIN(((end_pos - start_pos) & (~512)) + 512,
+                state->bitmap_size - start_pos));
+    }
+    return ret;
+}
+
+static void add_cow_close(BlockDriverState *bs)
+{
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    g_free(state->bitmap);
+    state->bitmap = NULL;
+}
+
+static int add_cow_create(const char *filename, QEMUOptionParameter *options)
+{
+    AddCowHeader header;
+    int64_t image_sectors = 0;
+    const char *backing_filename = NULL;
+    const char *image_filename = NULL;
+    int ret;
+    BlockDriverState *bs, *image_bs = NULL;
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_filename = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
+            image_filename = options->value.s;
+        }
+        options++;
+    }
+
+    if (!backing_filename || !image_filename) {
+        error_report("Both backing_file and image_file should be given.");
+        return -EINVAL;
+    }
+    /* Make sure image file exists */
+    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
+            | BDRV_O_CACHE_WB);
+    if (ret < 0) {
+        return ret;
+    }
+    bdrv_delete(image_bs);
+
+    ret = bdrv_create_file(filename, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+    if (ret < 0) {
+        return ret;
+    }
+
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be64(ADD_COW_MAGIC);
+    header.version = cpu_to_be32(ADD_COW_VERSION);
+    pstrcpy(header.backing_file, sizeof(header.backing_file), backing_filename);
+    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
+    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
+
+    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
+    if (ret < 0) {
+        bdrv_delete(bs);
+        return ret;
+    }
+
+    BlockDriver *drv = bdrv_find_format("add-cow");
+    assert(drv != NULL);
+    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
+    if (ret < 0) {
+        bdrv_delete(bs);
+        return ret;
+    }
+
+    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
+    bdrv_delete(bs);
+    return ret;
+}
+
+static coroutine_fn int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
+                         int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int cur_nr_sectors;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+    int n, ret = 0;
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    while (remaining_sectors != 0) {
+        cur_nr_sectors = remaining_sectors;
+        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
+            cur_nr_sectors = n;
+            qemu_iovec_reset(&hd_qiov);
+            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
+            if (ret < 0) {
+                goto fail;
+            }
+        } else {
+            cur_nr_sectors = n;
+            if (bs->backing_hd) {
+                qemu_iovec_reset(&hd_qiov);
+                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+                ret = bdrv_co_readv(bs->backing_hd, sector_num,
+                                    n, &hd_qiov);
+                if (ret < 0) {
+                    goto fail;
+                }
+            } else {
+                qemu_iovec_reset(&hd_qiov);
+                qemu_iovec_memset(&hd_qiov, 0,
+                    BDRV_SECTOR_SIZE * cur_nr_sectors);
+            }
+        }
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
+                          int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret = 0;
+    QEMUIOVector hd_qiov;
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    qemu_iovec_reset(&hd_qiov);
+    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
+    ret = bdrv_co_writev(s->image_hd,
+                     sector_num,
+                     remaining_sectors, &hd_qiov);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
+    if (ret < 0) {
+        goto fail;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
+{
+    int ret = 0;
+    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
+    int64_t be_offset = cpu_to_be64(offset);
+    BDRVAddCowState *state = bs->opaque;
+    int64_t old_image_sector = state->image_hd->total_sectors;
+
+    ret = bdrv_truncate(state->image_hd, offset);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
+            + sizeof(AddCowHeader));
+    if (ret < 0) {
+        bdrv_truncate(state->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
+        return ret;
+    }
+
+    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
+        &be_offset, sizeof(uint64_t));
+    if (ret < 0) {
+        bdrv_truncate(state->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
+    }
+
+    return ret;
+}
+
+static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
+{
+    BDRVAddCowState *state = bs->opaque;
+    int ret = bdrv_co_flush(state->image_hd);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return bdrv_co_flush(bs->file);
+}
+
+static QEMUOptionParameter add_cow_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    },
+    {
+        .name = BLOCK_OPT_IMAGE_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a image file"
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_add_cow = {
+    .format_name                = "add-cow",
+    .instance_size              = sizeof(BDRVAddCowState),
+    .bdrv_probe                 = add_cow_probe,
+    .bdrv_open                  = add_cow_open,
+    .bdrv_close                 = add_cow_close,
+    .bdrv_create                = add_cow_create,
+    .bdrv_is_allocated          = add_cow_is_allocated,
+
+    .bdrv_co_readv              = add_cow_co_readv,
+    .bdrv_co_writev             = add_cow_co_writev,
+    .bdrv_truncate              = bdrv_add_cow_truncate,
+
+    .create_options             = add_cow_create_options,
+    .bdrv_co_flush_to_disk      = add_cow_co_flush,
+};
+
+static void bdrv_add_cow_init(void)
+{
+    bdrv_register(&bdrv_add_cow);
+}
+
+block_init(bdrv_add_cow_init);
diff --git a/block_int.h b/block_int.h
index 1ec4921..d6e8337 100644
--- a/block_int.h
+++ b/block_int.h
@@ -43,6 +43,7 @@ 
 #define BLOCK_OPT_TABLE_SIZE    "table_size"
 #define BLOCK_OPT_PREALLOC      "preallocation"
 #define BLOCK_OPT_SUBFMT        "subformat"
+#define BLOCK_OPT_IMAGE_FILE    "image_file"
 
 typedef struct AIOPool {
     void (*cancel)(BlockDriverAIOCB *acb);
diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
new file mode 100644
index 0000000..e403c84
--- /dev/null
+++ b/docs/specs/add-cow.txt
@@ -0,0 +1,57 @@ 
+== General ==
+
+The raw file format does not support backing files or copy-on-write. An
+add-cow file can be used to add these features to a raw image.
+
+When using add-cow, the procedure may look like this:
+(ubuntu.img is a disk image on which an OS has already been installed.)
+    1)  Create a raw image with the same size as ubuntu.img
+            qemu-img create -f raw test.raw 8G
+    2)  Create an add-cow image which will store the dirty bitmap
+            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
+    3)  Run qemu with add-cow image
+            qemu -drive if=virtio,file=test.add-cow
+
+While QEMU is running, the virtual sizes of image_file and backing_file must be
+the same. So if image_file does not have the same virtual size as backing_file
+in step 2), qemu-img will truncate it.
+
+=Specification=
+
+The file format looks like this:
+
+ +----------+----------+----------+-----+
+ |  Header  |   Data   |   Data   | ... |
+ +----------+----------+----------+-----+
+
+ All numbers in add-cow are stored in Big Endian byte order.
+
+
+== Header ==
+
+The Header is included in the first bytes:
+
+    Byte  0 -  7:       magic
+                        add-cow magic string ("ADD_COW\xff")
+
+          8 -  11:      version
+                        Version number (only valid value is 1 now)
+
+          12 - 1035:    backing_file
+                        backing_file file name related to the add-cow file.
+                        backing_file must be used together with image_file.
+
+         1036 - 2059:   image_file
+                        image_file is a raw file. image_file must be used
+                        together with backing_file.
+
+         2060 - 2067:   size
+                        Virtual disk size of image_file in bytes.
+
+== Data ==
+
+The Data field stores a bitmap related to backing_file and image_file. The bitmap
+tracks whether each cluster in backing_file is dirty or not.
+
+Each bit in the bitmap represents one cluster, so the size of the bitmap is
+calculated according to the virtual size of backing_file.