Patchwork [v6] block:add-cow file format

login
register
mail settings
Submitter Robert Wang
Date Dec. 29, 2011, 9:36 a.m.
Message ID <1325151419-26228-1-git-send-email-wdongxu@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/133534/
State New
Headers show

Comments

Robert Wang - Dec. 29, 2011, 9:36 a.m.
From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>

Introduce a new file format: add-cow. The usage can be found in add-cow.txt of
this patch.

CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
---
After applying this patch, qemu might can not compile, need apply this patch first:
http://lists.gnu.org/archive/html/qemu-devel/2011-12/msg02527.html

 Makefile.objs          |    1 +
 block.c                |    2 +-
 block.h                |    1 +
 block/add-cow.c        |  429 ++++++++++++++++++++++++++++++++++++++++++++++++
 block_int.h            |    1 +
 docs/specs/add-cow.txt |   72 ++++++++
 6 files changed, 505 insertions(+), 1 deletions(-)
 create mode 100644 block/add-cow.c
 create mode 100644 docs/specs/add-cow.txt
Stefan Hajnoczi - Dec. 30, 2011, 2:09 p.m.
On Thu, Dec 29, 2011 at 05:36:59PM +0800, Dong Xu Wang wrote:

Some comments on everything but the I/O path, which I haven't reviewed
yet:

> diff --git a/block/add-cow.c b/block/add-cow.c
> new file mode 100644
> index 0000000..95af5b7
> --- /dev/null
> +++ b/block/add-cow.c
> @@ -0,0 +1,429 @@

Missing GPL or LGPL license header.

> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "module.h"
> +
> +#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
> +                            ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
> +                            ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
> +                            ((uint64_t)'W' << 8) | 0xFF)
> +#define ADD_COW_VERSION     1
> +#define ADD_COW_FILE_LEN    1024
> +
> +typedef struct AddCowHeader {
> +    uint64_t        magic;
> +    uint32_t        version;
> +    char            backing_file[ADD_COW_FILE_LEN];
> +    char            image_file[ADD_COW_FILE_LEN];
> +    uint64_t        size;
> +    char            reserved[492];
> +} QEMU_PACKED AddCowHeader;
> +
> +typedef struct BDRVAddCowState {
> +    char                image_file[ADD_COW_FILE_LEN];

Why is this field needed?

> +    BlockDriverState    *image_hd;
> +    uint8_t             *bitmap;
> +    uint64_t            bitmap_size;
> +    CoMutex             lock;
> +} BDRVAddCowState;
> +
> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
> +{
> +    const AddCowHeader *header = (const void *)buf;
> +
> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
> +        return 100;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static int add_cow_open(BlockDriverState *bs, int flags)
> +{
> +    AddCowHeader        header;
> +    int64_t             size;
> +    char                image_filename[ADD_COW_FILE_LEN];
> +    BlockDriver         *image_drv = NULL;
> +    int                 ret;
> +    BDRVAddCowState     *s = bs->opaque;
> +    BlockDriverState    *backing_bs = NULL;
> +
> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
> +    if (ret != sizeof(header)) {
> +        goto fail;
> +    }
> +
> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +    if (be32_to_cpu(header.version) != ADD_COW_VERSION) {
> +        char version[64];
> +        snprintf(version, sizeof(version), "ADD-COW version %d", header.version);
> +        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
> +            bs->device_name, "add-cow", version);
> +        ret = -ENOTSUP;
> +        goto fail;
> +    }
> +
> +    size = be64_to_cpu(header.size);
> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
> +
> +    QEMU_BUILD_BUG_ON(sizeof(bs->backing_file) != sizeof(header.backing_file));
> +    QEMU_BUILD_BUG_ON(sizeof(s->image_file) != sizeof(header.image_file));
> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
> +            header.backing_file);
> +    pstrcpy(s->image_file, sizeof(s->image_file),
> +            header.image_file);

This assumes that header.backing_file and header.image_file is
NUL-terminated.  If the file happened to have the magic number and
version but does not include '\0' bytes in the header.backing_file then
we may crash here when trying to read beyond the end of the header
struct.

Image format code should be robust.  Please use strncpy(3) carefully
instead of pstrcpy().

Also please update the file format specification to either make these
fields NUL-terminated or NUL-terminated unless the length is 1024
characters (in which case there is no NUL but the string still ends).

> +
> +    s->bitmap_size = ((bs->total_sectors + 7) >> 3);
> +    s->bitmap = qemu_blockalign(bs, s->bitmap_size);
> +
> +    ret = bdrv_pread(bs->file, sizeof(header), s->bitmap,
> +            s->bitmap_size);
> +    if (ret != s->bitmap_size) {
> +        goto fail;
> +    }
> +
> +    if (s->image_file[0] == '\0') {
> +        ret = -ENOENT;
> +        goto fail;
> +    }
> +
> +    ret = bdrv_file_open(&backing_bs, bs->backing_file, 0);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    bdrv_delete(backing_bs);

What does this do?  (It leaks s->bitmap when it fails.)

> +
> +    s->image_hd = bdrv_new("");
> +
> +    if (path_has_protocol(s->image_file)) {
> +        pstrcpy(image_filename, sizeof(image_filename),
> +                s->image_file);
> +    } else {
> +        path_combine(image_filename, sizeof(image_filename),
> +                     bs->filename, s->image_file);
> +    }
> +
> +    image_drv = bdrv_find_format("raw");
> +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
> +    if (ret < 0) {
> +        bdrv_delete(s->image_hd);
> +        s->image_hd = NULL;
> +        goto fail;
> +    }

TODO

> +
> +    qemu_co_mutex_init(&s->lock);
> +    return 0;
> + fail:
> +    g_free(s->bitmap);
> +    return ret;
> +}
> +
> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
> +{
> +    uint64_t offset = bitnum >> 3;
> +    BDRVAddCowState *s = bs->opaque;
> +    s->bitmap[offset] |= (1 << (bitnum % 8));
> +}
> +
> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    uint64_t offset = bitnum >> 3;
> +    return !!(s->bitmap[offset] & (1 << (bitnum % 8)));
> +}

Please use bool instead of int.  Then you can also drop the !!.

> +
> +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
> +        int64_t sector_num, int nb_sectors, int *num_same)
> +{
> +    int changed;
> +    BDRVAddCowState *s = bs->opaque;
> +
> +    /* Beyond the end of bitmap, return error or read from backing_file? */
> +    if (((sector_num + nb_sectors + 7) >> 3) > s->bitmap_size) {
> +        return 0;
> +    }

If the bitmap size is tied to bs->total_sectors then you do not need
this check since bdrv_co_is_allocated() already handles this case (and
sets *num_same to 0).

> +
> +    if (nb_sectors == 0) {
> +        *num_same = nb_sectors;
> +        return 0;
> +    }
> +
> +    changed = is_bit_set(bs, sector_num);
> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
> +            break;
> +        }
> +    }
> +
> +    return changed;
> +}
> +
> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors)
> +{
> +    int i, ret = 0;
> +    bool changed = false;
> +    BDRVAddCowState *s = bs->opaque;
> +    uint64_t start_pos = (sector_num  >> 3) & (~511);
> +    uint64_t end_pos = (((sector_num + nb_sectors - 1)  >> 3) + 511) & (~511);
> +    uint64_t sectors_to_write = MIN(end_pos - start_pos,
> +                s->bitmap_size - start_pos);
> +
> +    if (start_pos > s->bitmap_size) {
> +        return 0;
> +    }
> +
> +    for (i = 0; i < nb_sectors; i++) {
> +        if (!changed) {
> +            changed = true;
> +        }
> +        add_cow_set_bit(bs, sector_num + i);
> +    }
> +
> +    if (changed) {
> +        ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + start_pos,
> +            s->bitmap + start_pos, sectors_to_write);
> +    }
> +    return ret;
> +}
> +
> +static void add_cow_close(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    g_free(s->bitmap);

We also need to free s->image_hd.

> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
> new file mode 100644
> index 0000000..33f358e
> --- /dev/null
> +++ b/docs/specs/add-cow.txt
> @@ -0,0 +1,72 @@
> +== General ==
> +
> +Raw file format does not support backing_file and copy on write feature. Then
> +you can use add-cow file to implement these features.

I suggest replacing the second sentence with these:

"The add-cow image format makes it possible to use backing files with raw
image by keeping a separate .add-cow metadata file.  Once all sectors
have been written to in the raw image it is safe to discard the .add-cow
and backing files and instead use the raw image directly."

They describe what add-cow does and how it can be used in a little more
detail.

> +
> +When using add-cow, procedures may like this:
> +(ubuntu.img is a disk image which has been installed OS.)
> +    1)  Create a raw image with the same size of ubuntu.img
> +            qemu-img create -f raw test.raw 8G
> +    2)  Create a add-cow image which will store dirty bitmap
> +            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
> +    3)  Run qemu with add-cow image
> +            qemu -drive if=virtio,file=test.add-cow
> +
> +While QEMU is running, virtual size of image_file and backing_file must be the
> +same. So if image_file does not have the same virtual size as backing_file's in
> +step 2), qemu-img will truncate it.

This limitation is unnecessary.  QCOW2 and QED will read zeroes if the
image is larger than the backing file.  Please implement the same
behavior so that add-cow behaves consistently with how users know QCOW2
and QED work.

> +
> +=Specification=
> +
> +The file format looks like this:
> +
> + +---------------+--------------------------+
> + |     Header    |           Data           |
> + +---------------+--------------------------+
> +
> +All numbers in add-cow are stored in Big Endian byte order.

New file formats and protocols often choose little endian because x86
and arm are very popular nowadays.  It's an arbitrary decision but I
just wanted to suggest it.

> +== Header ==
> +
> +The Header is included in the first bytes:
> +
> +    Byte  0 -  7:       magic
> +                        add-cow magic string ("ADD_COW\xff")
> +
> +          8 -  11:      version
> +                        Version number (only valid value is 1 now)
> +
> +          12 - 1035:    backing_file
> +                        backing_file file name related to add-cow file. While
> +                        using backing_file, must together with image_file. All
> +                        unused bytes are padded with zeros.

"While using backing_file, must together with image_file"

What does this mean?

> +
> +         1036 - 2059:   image_file
> +                        image_file is a raw file, While using image_file, must
> +                        together with image_file. All unused bytes are padded

"While using image_file, must together with image_file"

What does this mean?

> +                        with zeros.
> +
> +         2060 - 2067:   size
> +                        Virtual disk size of image_file in bytes.

Why is a field required to store the virtual disk size?  The image_file
size should always tell us the virtual disk image.

> +
> +         2068 - 2559:   The Reserved field is used to make sure Data field starts
> +                        at the multiple of 512, not used currently. All bytes are
> +                        filled with 0.
> +
> +== Data ==
> +
> +The Data field starts at the 2560th byte, stores a bitmap related to backing_file
> +and image_file. The bitmap will track whether the sector in backing_file is dirty
> +or not.
> +
> +
> +Each bit in the bitmap indicates one sector's status. So the size of bitmap is
> +calculated according to virtual size of backing_file. In each byte, bit 0 to 7
> +will track the 1st to 7th sector in sequence, bit orders in one byte look like:
> + +----+----+----+----+----+----+----+----+
> + | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
> + +----+----+----+----+----+----+----+----+
> +
> +If the bit is 0, indicates the sector has not been allocated in image_file, data
> +should be loaded from backing_file while reading; if the bit is 1,  indicates the
> +related sector has been dirty, should be loaded from image_file while reading.

Perhaps also add the following so we define both read and write
semantics:

Writing to a sector causes the corresponding bit to be set to 1.
Robert Wang - Dec. 31, 2011, 9:17 a.m.
On Fri, Dec 30, 2011 at 22:09, Stefan Hajnoczi <stefanha@gmail.com> wrote:

> On Thu, Dec 29, 2011 at 05:36:59PM +0800, Dong Xu Wang wrote:
>
> Some comments on everything but the I/O path, which I haven't reviewed
> yet:
>
> > diff --git a/block/add-cow.c b/block/add-cow.c
> > new file mode 100644
> > index 0000000..95af5b7
> > --- /dev/null
> > +++ b/block/add-cow.c
> > @@ -0,0 +1,429 @@
>
> Missing GPL or LGPL license header.
>
> > +#include "qemu-common.h"
> > +#include "block_int.h"
> > +#include "module.h"
> > +
> > +#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' <<
> 48) | \
> > +                            ((uint64_t)'D' << 40) | ((uint64_t)'_' <<
> 32) | \
> > +                            ((uint64_t)'C' << 24) | ((uint64_t)'O' <<
> 16) | \
> > +                            ((uint64_t)'W' << 8) | 0xFF)
> > +#define ADD_COW_VERSION     1
> > +#define ADD_COW_FILE_LEN    1024
> > +
> > +typedef struct AddCowHeader {
> > +    uint64_t        magic;
> > +    uint32_t        version;
> > +    char            backing_file[ADD_COW_FILE_LEN];
> > +    char            image_file[ADD_COW_FILE_LEN];
> > +    uint64_t        size;
> > +    char            reserved[492];
> > +} QEMU_PACKED AddCowHeader;
> > +
> > +typedef struct BDRVAddCowState {
> > +    char                image_file[ADD_COW_FILE_LEN];
>
> Why is this field needed?
>
> Yes, not needed.

>
> > +    BlockDriverState    *image_hd;
> > +    uint8_t             *bitmap;
> > +    uint64_t            bitmap_size;
> > +    CoMutex             lock;
> > +} BDRVAddCowState;
> > +
> > +static int add_cow_probe(const uint8_t *buf, int buf_size, const char
> *filename)
> > +{
> > +    const AddCowHeader *header = (const void *)buf;
> > +
> > +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
> > +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
> > +        return 100;
> > +    } else {
> > +        return 0;
> > +    }
> > +}
> > +
> > +static int add_cow_open(BlockDriverState *bs, int flags)
> > +{
> > +    AddCowHeader        header;
> > +    int64_t             size;
> > +    char                image_filename[ADD_COW_FILE_LEN];
> > +    BlockDriver         *image_drv = NULL;
> > +    int                 ret;
> > +    BDRVAddCowState     *s = bs->opaque;
> > +    BlockDriverState    *backing_bs = NULL;
> > +
> > +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
> > +    if (ret != sizeof(header)) {
> > +        goto fail;
> > +    }
> > +
> > +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC) {
> > +        ret = -EINVAL;
> > +        goto fail;
> > +    }
> > +    if (be32_to_cpu(header.version) != ADD_COW_VERSION) {
> > +        char version[64];
> > +        snprintf(version, sizeof(version), "ADD-COW version %d",
> header.version);
> > +        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
> > +            bs->device_name, "add-cow", version);
> > +        ret = -ENOTSUP;
> > +        goto fail;
> > +    }
> > +
> > +    size = be64_to_cpu(header.size);
> > +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
> > +
> > +    QEMU_BUILD_BUG_ON(sizeof(bs->backing_file) !=
> sizeof(header.backing_file));
> > +    QEMU_BUILD_BUG_ON(sizeof(s->image_file) !=
> sizeof(header.image_file));
> > +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
> > +            header.backing_file);
> > +    pstrcpy(s->image_file, sizeof(s->image_file),
> > +            header.image_file);
>
> This assumes that header.backing_file and header.image_file is
> NUL-terminated.  If the file happened to have the magic number and
> version but does not include '\0' bytes in the header.backing_file then
> we may crash here when trying to read beyond the end of the header
> struct.
>
> Image format code should be robust.  Please use strncpy(3) carefully
> instead of pstrcpy().
>
> Also please update the file format specification to either make these
> fields NUL-terminated or NUL-terminated unless the length is 1024
> characters (in which case there is no NUL but the string still ends).
>
Okay.

>
> > +
> > +    s->bitmap_size = ((bs->total_sectors + 7) >> 3);
> > +    s->bitmap = qemu_blockalign(bs, s->bitmap_size);
> > +
> > +    ret = bdrv_pread(bs->file, sizeof(header), s->bitmap,
> > +            s->bitmap_size);
> > +    if (ret != s->bitmap_size) {
> > +        goto fail;
> > +    }
> > +
> > +    if (s->image_file[0] == '\0') {
> > +        ret = -ENOENT;
> > +        goto fail;
> > +    }
> > +
> > +    ret = bdrv_file_open(&backing_bs, bs->backing_file, 0);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +    bdrv_delete(backing_bs);
>
> What does this do?  (It leaks s->bitmap when it fails.)
>

I wanna make sure backing_file exists while opening.

>
> > +
> > +    s->image_hd = bdrv_new("");
> > +
> > +    if (path_has_protocol(s->image_file)) {
> > +        pstrcpy(image_filename, sizeof(image_filename),
> > +                s->image_file);
> > +    } else {
> > +        path_combine(image_filename, sizeof(image_filename),
> > +                     bs->filename, s->image_file);
> > +    }
> > +
> > +    image_drv = bdrv_find_format("raw");
> > +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
> > +    if (ret < 0) {
> > +        bdrv_delete(s->image_hd);
> > +        s->image_hd = NULL;
> > +        goto fail;
> > +    }
>
> TODO
>
> > +
> > +    qemu_co_mutex_init(&s->lock);
> > +    return 0;
> > + fail:
> > +    g_free(s->bitmap);
> > +    return ret;
> > +}
> > +
> > +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
> > +{
> > +    uint64_t offset = bitnum >> 3;
> > +    BDRVAddCowState *s = bs->opaque;
> > +    s->bitmap[offset] |= (1 << (bitnum % 8));
> > +}
> > +
> > +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
> > +{
> > +    BDRVAddCowState *s = bs->opaque;
> > +    uint64_t offset = bitnum >> 3;
> > +    return !!(s->bitmap[offset] & (1 << (bitnum % 8)));
> > +}
>
> Please use bool instead of int.  Then you can also drop the !!.
>
Okay.

>
> > +
> > +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
> > +        int64_t sector_num, int nb_sectors, int *num_same)
> > +{
> > +    int changed;
> > +    BDRVAddCowState *s = bs->opaque;
> > +
> > +    /* Beyond the end of bitmap, return error or read from
> backing_file? */
> > +    if (((sector_num + nb_sectors + 7) >> 3) > s->bitmap_size) {
> > +        return 0;
> > +    }
>
> If the bitmap size is tied to bs->total_sectors then you do not need
> this check since bdrv_co_is_allocated() already handles this case (and
> sets *num_same to 0).
>
> > +
> > +    if (nb_sectors == 0) {
> > +        *num_same = nb_sectors;
> > +        return 0;
> > +    }
> > +
> > +    changed = is_bit_set(bs, sector_num);
> > +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
> > +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
> > +            break;
> > +        }
> > +    }
> > +
> > +    return changed;
> > +}
> > +
> > +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t
> sector_num,
> > +        int nb_sectors)
> > +{
> > +    int i, ret = 0;
> > +    bool changed = false;
> > +    BDRVAddCowState *s = bs->opaque;
> > +    uint64_t start_pos = (sector_num  >> 3) & (~511);
> > +    uint64_t end_pos = (((sector_num + nb_sectors - 1)  >> 3) + 511) &
> (~511);
> > +    uint64_t sectors_to_write = MIN(end_pos - start_pos,
> > +                s->bitmap_size - start_pos);
> > +
> > +    if (start_pos > s->bitmap_size) {
> > +        return 0;
> > +    }
> > +
> > +    for (i = 0; i < nb_sectors; i++) {
> > +        if (!changed) {
> > +            changed = true;
> > +        }
> > +        add_cow_set_bit(bs, sector_num + i);
> > +    }
> > +
> > +    if (changed) {
> > +        ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + start_pos,
> > +            s->bitmap + start_pos, sectors_to_write);
> > +    }
> > +    return ret;
> > +}
> > +
> > +static void add_cow_close(BlockDriverState *bs)
> > +{
> > +    BDRVAddCowState *s = bs->opaque;
> > +    g_free(s->bitmap);
>
> We also need to free s->image_hd.
>
>
Yep.

> > diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
> > new file mode 100644
> > index 0000000..33f358e
> > --- /dev/null
> > +++ b/docs/specs/add-cow.txt
> > @@ -0,0 +1,72 @@
> > +== General ==
> > +
> > +Raw file format does not support backing_file and copy on write
> feature. Then
> > +you can use add-cow file to implement these features.
>
> I suggest replacing the second sentence with these:
>
> "The add-cow image format makes it possible to use backing files with raw
> image by keeping a separate .add-cow metadata file.  Once all sectors
> have been written to in the raw image it is safe to discard the .add-cow
> and backing files and instead use the raw image directly."
>
> They describe what add-cow does and how it can be used in a little more
> detail.
>
> > +
> > +When using add-cow, procedures may like this:
> > +(ubuntu.img is a disk image which has been installed OS.)
> > +    1)  Create a raw image with the same size of ubuntu.img
> > +            qemu-img create -f raw test.raw 8G
> > +    2)  Create a add-cow image which will store dirty bitmap
> > +            qemu-img create -f add-cow test.add-cow -o
> backing_file=ubuntu.img,image_file=test.raw
> > +    3)  Run qemu with add-cow image
> > +            qemu -drive if=virtio,file=test.add-cow
> > +
> > +While QEMU is running, virtual size of image_file and backing_file must
> be the
> > +same. So if image_file does not have the same virtual size as
> backing_file's in
> > +step 2), qemu-img will truncate it.
>
> This limitation is unnecessary.  QCOW2 and QED will read zeroes if the
> image is larger than the backing file.  Please implement the same
> behavior so that add-cow behaves consistently with how users know QCOW2
> and QED work.
>
>
Okay.

> > +
> > +=Specification=
> > +
> > +The file format looks like this:
> > +
> > + +---------------+--------------------------+
> > + |     Header    |           Data           |
> > + +---------------+--------------------------+
> > +
> > +All numbers in add-cow are stored in Big Endian byte order.
>
> New file formats and protocols often choose little endian because x86
> and arm are very popular nowadays.  It's an arbitrary decision but I
> just wanted to suggest it.
>
> > +== Header ==
> > +
> > +The Header is included in the first bytes:
> > +
> > +    Byte  0 -  7:       magic
> > +                        add-cow magic string ("ADD_COW\xff")
> > +
> > +          8 -  11:      version
> > +                        Version number (only valid value is 1 now)
> > +
> > +          12 - 1035:    backing_file
> > +                        backing_file file name related to add-cow file.
> While
> > +                        using backing_file, must together with
> image_file. All
> > +                        unused bytes are padded with zeros.
>
> "While using backing_file, must together with image_file"
>
> What does this mean?
>
> > +
> > +         1036 - 2059:   image_file
> > +                        image_file is a raw file, While using
> image_file, must
> > +                        together with image_file. All unused bytes are
> padded
>
> "While using image_file, must together with image_file"
>
> What does this mean?
>

I mean while using add-cow, must together with image_file and backing_file.
Both of them can not be missing.
Errors with sentences like that, I will correct them in v7.

>
> > +                        with zeros.
> > +
> > +         2060 - 2067:   size
> > +                        Virtual disk size of image_file in bytes.
>
> Why is a field required to store the virtual disk size?  The image_file
> size should always tell us the virtual disk image.
>
> > +
> > +         2068 - 2559:   The Reserved field is used to make sure Data
> field starts
> > +                        at the multiple of 512, not used currently. All
> bytes are
> > +                        filled with 0.
> > +
> > +== Data ==
> > +
> > +The Data field starts at the 2560th byte, stores a bitmap related to
> backing_file
> > +and image_file. The bitmap will track whether the sector in
> backing_file is dirty
> > +or not.
> > +
> > +
> > +Each bit in the bitmap indicates one sector's status. So the size of
> bitmap is
> > +calculated according to virtual size of backing_file. In each byte, bit
> 0 to 7
> > +will track the 1st to 7th sector in sequence, bit orders in one byte
> look like:
> > + +----+----+----+----+----+----+----+----+
> > + | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
> > + +----+----+----+----+----+----+----+----+
> > +
> > +If the bit is 0, indicates the sector has not been allocated in
> image_file, data
> > +should be loaded from backing_file while reading; if the bit is 1,
>  indicates the
> > +related sector has been dirty, should be loaded from image_file while
> reading.
>
> Perhaps also add the following so we define both read and write
> semantics:
>
> Writing to a sector causes the corresponding bit to be set to 1.
>
> Thanks for your reviewing.
Stefan Hajnoczi - Jan. 1, 2012, 4:56 p.m.
On Sat, Dec 31, 2011 at 9:17 AM, Dong Xu Wang
<wdongxu@linux.vnet.ibm.com> wrote:
> On Fri, Dec 30, 2011 at 22:09, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Thu, Dec 29, 2011 at 05:36:59PM +0800, Dong Xu Wang wrote:
>> > +    ret = bdrv_file_open(&backing_bs, bs->backing_file, 0);
>> > +    if (ret < 0) {
>> > +        return ret;
>> > +    }
>> > +    bdrv_delete(backing_bs);
>>
>> What does this do?  (It leaks s->bitmap when it fails.)
>
>
> I wanna make sure backing_file exists while opening.

block.c:bdrv_open() opens the backing file after .bdrv_open() has
returned.  It fails if the backing file does not exist.  There's no
need to duplicate this.

>> > +    s->image_hd = bdrv_new("");
>> > +
>> > +    if (path_has_protocol(s->image_file)) {
>> > +        pstrcpy(image_filename, sizeof(image_filename),
>> > +                s->image_file);
>> > +    } else {
>> > +        path_combine(image_filename, sizeof(image_filename),
>> > +                     bs->filename, s->image_file);
>> > +    }
>> > +
>> > +    image_drv = bdrv_find_format("raw");
>> > +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
>> > +    if (ret < 0) {
>> > +        bdrv_delete(s->image_hd);
>> > +        s->image_hd = NULL;
>> > +        goto fail;
>> > +    }
>>
>> TODO

If you were wondering why I put "TODO" here it's because I had some
thoughts when reviewing but didn't fully investigate it yet.  When I
send the rest of my feedback I'll include my comment here.  (I should
have removed this before replying :))

>> > +         1036 - 2059:   image_file
>> > +                        image_file is a raw file, While using
>> > image_file, must
>> > +                        together with image_file. All unused bytes are
>> > padded
>>
>> "While using image_file, must together with image_file"
>>
>> What does this mean?
>
>
> I mean while using add-cow, must together with image_file and backing_file.
> Both of them can not be missing.
> Errors with sentences like that, I will correct them in v7.

That sounds like qemu-img create behavior which should not be part of
the file format specification.

The only impact on the file format speficiation is that backing_file
can be an empty string but image_file must always be a valid filename.
Marcelo Tosatti - Jan. 5, 2012, 3:46 p.m.
On Thu, Dec 29, 2011 at 05:36:59PM +0800, Dong Xu Wang wrote:
> From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> 
> Introduce a new file format: add-cow. The usage can be found in add-cow.txt of
> this patch.
> 
> CC: Kevin Wolf <kwolf@redhat.com>
> CC: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> ---
> After applying this patch, qemu might can not compile, need apply this patch first:
> http://lists.gnu.org/archive/html/qemu-devel/2011-12/msg02527.html
> 
>  Makefile.objs          |    1 +
>  block.c                |    2 +-
>  block.h                |    1 +
>  block/add-cow.c        |  429 ++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h            |    1 +
>  docs/specs/add-cow.txt |   72 ++++++++
>  6 files changed, 505 insertions(+), 1 deletions(-)
>  create mode 100644 block/add-cow.c
>  create mode 100644 docs/specs/add-cow.txt
> 


> +    s->bitmap_size = ((bs->total_sectors + 7) >> 3);
> +    s->bitmap = qemu_blockalign(bs, s->bitmap_size);
> +
> +    ret = bdrv_pread(bs->file, sizeof(header), s->bitmap,
> +            s->bitmap_size);
> +    if (ret != s->bitmap_size) {
> +        goto fail;
> +    }

As noted previously, it is not acceptable to read the entire bitmap in
memory since it might be very large. A cache, which limits the in-memory
size of the bitmap, must be created. In the qcow2-cache.c file you can 
find an example (thats for qcow2 metadata cache). You can divide the
bitmap in chunks of say, 4k, and have:

int is_bit_set(int64_t bitnum, BlockDriverState *bs)
{
    int64_t bitmap_entry = bitnum >> bits_per_entry;

    if (!is_in_bitmap_cache(bs, bitmap_entry))
        read_from_disk(bs, bitmap_entry);

    return lookup_bitmap_cache(bs, bitnum);
}

And then limit the cache to a few megabytes.

Also when setting a bit you must update cache and write
to disk.

> +
> +    if (s->image_file[0] == '\0') {
> +        ret = -ENOENT;
> +        goto fail;
> +    }
> +
> +    ret = bdrv_file_open(&backing_bs, bs->backing_file, 0);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    bdrv_delete(backing_bs);
> +
> +    s->image_hd = bdrv_new("");
> +
> +    if (path_has_protocol(s->image_file)) {
> +        pstrcpy(image_filename, sizeof(image_filename),
> +                s->image_file);
> +    } else {
> +        path_combine(image_filename, sizeof(image_filename),
> +                     bs->filename, s->image_file);
> +    }
> +
> +    image_drv = bdrv_find_format("raw");
> +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
> +    if (ret < 0) {
> +        bdrv_delete(s->image_hd);
> +        s->image_hd = NULL;
> +        goto fail;
> +    }

Please make sure it is possible to create a snapshot with the
snapshot_blkdev command, of a raw image. It is necessary for live block
copy, as described here:

http://patchwork.ozlabs.org/patch/134257/

Also please update that document, later, with raw examples.

Thanks
Stefan Hajnoczi - Jan. 6, 2012, 8:22 a.m.
On Thu, Jan 05, 2012 at 01:46:08PM -0200, Marcelo Tosatti wrote:
> On Thu, Dec 29, 2011 at 05:36:59PM +0800, Dong Xu Wang wrote:
> > From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> > 
> > Introduce a new file format: add-cow. The usage can be found in add-cow.txt of
> > this patch.
> > 
> > CC: Kevin Wolf <kwolf@redhat.com>
> > CC: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> > Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> > ---
> > After applying this patch, qemu might can not compile, need apply this patch first:
> > http://lists.gnu.org/archive/html/qemu-devel/2011-12/msg02527.html
> > 
> >  Makefile.objs          |    1 +
> >  block.c                |    2 +-
> >  block.h                |    1 +
> >  block/add-cow.c        |  429 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  block_int.h            |    1 +
> >  docs/specs/add-cow.txt |   72 ++++++++
> >  6 files changed, 505 insertions(+), 1 deletions(-)
> >  create mode 100644 block/add-cow.c
> >  create mode 100644 docs/specs/add-cow.txt
> > 
> 
> 
> > +    s->bitmap_size = ((bs->total_sectors + 7) >> 3);
> > +    s->bitmap = qemu_blockalign(bs, s->bitmap_size);
> > +
> > +    ret = bdrv_pread(bs->file, sizeof(header), s->bitmap,
> > +            s->bitmap_size);
> > +    if (ret != s->bitmap_size) {
> > +        goto fail;
> > +    }
> 
> As noted previously, it is not acceptable to read the entire bitmap in
> memory since it might be very large. A cache, which limits the in-memory
> size of the bitmap, must be created. In the qcow2-cache.c file you can 
> find an example (thats for qcow2 metadata cache). You can divide the
> bitmap in chunks of say, 4k, and have:
> 
> int is_bit_set(int64_t bitnum, BlockDriverState *bs)
> {
>     int64_t bitmap_entry = bitnum >> bits_per_entry;
> 
>     if (!is_in_bitmap_cache(bs, bitmap_entry))
>         read_from_disk(bs, bitmap_entry);
> 
>     return lookup_bitmap_cache(bs, bitnum);
> }
> 
> And then limit the cache to a few megabytes.
> 
> Also when setting a bit you must update cache and write
> to disk.

I suspect it's also better to increase the bitmap granularity.  The
bitmap should track allocation at a larger "cluster" size like 64 KB.
That way we reduce the number of I/O operations required to update
metadata - it reduces the amount of metadata by a factor of 65536 / 512
= 128.

If you imagine a random write workload with 4 KB block size there is an
advantage to a 64 KB cluster size since later I/Os may require no bitmap
updates where we already allocated a cluster in an earlier operation.

The downside of a larger bitmap granularity is that writes are increased
to 64 KB, but if you run a benchmark I guess there is a threshold around
32 or 64 KB where the reduction in I/O operations makes up for the
larger I/O size.  It depends on your disks.

Stefan
Robert Wang - Jan. 9, 2012, 5:18 a.m.
Okay, I will consider your suggestion in version 7.
Thank you Marcelo and Stefan, :).


On Fri, Jan 6, 2012 at 16:22, Stefan Hajnoczi
<stefanha@linux.vnet.ibm.com>wrote:

> On Thu, Jan 05, 2012 at 01:46:08PM -0200, Marcelo Tosatti wrote:
> > On Thu, Dec 29, 2011 at 05:36:59PM +0800, Dong Xu Wang wrote:
> > > From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> > >
> > > Introduce a new file format: add-cow. The usage can be found in
> add-cow.txt of
> > > this patch.
> > >
> > > CC: Kevin Wolf <kwolf@redhat.com>
> > > CC: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> > > Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> > > ---
> > > After applying this patch, qemu might can not compile, need apply this
> patch first:
> > > http://lists.gnu.org/archive/html/qemu-devel/2011-12/msg02527.html
> > >
> > >  Makefile.objs          |    1 +
> > >  block.c                |    2 +-
> > >  block.h                |    1 +
> > >  block/add-cow.c        |  429
> ++++++++++++++++++++++++++++++++++++++++++++++++
> > >  block_int.h            |    1 +
> > >  docs/specs/add-cow.txt |   72 ++++++++
> > >  6 files changed, 505 insertions(+), 1 deletions(-)
> > >  create mode 100644 block/add-cow.c
> > >  create mode 100644 docs/specs/add-cow.txt
> > >
> >
> >
> > > +    s->bitmap_size = ((bs->total_sectors + 7) >> 3);
> > > +    s->bitmap = qemu_blockalign(bs, s->bitmap_size);
> > > +
> > > +    ret = bdrv_pread(bs->file, sizeof(header), s->bitmap,
> > > +            s->bitmap_size);
> > > +    if (ret != s->bitmap_size) {
> > > +        goto fail;
> > > +    }
> >
> > As noted previously, it is not acceptable to read the entire bitmap in
> > memory since it might be very large. A cache, which limits the in-memory
> > size of the bitmap, must be created. In the qcow2-cache.c file you can
> > find an example (thats for qcow2 metadata cache). You can divide the
> > bitmap in chunks of say, 4k, and have:
> >
> > int is_bit_set(int64_t bitnum, BlockDriverState *bs)
> > {
> >     int64_t bitmap_entry = bitnum >> bits_per_entry;
> >
> >     if (!is_in_bitmap_cache(bs, bitmap_entry))
> >         read_from_disk(bs, bitmap_entry);
> >
> >     return lookup_bitmap_cache(bs, bitnum);
> > }
> >
> > And then limit the cache to a few megabytes.
> >
> > Also when setting a bit you must update cache and write
> > to disk.
>
> I suspect it's also better to increase the bitmap granularity.  The
> bitmap should track allocation at a larger "cluster" size like 64 KB.
> That way we reduce the number of I/O operations required to update
> metadata - it reduces the amount of metadata by a factor of 65536 / 512
> = 128.
>
> If you imagine a random write workload with 4 KB block size there is an
> advantage to a 64 KB cluster size since later I/Os may require no bitmap
> updates where we already allocated a cluster in an earlier operation.
>
> The downside of a larger bitmap granularity is that writes are increased
> to 64 KB, but if you run a benchmark I guess there is a threshold around
> 32 or 64 KB where the reduction in I/O operations makes up for the
> larger I/O size.  It depends on your disks.
>
> Stefan
>
>
>

Patch

diff --git a/Makefile.objs b/Makefile.objs
index f753d83..23fab70 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -31,6 +31,7 @@  block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-nested-y += add-cow.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
diff --git a/block.c b/block.c
index aa9d142..0f52a67 100644
--- a/block.c
+++ b/block.c
@@ -187,7 +187,7 @@  static void bdrv_io_limits_intercept(BlockDriverState *bs,
 }
 
 /* check if the path starts with "<protocol>:" */
-static int path_has_protocol(const char *path)
+int path_has_protocol(const char *path)
 {
 #ifdef _WIN32
     if (is_windows_drive(path) ||
diff --git a/block.h b/block.h
index 0e3ff9f..f5bf078 100644
--- a/block.h
+++ b/block.h
@@ -289,6 +289,7 @@  char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
 
 char *get_human_readable_size(char *buf, int buf_size, int64_t size);
 int path_is_absolute(const char *path);
+int path_has_protocol(const char *path);
 void path_combine(char *dest, int dest_size,
                   const char *base_path,
                   const char *filename);
diff --git a/block/add-cow.c b/block/add-cow.c
new file mode 100644
index 0000000..95af5b7
--- /dev/null
+++ b/block/add-cow.c
@@ -0,0 +1,429 @@ 
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
+                            ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
+                            ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
+                            ((uint64_t)'W' << 8) | 0xFF)
+#define ADD_COW_VERSION     1
+#define ADD_COW_FILE_LEN    1024
+
+typedef struct AddCowHeader {
+    uint64_t        magic;
+    uint32_t        version;
+    char            backing_file[ADD_COW_FILE_LEN];
+    char            image_file[ADD_COW_FILE_LEN];
+    uint64_t        size;
+    char            reserved[492];
+} QEMU_PACKED AddCowHeader;
+
+typedef struct BDRVAddCowState {
+    char                image_file[ADD_COW_FILE_LEN];
+    BlockDriverState    *image_hd;
+    uint8_t             *bitmap;
+    uint64_t            bitmap_size;
+    CoMutex             lock;
+} BDRVAddCowState;
+
+static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const AddCowHeader *header = (const void *)buf;
+
+    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
+        be32_to_cpu(header->version) == ADD_COW_VERSION) {
+        return 100;
+    } else {
+        return 0;
+    }
+}
+
+static int add_cow_open(BlockDriverState *bs, int flags)
+{
+    AddCowHeader        header;
+    int64_t             size;
+    char                image_filename[ADD_COW_FILE_LEN];
+    BlockDriver         *image_drv = NULL;
+    int                 ret;
+    BDRVAddCowState     *s = bs->opaque;
+    BlockDriverState    *backing_bs = NULL;
+
+    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+    if (ret != sizeof(header)) {
+        goto fail;
+    }
+
+    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC) {
+        ret = -EINVAL;
+        goto fail;
+    }
+    if (be32_to_cpu(header.version) != ADD_COW_VERSION) {
+        char version[64];
+        snprintf(version, sizeof(version), "ADD-COW version %d", header.version);
+        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+            bs->device_name, "add-cow", version);
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    size = be64_to_cpu(header.size);
+    bs->total_sectors = size / BDRV_SECTOR_SIZE;
+
+    QEMU_BUILD_BUG_ON(sizeof(bs->backing_file) != sizeof(header.backing_file));
+    QEMU_BUILD_BUG_ON(sizeof(s->image_file) != sizeof(header.image_file));
+    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
+            header.backing_file);
+    pstrcpy(s->image_file, sizeof(s->image_file),
+            header.image_file);
+
+    s->bitmap_size = ((bs->total_sectors + 7) >> 3);
+    s->bitmap = qemu_blockalign(bs, s->bitmap_size);
+
+    ret = bdrv_pread(bs->file, sizeof(header), s->bitmap,
+            s->bitmap_size);
+    if (ret != s->bitmap_size) {
+        goto fail;
+    }
+
+    if (s->image_file[0] == '\0') {
+        ret = -ENOENT;
+        goto fail;
+    }
+
+    ret = bdrv_file_open(&backing_bs, bs->backing_file, 0);
+    if (ret < 0) {
+        return ret;
+    }
+    bdrv_delete(backing_bs);
+
+    s->image_hd = bdrv_new("");
+
+    if (path_has_protocol(s->image_file)) {
+        pstrcpy(image_filename, sizeof(image_filename),
+                s->image_file);
+    } else {
+        path_combine(image_filename, sizeof(image_filename),
+                     bs->filename, s->image_file);
+    }
+
+    image_drv = bdrv_find_format("raw");
+    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
+    if (ret < 0) {
+        bdrv_delete(s->image_hd);
+        s->image_hd = NULL;
+        goto fail;
+    }
+
+    qemu_co_mutex_init(&s->lock);
+    return 0;
+ fail:
+    g_free(s->bitmap);
+    return ret;
+}
+
+static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
+{
+    uint64_t offset = bitnum >> 3;
+    BDRVAddCowState *s = bs->opaque;
+    s->bitmap[offset] |= (1 << (bitnum % 8));
+}
+
+static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
+{
+    BDRVAddCowState *s = bs->opaque;
+    uint64_t offset = bitnum >> 3;
+    return !!(s->bitmap[offset] & (1 << (bitnum % 8)));
+}
+
+static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *num_same)
+{
+    int changed;
+    BDRVAddCowState *s = bs->opaque;
+
+    /* Beyond the end of bitmap, return error or read from backing_file? */
+    if (((sector_num + nb_sectors + 7) >> 3) > s->bitmap_size) {
+        return 0;
+    }
+
+    if (nb_sectors == 0) {
+        *num_same = nb_sectors;
+        return 0;
+    }
+
+    changed = is_bit_set(bs, sector_num);
+    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
+        if (is_bit_set(bs, sector_num + *num_same) != changed) {
+            break;
+        }
+    }
+
+    return changed;
+}
+
+static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
+        int nb_sectors)
+{
+    int i, ret = 0;
+    bool changed = false;
+    BDRVAddCowState *s = bs->opaque;
+    uint64_t start_pos = (sector_num  >> 3) & (~511);
+    uint64_t end_pos = (((sector_num + nb_sectors - 1)  >> 3) + 511) & (~511);
+    uint64_t sectors_to_write = MIN(end_pos - start_pos,
+                s->bitmap_size - start_pos);
+
+    if (start_pos > s->bitmap_size) {
+        return 0;
+    }
+
+    for (i = 0; i < nb_sectors; i++) {
+        if (!changed) {
+            changed = true;
+        }
+        add_cow_set_bit(bs, sector_num + i);
+    }
+
+    if (changed) {
+        ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + start_pos,
+            s->bitmap + start_pos, sectors_to_write);
+    }
+    return ret;
+}
+
+static void add_cow_close(BlockDriverState *bs)
+{
+    BDRVAddCowState *s = bs->opaque;
+    g_free(s->bitmap);
+}
+
+static int add_cow_create(const char *filename, QEMUOptionParameter *options)
+{
+    AddCowHeader header;
+    int64_t image_sectors = 0;
+    const char *backing_filename = NULL;
+    const char *image_filename = NULL;
+    int ret;
+    BlockDriverState *bs, *image_bs = NULL, *backing_bs = NULL;
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_filename = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
+            image_filename = options->value.s;
+        }
+        options++;
+    }
+
+    if (!backing_filename || !image_filename) {
+        error_report("Both backing_file and image_file should be given.");
+        return -EINVAL;
+    }
+
+    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
+            | BDRV_O_CACHE_WB);
+    if (ret < 0) {
+        return ret;
+    }
+    bdrv_delete(image_bs);
+
+    ret = bdrv_file_open(&backing_bs, backing_filename, BDRV_O_RDWR
+            | BDRV_O_CACHE_WB);
+    if (ret < 0) {
+        return ret;
+    }
+    bdrv_delete(backing_bs);
+
+    ret = bdrv_create_file(filename, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+    if (ret < 0) {
+        return ret;
+    }
+
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be64(ADD_COW_MAGIC);
+    header.version = cpu_to_be32(ADD_COW_VERSION);
+    pstrcpy(header.backing_file, sizeof(header.backing_file), backing_filename);
+    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
+    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
+
+    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
+    if (ret < 0) {
+        bdrv_delete(bs);
+        return ret;
+    }
+
+    BlockDriver *drv = bdrv_find_format("add-cow");
+    assert(drv != NULL);
+    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
+    if (ret < 0) {
+        bdrv_delete(bs);
+        return ret;
+    }
+
+    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
+    bdrv_delete(bs);
+    return ret;
+}
+
+static coroutine_fn int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
+                         int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int cur_nr_sectors;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+    int n, ret = 0;
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    while (remaining_sectors != 0) {
+        cur_nr_sectors = remaining_sectors;
+        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
+            cur_nr_sectors = n;
+            qemu_iovec_reset(&hd_qiov);
+            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
+            if (ret < 0) {
+                goto fail;
+            }
+        } else {
+            cur_nr_sectors = n;
+            if (bs->backing_hd) {
+                qemu_iovec_reset(&hd_qiov);
+                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+                ret = bdrv_co_readv(bs->backing_hd, sector_num,
+                                    n, &hd_qiov);
+                if (ret < 0) {
+                    goto fail;
+                }
+            } else {
+                qemu_iovec_reset(&hd_qiov);
+            }
+        }
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
+                          int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret = 0;
+    QEMUIOVector hd_qiov;
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    qemu_iovec_reset(&hd_qiov);
+    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
+    ret = bdrv_co_writev(s->image_hd,
+                     sector_num,
+                     remaining_sectors, &hd_qiov);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
+    if (ret < 0) {
+        goto fail;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
+{
+    int ret = 0;
+    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
+    int64_t be_offset = cpu_to_be64(offset);
+    BDRVAddCowState *s = bs->opaque;
+    int64_t old_image_sector = s->image_hd->total_sectors;
+
+    ret = bdrv_truncate(s->image_hd, offset);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
+            + sizeof(AddCowHeader));
+    if (ret < 0) {
+        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
+        return ret;
+    }
+
+    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
+        &be_offset, sizeof(uint64_t));
+    if (ret < 0) {
+        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
+    }
+
+    return ret;
+}
+
+static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret = bdrv_co_flush(s->image_hd);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return bdrv_co_flush(bs->file);
+}
+
+static QEMUOptionParameter add_cow_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    },
+    {
+        .name = BLOCK_OPT_IMAGE_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a image file"
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_add_cow = {
+    .format_name                = "add-cow",
+    .instance_size              = sizeof(BDRVAddCowState),
+    .bdrv_probe                 = add_cow_probe,
+    .bdrv_open                  = add_cow_open,
+    .bdrv_close                 = add_cow_close,
+    .bdrv_create                = add_cow_create,
+    .bdrv_co_is_allocated       = add_cow_is_allocated,
+
+    .bdrv_co_readv              = add_cow_co_readv,
+    .bdrv_co_writev             = add_cow_co_writev,
+    .bdrv_truncate              = bdrv_add_cow_truncate,
+
+    .create_options             = add_cow_create_options,
+    .bdrv_co_flush_to_disk      = add_cow_co_flush,
+};
+
+static void bdrv_add_cow_init(void)
+{
+    bdrv_register(&bdrv_add_cow);
+}
+
+block_init(bdrv_add_cow_init);
diff --git a/block_int.h b/block_int.h
index 311bd2a..2384e76 100644
--- a/block_int.h
+++ b/block_int.h
@@ -50,6 +50,7 @@ 
 #define BLOCK_OPT_TABLE_SIZE    "table_size"
 #define BLOCK_OPT_PREALLOC      "preallocation"
 #define BLOCK_OPT_SUBFMT        "subformat"
+#define BLOCK_OPT_IMAGE_FILE    "image_file"
 
 typedef struct BdrvTrackedRequest BdrvTrackedRequest;
 
diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
new file mode 100644
index 0000000..33f358e
--- /dev/null
+++ b/docs/specs/add-cow.txt
@@ -0,0 +1,72 @@ 
+== General ==
+
+Raw file format does not support backing_file and copy on write feature. Then
+you can use add-cow file to implement these features.
+
+When using add-cow, procedures may like this:
+(ubuntu.img is a disk image which has been installed OS.)
+    1)  Create a raw image with the same size of ubuntu.img
+            qemu-img create -f raw test.raw 8G
+    2)  Create a add-cow image which will store dirty bitmap
+            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
+    3)  Run qemu with add-cow image
+            qemu -drive if=virtio,file=test.add-cow
+
+While QEMU is running, virtual size of image_file and backing_file must be the
+same. So if image_file does not have the same virtual size as backing_file's in
+step 2), qemu-img will truncate it.
+
+=Specification=
+
+The file format looks like this:
+
+ +---------------+--------------------------+
+ |     Header    |           Data           |
+ +---------------+--------------------------+
+
+All numbers in add-cow are stored in Big Endian byte order.
+
+== Header ==
+
+The Header is included in the first bytes:
+
+    Byte  0 -  7:       magic
+                        add-cow magic string ("ADD_COW\xff")
+
+          8 -  11:      version
+                        Version number (only valid value is 1 now)
+
+          12 - 1035:    backing_file
+                        backing_file file name related to add-cow file. While
+                        using backing_file, must together with image_file. All
+                        unused bytes are padded with zeros.
+
+         1036 - 2059:   image_file
+                        image_file is a raw file, While using image_file, must
+                        together with image_file. All unused bytes are padded
+                        with zeros.
+
+         2060 - 2067:   size
+                        Virtual disk size of image_file in bytes.
+
+         2068 - 2559:   The Reserved field is used to make sure Data field starts
+                        at the multiple of 512, not used currently. All bytes are
+                        filled with 0.
+
+== Data ==
+
+The Data field starts at the 2560th byte, stores a bitmap related to backing_file
+and image_file. The bitmap will track whether the sector in backing_file is dirty
+or not.
+
+
+Each bit in the bitmap indicates one sector's status. So the size of bitmap is
+calculated according to virtual size of backing_file. In each byte, bit 0 to 7
+will track the 1st to 7th sector in sequence, bit orders in one byte look like:
+ +----+----+----+----+----+----+----+----+
+ | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
+ +----+----+----+----+----+----+----+----+
+
+If the bit is 0, indicates the sector has not been allocated in image_file, data
+should be loaded from backing_file while reading; if the bit is 1,  indicates the
+related sector has been dirty, should be loaded from image_file while reading.