Patchwork [1/2,v7] block: add-cow file format

login
register
mail settings
Submitter Robert Wang
Date March 1, 2012, 2:56 a.m.
Message ID <1330570610-8523-1-git-send-email-wdongxu@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/143890/
State New
Headers show

Comments

Robert Wang - March 1, 2012, 2:56 a.m.
From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>


Provide a new file format: add-cow. The usage can be found in add-cow.txt of
this patch.

CC: Marcelo Tosatti <mtosatti@redhat.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
---
 Makefile.objs          |    1 +
 block.c                |    2 +-
 block.h                |    1 +
 block/add-cow-cache.c  |  171 ++++++++++++++++++++
 block/add-cow.c        |  402 ++++++++++++++++++++++++++++++++++++++++++++++++
 block_int.h            |    1 +
 docs/specs/add-cow.txt |   68 ++++++++
 7 files changed, 645 insertions(+), 1 deletions(-)
 create mode 100644 block/add-cow-cache.c
 create mode 100644 block/add-cow.c
 create mode 100644 docs/specs/add-cow.txt
Stefan Hajnoczi - March 6, 2012, 4:59 p.m.
On Thu, Mar 1, 2012 at 2:56 AM, Dong Xu Wang <wdongxu@linux.vnet.ibm.com> wrote:
> diff --git a/block/add-cow-cache.c b/block/add-cow-cache.c
> new file mode 100644
> index 0000000..6be02ff
> --- /dev/null
> +++ b/block/add-cow-cache.c
> @@ -0,0 +1,171 @@
> +/*
> + * Cache For QEMU ADD-COW Disk Format
> + *
> + * Copyright IBM, Corp. 2012
> + *
> + * Authors:
> + *  Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#include "block_int.h"
> +#include "qemu-common.h"
> +#include "add-cow.h"

This patch is missing add-cow.h.

> diff --git a/block/add-cow.c b/block/add-cow.c
> new file mode 100644
> index 0000000..6897a52
> --- /dev/null
> +++ b/block/add-cow.c
> @@ -0,0 +1,402 @@
> +/*
> + * QEMU ADD-COW Disk Format
> + *
> + * Copyright IBM, Corp. 2012
> + *
> + * Authors:
> + *  Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "module.h"
> +#include "add-cow.h"
> +
> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
> +{
> +    const AddCowHeader *header = (const void *)buf;

Minor coding style comment: please cast to const AddCowHeader* instead of void.

> +
> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
> +        return 100;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static int add_cow_open(BlockDriverState *bs, int flags)
> +{
> +    AddCowHeader        header;
> +    char                image_filename[ADD_COW_FILE_LEN];
> +    BlockDriver         *image_drv = NULL;
> +    int                 ret;
> +    BDRVAddCowState     *s = bs->opaque;
> +
> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
> +    if (ret != sizeof(header)) {
> +        goto fail;
> +    }
> +
> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +    if (be32_to_cpu(header.version) != ADD_COW_VERSION) {
> +        char version[64];
> +        snprintf(version, sizeof(version), "ADD-COW version %d", header.version);

be32_to_cpu(header.version)

> +        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
> +            bs->device_name, "add-cow", version);
> +        ret = -ENOTSUP;
> +        goto fail;
> +    }
> +
> +    QEMU_BUILD_BUG_ON(sizeof(bs->backing_file) != sizeof(header.backing_file));
> +    strncpy(bs->backing_file, header.backing_file,
> +            sizeof(bs->backing_file));

Please use pstrcpy() - it always NUL-terminates the string.  strncpy()
only stores a '\0' when the source string is *shorter than* max
length, so a max length string results in bs->backing_file missing the
'\0'.

> +
> +    if (header.image_file[0] == '\0') {
> +        ret = -ENOENT;
> +        goto fail;
> +    }
> +    s->image_hd = bdrv_new("");
> +    if (path_has_protocol(header.image_file)) {

Please safely copy in header.image_file before treating it as a
NUL-terminated string.

> +        strncpy(image_filename, header.image_file, sizeof(image_filename));
> +    } else {
> +        path_combine(image_filename, sizeof(image_filename),
> +                     bs->filename, header.image_file);
> +    }
> +
> +    image_drv = bdrv_find_format("raw");
> +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
> +    if (ret < 0) {
> +        bdrv_delete(s->image_hd);
> +        goto fail;
> +    }
> +    bs->total_sectors = s->image_hd->total_sectors;
> +    s->cluster_size = ADD_COW_CLUSTER_SIZE;
> +    s->bitmap_cache = add_cow_cache_create(bs, ADD_COW_CACHE_SIZE);
> +    qemu_co_mutex_init(&s->lock);
> +    return 0;
> + fail:
> +    return ret;
> +}
> +
> +static inline bool is_bit_set(BlockDriverState *bs, int64_t bitnum)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    uint64_t offset = bitnum >> 3;
> +    uint8_t *bitmap;
> +    int ret = add_cow_cache_get(bs, s->bitmap_cache,
> +        offset & ~(ADD_COW_CLUSTER_SIZE - 1), (void **)&bitmap);

(void **) should not be necessary.

> +    if (ret < 0) {
> +        abort();
> +    }
> +
> +    return *(bitmap + (offset & (ADD_COW_CLUSTER_SIZE - 1))) & (1 << (bitnum % 8));

The cluster concept confuses me.  Normally the point of a cluster is
to reduce metadata size, so you would not index using bitnum % 8.

> +}
> +
> +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
> +        int64_t sector_num, int nb_sectors, int *num_same)
> +{
> +    int changed;
> +
> +    if (nb_sectors == 0) {
> +        *num_same = nb_sectors;
> +        return 0;
> +    }
> +
> +    changed = is_bit_set(bs, sector_num);
> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
> +            break;
> +        }
> +    }
> +
> +    return changed;
> +}
> +
> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
> +        int nb_sectors)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    uint8_t *bitmap;
> +
> +    int i, ret = 0;
> +    for (i = 0; i < nb_sectors; i++) {
> +        int ret = add_cow_cache_get(bs, s->bitmap_cache,
> +            (sector_num + i) / 8 & ~(ADD_COW_CLUSTER_SIZE - 1), (void **)&bitmap);
> +        if (ret < 0) {
> +            abort();
> +        }
> +        *(bitmap + ((sector_num + i) / 8 & (ADD_COW_CLUSTER_SIZE - 1))) |=
> +                    (1 << ((sector_num + i) % 8));
> +        add_cow_cache_entry_mark_dirty(s->bitmap_cache, bitmap);
> +
> +    }
> +    ret = add_cow_cache_flush(bs, s->bitmap_cache);

Data must be flushed to the cow file before writing metadata updates,
otherwise a crash in between could result in zero sectors instead of
backing file sectors.

> +static int add_cow_backing_read(BlockDriverState *bs, QEMUIOVector *qiov,
> +                  int64_t sector_num, int nb_sectors)

This function name is confusing because we don't actually perform a read here.

I'm also not sure why we need to handle the case where a request spans
the end of the device.  bdrv_check_byte_request, called from
bdrv_co_do_readv() should prevent such requests?

> +{
> +    int n1;
> +    if ((sector_num + nb_sectors) <= bs->total_sectors) {
> +        return nb_sectors;
> +    }
> +    if (sector_num >= bs->total_sectors) {
> +        n1 = 0;
> +    } else {
> +        n1 = bs->total_sectors - sector_num;
> +    }
> +
> +    qemu_iovec_memset_skip(qiov, 0, BDRV_SECTOR_SIZE * (nb_sectors - n1),
> +            BDRV_SECTOR_SIZE * n1);
> +    return n1;
> +}
> +
> +static coroutine_fn int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
> +                         int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int cur_nr_sectors;
> +    uint64_t bytes_done = 0;
> +    QEMUIOVector hd_qiov;
> +    int n, n1, ret = 0;
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    while (remaining_sectors != 0) {
> +        cur_nr_sectors = remaining_sectors;
> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
> +            cur_nr_sectors = n;
> +            qemu_iovec_reset(&hd_qiov);
> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);

During the read it seems safe to unlock s->lock.  We need to acquire
it again immediately afterwards, but it allows other requests to
access the cow cache in the meantime.

> +            if (ret < 0) {
> +                goto fail;
> +            }
> +        } else {
> +            cur_nr_sectors = n;
> +            if (bs->backing_hd) {
> +                n1 = add_cow_backing_read(bs->backing_hd, &hd_qiov,
> +                    sector_num, cur_nr_sectors);
> +                if (n1 > 0) {
> +                    qemu_iovec_reset(&hd_qiov);
> +                    qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                                cur_nr_sectors * BDRV_SECTOR_SIZE);
> +                    ret = bdrv_co_readv(bs->backing_hd, sector_num,
> +                                        n, &hd_qiov);
> +                    if (ret < 0) {
> +                        goto fail;
> +                    }
> +                }
> +            } else {
> +                qemu_iovec_reset(&hd_qiov);

We need to zero bytes, otherwise qiov will contain uninitialized bytes
instead of zeroes read when there is no backing file.

> +            }
> +        }
> +        remaining_sectors -= cur_nr_sectors;
> +        sector_num += cur_nr_sectors;
> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
> +                          int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = 0;
> +    QEMUIOVector hd_qiov;
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    qemu_iovec_reset(&hd_qiov);
> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);

Why use hd_qiov?  It should be possible to use qiov directly, I think.

> +    ret = bdrv_co_writev(s->image_hd,
> +                     sector_num,
> +                     remaining_sectors, &hd_qiov);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
> +{
> +    int ret = 0;
> +    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
> +    BDRVAddCowState *s = bs->opaque;
> +    int64_t old_image_sector = s->image_hd->total_sectors;
> +
> +    ret = bdrv_truncate(bs->file, sizeof(AddCowHeader) + ((image_sectors + 7) >> 3));
> +    if (ret < 0) {
> +        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
> +        return ret;
> +    }
> +    return ret;
> +}

I'm surprised that we truncate the image file...but only in the error case.

Also, do we need to resize the cow cache?

> +
> +static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = bdrv_co_flush(s->image_hd);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    qemu_co_mutex_lock(&s->lock);
> +    ret = add_cow_cache_flush(bs, s->bitmap_cache);

I suggest moving qemu_co_mutex_unlock(&s->lock) here so you don't need
to duplicate it in the success and error cases.

> +    if (ret < 0) {
> +        qemu_co_mutex_unlock(&s->lock);
> +        return ret;
> +    }
> +    qemu_co_mutex_unlock(&s->lock);
> +    return bdrv_co_flush(bs->file);
> +}
> +
> +static QEMUOptionParameter add_cow_create_options[] = {
> +    {
> +        .name = BLOCK_OPT_SIZE,
> +        .type = OPT_SIZE,
> +        .help = "Virtual disk size"
> +    },
> +    {
> +        .name = BLOCK_OPT_BACKING_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a base image"
> +    },
> +    {
> +        .name = BLOCK_OPT_IMAGE_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a image file"
> +    },
> +    {
> +        .name = BLOCK_OPT_BACKING_FMT,
> +        .type = OPT_STRING,
> +        .help = "Image format of the base image"
> +    },
> +    { NULL }
> +};
> +
> +static BlockDriver bdrv_add_cow = {
> +    .format_name                = "add-cow",
> +    .instance_size              = sizeof(BDRVAddCowState),
> +    .bdrv_probe                 = add_cow_probe,
> +    .bdrv_open                  = add_cow_open,
> +    .bdrv_close                 = add_cow_close,
> +    .bdrv_create                = add_cow_create,
> +    .bdrv_co_is_allocated       = add_cow_is_allocated,
> +
> +    .bdrv_co_readv              = add_cow_co_readv,
> +    .bdrv_co_writev             = add_cow_co_writev,
> +    .bdrv_truncate              = bdrv_add_cow_truncate,
> +
> +    .create_options             = add_cow_create_options,
> +    .bdrv_co_flush_to_disk      = add_cow_co_flush,
> +};
> +
> +static void bdrv_add_cow_init(void)
> +{
> +    bdrv_register(&bdrv_add_cow);
> +}
> +
> +block_init(bdrv_add_cow_init);
> diff --git a/block_int.h b/block_int.h
> index b460c36..8126f27 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -50,6 +50,7 @@
>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>  #define BLOCK_OPT_PREALLOC      "preallocation"
>  #define BLOCK_OPT_SUBFMT        "subformat"
> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>
>  typedef struct BdrvTrackedRequest BdrvTrackedRequest;
>
> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
> new file mode 100644
> index 0000000..db992a4
> --- /dev/null
> +++ b/docs/specs/add-cow.txt
> @@ -0,0 +1,68 @@
> +== General ==
> +
> +Raw file format does not support backing_file and copy on write feature.
> +The add-cow image format makes it possible to use backing files with raw
> +image by keeping a separate .add-cow metadata file.  Once all sectors
> +have been written to in the raw image it is safe to discard the .add-cow
> +and backing files and instead use the raw image directly.
> +
> +When using add-cow, procedures may like this:
> +(ubuntu.img is a disk image which has been installed OS.)
> +    1)  Create a raw image with the same size of ubuntu.img
> +            qemu-img create -f raw test.raw 8G
> +    2)  Create a add-cow image which will store dirty bitmap
> +            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
> +    3)  Run qemu with add-cow image
> +            qemu -drive if=virtio,file=test.add-cow
> +
> +=Specification=
> +
> +The file format looks like this:
> +
> + +---------------+--------------------------+
> + |     Header    |           Data           |
> + +---------------+--------------------------+
> +
> +All numbers in add-cow are stored in Big Endian byte order.
> +
> +== Header ==
> +
> +The Header is included in the first bytes:
> +
> +    Byte  0 -  7:       magic
> +                        add-cow magic string ("ADD_COW\xff")
> +
> +          8 -  11:      version
> +                        Version number (only valid value is 1 now)
> +
> +          12 - 1035:    backing_file
> +                        backing_file file name related to add-cow file. All
> +                        unused bytes are padded with zeros. Must not be longer
> +                        than 1023 bytes.
> +
> +         1036 - 2059:   image_file
> +                        image_file is a raw file. All unused bytes are padded
> +                        with zeros. Must not be longer than 1023 bytes.
> +
> +         2060  - 2559:   The Reserved field is used to make sure Data field starts
> +                        at the multiple of 512, not used currently. All bytes are
> +                        filled with 0.
> +
> +== Data ==
> +
> +The Data field starts at the 2560th byte, stores a bitmap related to backing_file
> +and image_file. The bitmap will track whether the sector in backing_file is dirty
> +or not.

Cluster size is not mentioned and not documented in the add-cow header
structure.

I didn't look closely at how the cache and clusters are supposed to
work but I think they don't really work.  What I would expect is:

1. The bitmap operates at ADD_COW_CLUSTER_SIZE granularity.  This
means the bits in the add-cow file actually represent
ADD_COW_CLUSTER_SIZE sectors of data allocation, not just 1 sector.

2. Code that accesses the bitmap first divides by ADD_COW_CLUSTER_SIZE
to operate on an entire cluster.  Right now the lookup code seems to
do it *during* the bitmap lookup instead of before.

Sorry, this isn't a good explanation but I've run out of time to
really understand how this works in your patch.  I suggest looking
carefully at the is_allocated lookup and the cache to see if you're
really maintaining the bitmap at cluster granularity.

Stefan
Robert Wang - March 8, 2012, 2:27 a.m.
On Wed, Mar 7, 2012 at 00:59, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Thu, Mar 1, 2012 at 2:56 AM, Dong Xu Wang <wdongxu@linux.vnet.ibm.com> wrote:
>> diff --git a/block/add-cow-cache.c b/block/add-cow-cache.c
>> new file mode 100644
>> index 0000000..6be02ff
>> --- /dev/null
>> +++ b/block/add-cow-cache.c
>> @@ -0,0 +1,171 @@
>> +/*
>> + * Cache For QEMU ADD-COW Disk Format
>> + *
>> + * Copyright IBM, Corp. 2012
>> + *
>> + * Authors:
>> + *  Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
>> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
>> + * See the COPYING.LIB file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "block_int.h"
>> +#include "qemu-common.h"
>> +#include "add-cow.h"
>
> This patch is missing add-cow.h.
>

>> diff --git a/block/add-cow.c b/block/add-cow.c
>> new file mode 100644
>> index 0000000..6897a52
>> --- /dev/null
>> +++ b/block/add-cow.c
>> @@ -0,0 +1,402 @@
>> +/*
>> + * QEMU ADD-COW Disk Format
>> + *
>> + * Copyright IBM, Corp. 2012
>> + *
>> + * Authors:
>> + *  Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
>> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
>> + * See the COPYING.LIB file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu-common.h"
>> +#include "block_int.h"
>> +#include "module.h"
>> +#include "add-cow.h"
>> +
>> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
>> +{
>> +    const AddCowHeader *header = (const void *)buf;
>
> Minor coding style comment: please cast to const AddCowHeader* instead of void.
>
>> +
>> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
>> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
>> +        return 100;
>> +    } else {
>> +        return 0;
>> +    }
>> +}
>> +
>> +static int add_cow_open(BlockDriverState *bs, int flags)
>> +{
>> +    AddCowHeader        header;
>> +    char                image_filename[ADD_COW_FILE_LEN];
>> +    BlockDriver         *image_drv = NULL;
>> +    int                 ret;
>> +    BDRVAddCowState     *s = bs->opaque;
>> +
>> +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
>> +    if (ret != sizeof(header)) {
>> +        goto fail;
>> +    }
>> +
>> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC) {
>> +        ret = -EINVAL;
>> +        goto fail;
>> +    }
>> +    if (be32_to_cpu(header.version) != ADD_COW_VERSION) {
>> +        char version[64];
>> +        snprintf(version, sizeof(version), "ADD-COW version %d", header.version);
>
> be32_to_cpu(header.version)
>
>> +        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
>> +            bs->device_name, "add-cow", version);
>> +        ret = -ENOTSUP;
>> +        goto fail;
>> +    }
>> +
>> +    QEMU_BUILD_BUG_ON(sizeof(bs->backing_file) != sizeof(header.backing_file));
>> +    strncpy(bs->backing_file, header.backing_file,
>> +            sizeof(bs->backing_file));
>
> Please use pstrcpy() - it always NUL-terminates the string.  strncpy()
> only stores a '\0' when the source string is *shorter than* max
> length, so a max length string results in bs->backing_file missing the
> '\0'.
>
>> +
>> +    if (header.image_file[0] == '\0') {
>> +        ret = -ENOENT;
>> +        goto fail;
>> +    }
>> +    s->image_hd = bdrv_new("");
>> +    if (path_has_protocol(header.image_file)) {
>
> Please safely copy in header.image_file before treating it as a
> NUL-terminated string.
>
>> +        strncpy(image_filename, header.image_file, sizeof(image_filename));
>> +    } else {
>> +        path_combine(image_filename, sizeof(image_filename),
>> +                     bs->filename, header.image_file);
>> +    }
>> +
>> +    image_drv = bdrv_find_format("raw");
>> +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
>> +    if (ret < 0) {
>> +        bdrv_delete(s->image_hd);
>> +        goto fail;
>> +    }
>> +    bs->total_sectors = s->image_hd->total_sectors;
>> +    s->cluster_size = ADD_COW_CLUSTER_SIZE;
>> +    s->bitmap_cache = add_cow_cache_create(bs, ADD_COW_CACHE_SIZE);
>> +    qemu_co_mutex_init(&s->lock);
>> +    return 0;
>> + fail:
>> +    return ret;
>> +}
>> +
>> +static inline bool is_bit_set(BlockDriverState *bs, int64_t bitnum)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    uint64_t offset = bitnum >> 3;
>> +    uint8_t *bitmap;
>> +    int ret = add_cow_cache_get(bs, s->bitmap_cache,
>> +        offset & ~(ADD_COW_CLUSTER_SIZE - 1), (void **)&bitmap);
>
> (void **) should not be necessary.
>
>> +    if (ret < 0) {
>> +        abort();
>> +    }
>> +
>> +    return *(bitmap + (offset & (ADD_COW_CLUSTER_SIZE - 1))) & (1 << (bitnum % 8));
>
> The cluster concept confuses me.  Normally the point of a cluster is
> to reduce metadata size, so you would not index using bitnum % 8.
>
>> +}
>> +
>> +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
>> +        int64_t sector_num, int nb_sectors, int *num_same)
>> +{
>> +    int changed;
>> +
>> +    if (nb_sectors == 0) {
>> +        *num_same = nb_sectors;
>> +        return 0;
>> +    }
>> +
>> +    changed = is_bit_set(bs, sector_num);
>> +    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
>> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
>> +            break;
>> +        }
>> +    }
>> +
>> +    return changed;
>> +}
>> +
>> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
>> +        int nb_sectors)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    uint8_t *bitmap;
>> +
>> +    int i, ret = 0;
>> +    for (i = 0; i < nb_sectors; i++) {
>> +        int ret = add_cow_cache_get(bs, s->bitmap_cache,
>> +            (sector_num + i) / 8 & ~(ADD_COW_CLUSTER_SIZE - 1), (void **)&bitmap);
>> +        if (ret < 0) {
>> +            abort();
>> +        }
>> +        *(bitmap + ((sector_num + i) / 8 & (ADD_COW_CLUSTER_SIZE - 1))) |=
>> +                    (1 << ((sector_num + i) % 8));
>> +        add_cow_cache_entry_mark_dirty(s->bitmap_cache, bitmap);
>> +
>> +    }
>> +    ret = add_cow_cache_flush(bs, s->bitmap_cache);
>
> Data must be flushed to the cow file before writing metadata updates,
> otherwise a crash in between could result in zero sectors instead of
> backing file sectors.
>
Sorry, what does it mean?  The correct steps shoud be:
1. flush to the image file
2. if 1 succeeds, flush to add-cow?
That means metadata updates should in image  file flushing step, not
in add_cow_co_writev?

>> +static int add_cow_backing_read(BlockDriverState *bs, QEMUIOVector *qiov,
>> +                  int64_t sector_num, int nb_sectors)
>
> This function name is confusing because we don't actually perform a read here.
>
> I'm also not sure why we need to handle the case where a request spans
> the end of the device.  bdrv_check_byte_request, called from
> bdrv_co_do_readv() should prevent such requests?
>
>> +{
>> +    int n1;
>> +    if ((sector_num + nb_sectors) <= bs->total_sectors) {
>> +        return nb_sectors;
>> +    }
>> +    if (sector_num >= bs->total_sectors) {
>> +        n1 = 0;
>> +    } else {
>> +        n1 = bs->total_sectors - sector_num;
>> +    }
>> +
>> +    qemu_iovec_memset_skip(qiov, 0, BDRV_SECTOR_SIZE * (nb_sectors - n1),
>> +            BDRV_SECTOR_SIZE * n1);
>> +    return n1;
>> +}
>> +
>> +static coroutine_fn int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
>> +                         int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int cur_nr_sectors;
>> +    uint64_t bytes_done = 0;
>> +    QEMUIOVector hd_qiov;
>> +    int n, n1, ret = 0;
>> +
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +    qemu_co_mutex_lock(&s->lock);
>> +    while (remaining_sectors != 0) {
>> +        cur_nr_sectors = remaining_sectors;
>> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
>> +            cur_nr_sectors = n;
>> +            qemu_iovec_reset(&hd_qiov);
>> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>> +            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
>
> During the read it seems safe to unlock s->lock.  We need to acquire
> it again immediately afterwards, but it allows other requests to
> access the cow cache in the meantime.
>
>> +            if (ret < 0) {
>> +                goto fail;
>> +            }
>> +        } else {
>> +            cur_nr_sectors = n;
>> +            if (bs->backing_hd) {
>> +                n1 = add_cow_backing_read(bs->backing_hd, &hd_qiov,
>> +                    sector_num, cur_nr_sectors);
>> +                if (n1 > 0) {
>> +                    qemu_iovec_reset(&hd_qiov);
>> +                    qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>> +                                cur_nr_sectors * BDRV_SECTOR_SIZE);
>> +                    ret = bdrv_co_readv(bs->backing_hd, sector_num,
>> +                                        n, &hd_qiov);
>> +                    if (ret < 0) {
>> +                        goto fail;
>> +                    }
>> +                }
>> +            } else {
>> +                qemu_iovec_reset(&hd_qiov);
>
> We need to zero bytes, otherwise qiov will contain uninitialized bytes
> instead of zeroes read when there is no backing file.
>
>> +            }
>> +        }
>> +        remaining_sectors -= cur_nr_sectors;
>> +        sector_num += cur_nr_sectors;
>> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
>> +    }
>> +fail:
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
>> +                          int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int ret = 0;
>> +    QEMUIOVector hd_qiov;
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +    qemu_co_mutex_lock(&s->lock);
>> +    qemu_iovec_reset(&hd_qiov);
>> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
>
> Why use hd_qiov?  It should be possible to use qiov directly, I think.
>
>> +    ret = bdrv_co_writev(s->image_hd,
>> +                     sector_num,
>> +                     remaining_sectors, &hd_qiov);
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +
>> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +fail:
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
>> +{
>> +    int ret = 0;
>> +    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int64_t old_image_sector = s->image_hd->total_sectors;
>> +
>> +    ret = bdrv_truncate(bs->file, sizeof(AddCowHeader) + ((image_sectors + 7) >> 3));
>> +    if (ret < 0) {
>> +        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
>> +        return ret;
>> +    }
>> +    return ret;
>> +}
>
> I'm surprised that we truncate the image file...but only in the error case.
>
> Also, do we need to resize the cow cache?
>
cow cache should be resized, because the function will be called while creating.

Kevin gave comments in v2:
"bs->file only contains metadata, so why is this correct? Shoudln't you
update the header with the new size and truncate s->image_hd?"

So I want to know should image_file have the same virtual size as the
backing_file?

>> +
>> +static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int ret = bdrv_co_flush(s->image_hd);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    qemu_co_mutex_lock(&s->lock);
>> +    ret = add_cow_cache_flush(bs, s->bitmap_cache);
>
> I suggest moving qemu_co_mutex_unlock(&s->lock) here so you don't need
> to duplicate it in the success and error cases.
Okay.
>
>> +    if (ret < 0) {
>> +        qemu_co_mutex_unlock(&s->lock);
>> +        return ret;
>> +    }
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    return bdrv_co_flush(bs->file);
>> +}
>> +
>> +static QEMUOptionParameter add_cow_create_options[] = {
>> +    {
>> +        .name = BLOCK_OPT_SIZE,
>> +        .type = OPT_SIZE,
>> +        .help = "Virtual disk size"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_BACKING_FILE,
>> +        .type = OPT_STRING,
>> +        .help = "File name of a base image"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_IMAGE_FILE,
>> +        .type = OPT_STRING,
>> +        .help = "File name of a image file"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_BACKING_FMT,
>> +        .type = OPT_STRING,
>> +        .help = "Image format of the base image"
>> +    },
>> +    { NULL }
>> +};
>> +
>> +static BlockDriver bdrv_add_cow = {
>> +    .format_name                = "add-cow",
>> +    .instance_size              = sizeof(BDRVAddCowState),
>> +    .bdrv_probe                 = add_cow_probe,
>> +    .bdrv_open                  = add_cow_open,
>> +    .bdrv_close                 = add_cow_close,
>> +    .bdrv_create                = add_cow_create,
>> +    .bdrv_co_is_allocated       = add_cow_is_allocated,
>> +
>> +    .bdrv_co_readv              = add_cow_co_readv,
>> +    .bdrv_co_writev             = add_cow_co_writev,
>> +    .bdrv_truncate              = bdrv_add_cow_truncate,
>> +
>> +    .create_options             = add_cow_create_options,
>> +    .bdrv_co_flush_to_disk      = add_cow_co_flush,
>> +};
>> +
>> +static void bdrv_add_cow_init(void)
>> +{
>> +    bdrv_register(&bdrv_add_cow);
>> +}
>> +
>> +block_init(bdrv_add_cow_init);
>> diff --git a/block_int.h b/block_int.h
>> index b460c36..8126f27 100644
>> --- a/block_int.h
>> +++ b/block_int.h
>> @@ -50,6 +50,7 @@
>>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>>  #define BLOCK_OPT_PREALLOC      "preallocation"
>>  #define BLOCK_OPT_SUBFMT        "subformat"
>> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>>
>>  typedef struct BdrvTrackedRequest BdrvTrackedRequest;
>>
>> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
>> new file mode 100644
>> index 0000000..db992a4
>> --- /dev/null
>> +++ b/docs/specs/add-cow.txt
>> @@ -0,0 +1,68 @@
>> +== General ==
>> +
>> +Raw file format does not support backing_file and copy on write feature.
>> +The add-cow image format makes it possible to use backing files with raw
>> +image by keeping a separate .add-cow metadata file.  Once all sectors
>> +have been written to in the raw image it is safe to discard the .add-cow
>> +and backing files and instead use the raw image directly.
>> +
>> +When using add-cow, procedures may like this:
>> +(ubuntu.img is a disk image which has been installed OS.)
>> +    1)  Create a raw image with the same size of ubuntu.img
>> +            qemu-img create -f raw test.raw 8G
>> +    2)  Create a add-cow image which will store dirty bitmap
>> +            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
>> +    3)  Run qemu with add-cow image
>> +            qemu -drive if=virtio,file=test.add-cow
>> +
>> +=Specification=
>> +
>> +The file format looks like this:
>> +
>> + +---------------+--------------------------+
>> + |     Header    |           Data           |
>> + +---------------+--------------------------+
>> +
>> +All numbers in add-cow are stored in Big Endian byte order.
>> +
>> +== Header ==
>> +
>> +The Header is included in the first bytes:
>> +
>> +    Byte  0 -  7:       magic
>> +                        add-cow magic string ("ADD_COW\xff")
>> +
>> +          8 -  11:      version
>> +                        Version number (only valid value is 1 now)
>> +
>> +          12 - 1035:    backing_file
>> +                        backing_file file name related to add-cow file. All
>> +                        unused bytes are padded with zeros. Must not be longer
>> +                        than 1023 bytes.
>> +
>> +         1036 - 2059:   image_file
>> +                        image_file is a raw file. All unused bytes are padded
>> +                        with zeros. Must not be longer than 1023 bytes.
>> +
>> +         2060  - 2559:   The Reserved field is used to make sure Data field starts
>> +                        at the multiple of 512, not used currently. All bytes are
>> +                        filled with 0.
>> +
>> +== Data ==
>> +
>> +The Data field starts at the 2560th byte, stores a bitmap related to backing_file
>> +and image_file. The bitmap will track whether the sector in backing_file is dirty
>> +or not.
>
> Cluster size is not mentioned and not documented in the add-cow header
> structure.
>
> I didn't look closely at how the cache and clusters are supposed to
> work but I think they don't really work.  What I would expect is:
>
> 1. The bitmap operates at ADD_COW_CLUSTER_SIZE granularity.  This
> means the bits in the add-cow file actually represent
> ADD_COW_CLUSTER_SIZE sectors of data allocation, not just 1 sector.
>
Yes, I made a mistake. It should not be called cluster.
> 2. Code that accesses the bitmap first divides by ADD_COW_CLUSTER_SIZE
> to operate on an entire cluster.  Right now the lookup code seems to
> do it *during* the bitmap lookup instead of before.
Sorry, I do not understand this comment clearly. Could you explain more?
>
> Sorry, this isn't a good explanation but I've run out of time to
> really understand how this works in your patch.  I suggest looking
> carefully at the is_allocated lookup and the cache to see if you're
> really maintaining the bitmap at cluster granularity.
>
> Stefan
>
Thanks for you reviewing.
Stefan Hajnoczi - March 8, 2012, 10:50 a.m.
On Thu, Mar 8, 2012 at 2:27 AM, Dong Xu Wang <wdongxu@linux.vnet.ibm.com> wrote:
> On Wed, Mar 7, 2012 at 00:59, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Thu, Mar 1, 2012 at 2:56 AM, Dong Xu Wang <wdongxu@linux.vnet.ibm.com> wrote:
>>> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
>>> +        int nb_sectors)
>>> +{
>>> +    BDRVAddCowState *s = bs->opaque;
>>> +    uint8_t *bitmap;
>>> +
>>> +    int i, ret = 0;
>>> +    for (i = 0; i < nb_sectors; i++) {
>>> +        int ret = add_cow_cache_get(bs, s->bitmap_cache,
>>> +            (sector_num + i) / 8 & ~(ADD_COW_CLUSTER_SIZE - 1), (void **)&bitmap);
>>> +        if (ret < 0) {
>>> +            abort();
>>> +        }
>>> +        *(bitmap + ((sector_num + i) / 8 & (ADD_COW_CLUSTER_SIZE - 1))) |=
>>> +                    (1 << ((sector_num + i) % 8));
>>> +        add_cow_cache_entry_mark_dirty(s->bitmap_cache, bitmap);
>>> +
>>> +    }
>>> +    ret = add_cow_cache_flush(bs, s->bitmap_cache);
>>
>> Data must be flushed to the cow file before writing metadata updates,
>> otherwise a crash in between could result in zero sectors instead of
>> backing file sectors.
>>
> Sorry, what does it mean?  The correct steps shoud be:
> 1. flush to the image file
> 2. if 1 succeeds, flush to add-cow?
> That means metadata updates should in image  file flushing step, not
> in add_cow_co_writev?

You only need to flush when there are metadata updates.  So when the
guest writes to a sector that is already allocated in the image file,
no metadata updates are necessary and we don't need to flush.  The
guest is expect to issue a flush operation if it really wants to
flush.

However, there is a data integrity problem in the case where metadata
updates *are* required.  When we allocate new sectors in the image
file we need to make sure that they have been populated and are safely
on disk.  Otherwise we risk data corruption if the system crashes.
Here is the scenario:

1. Guest issues a write, this will require allocating new sectors in
the image file.  Therefore we'll need to update the bitmap in the
.add-cow file.
2. Write guest data buffer to image file.
3. Write bitmap update to .add-cow file.

If there is no flush between #2 and #3 then the write operations are
unordered.  #2 could appear before #3.  Or #3 could appear before #2.

I say "appear" because the write operation will complete in order, but
we're not guaranteed to "see" the data on disk after crash due to
volatile disk write caches or the host OS page cache (when -drive
...,cache=writeback).

If we want to ensure that #2 happens before #3 then we need to flush
before #3.  This will force all layers of caching to put the data
safely onto disk.

The reason *why* this is necessary is because we must avoid putting #3
on disk before #2.  If the system crashes and #2 never makes it to
disk but #3 did, then we will have allocated sectors in the image file
which are empty - they will be full of zeroes.  That is incorrect
because the guest either should see the data from the backing file if
the write didn't complete during the crash, or it should see the data
it was writing if the write did complete during the crash.

>>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
>>> +{
>>> +    int ret = 0;
>>> +    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
>>> +    BDRVAddCowState *s = bs->opaque;
>>> +    int64_t old_image_sector = s->image_hd->total_sectors;
>>> +
>>> +    ret = bdrv_truncate(bs->file, sizeof(AddCowHeader) + ((image_sectors + 7) >> 3));
>>> +    if (ret < 0) {
>>> +        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
>>> +        return ret;
>>> +    }
>>> +    return ret;
>>> +}
>>
>> I'm surprised that we truncate the image file...but only in the error case.
>>
>> Also, do we need to resize the cow cache?
>>
> cow cache should be resized, because the function will be called while creating.
>
> Kevin gave comments in v2:
> "bs->file only contains metadata, so why is this correct? Shoudln't you
> update the header with the new size and truncate s->image_hd?"
>
> So I want to know should image_file have the same virtual size as the
> backing_file?

qcow2 and qed do not require backing file and image to have the same
size.  However, when you truncate it changes the size of the image
file.

I think it's good to have the flexibility of a smaller (or larger)
backing file.  Supporting this is useful because there are some cases,
like provisioning new VMs where it can be advantageous to take a
compact, small backing file and create a larger image from it which
might append a data partition at the end of the disk.

>> 2. Code that accesses the bitmap first divides by ADD_COW_CLUSTER_SIZE
>> to operate on an entire cluster.  Right now the lookup code seems to
>> do it *during* the bitmap lookup instead of before.
> Sorry, I do not understand this comment clearly. Could you explain more?

If you implement clusters (for example, 64 KB size) then each bit in
the bitmap covers 64 KB of data in the image file.  This is why I
expected the code to divide by cluster size before accessing the
bitmap - because we discard the information about which exact sector
we're accessing and only work at 64 KB granularity.

Stefan

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 808de6a..fa9dde0 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -34,6 +34,7 @@  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
+block-nested-y += add-cow.o add-cow-cache.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
 block-nested-y += stream.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
diff --git a/block.c b/block.c
index 52ffe14..581c092 100644
--- a/block.c
+++ b/block.c
@@ -194,7 +194,7 @@  static void bdrv_io_limits_intercept(BlockDriverState *bs,
 }
 
 /* check if the path starts with "<protocol>:" */
-static int path_has_protocol(const char *path)
+int path_has_protocol(const char *path)
 {
 #ifdef _WIN32
     if (is_windows_drive(path) ||
diff --git a/block.h b/block.h
index 48d0bf3..3d96444 100644
--- a/block.h
+++ b/block.h
@@ -310,6 +310,7 @@  char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
 
 char *get_human_readable_size(char *buf, int buf_size, int64_t size);
 int path_is_absolute(const char *path);
+int path_has_protocol(const char *path);
 void path_combine(char *dest, int dest_size,
                   const char *base_path,
                   const char *filename);
diff --git a/block/add-cow-cache.c b/block/add-cow-cache.c
new file mode 100644
index 0000000..6be02ff
--- /dev/null
+++ b/block/add-cow-cache.c
@@ -0,0 +1,171 @@ 
+/*
+ * Cache For QEMU ADD-COW Disk Format
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ *  Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "block_int.h"
+#include "qemu-common.h"
+#include "add-cow.h"
+
+AddCowCache *add_cow_cache_create(BlockDriverState *bs, int num_tables)
+{
+    BDRVAddCowState *s = bs->opaque;
+    AddCowCache *c;
+    int i;
+
+    c = g_malloc0(sizeof(*c));
+    c->size = num_tables;
+    c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
+
+    for (i = 0; i < c->size; i++) {
+        c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
+        c->entries[i].offset = -1;
+    }
+
+    return c;
+}
+
+int add_cow_cache_destroy(BlockDriverState *bs, AddCowCache *c)
+{
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        qemu_vfree(c->entries[i].table);
+    }
+
+    g_free(c->entries);
+    g_free(c);
+
+    return 0;
+}
+
+static int add_cow_cache_find_entry_to_replace(AddCowCache *c)
+{
+    int i;
+    int min_count = INT_MAX;
+    int min_index = -1;
+
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].cache_hits < min_count) {
+            min_index = i;
+            min_count = c->entries[i].cache_hits;
+        }
+
+        c->entries[i].cache_hits /= 2;
+    }
+
+    if (min_index == -1) {
+        abort();
+    }
+    return min_index;
+}
+
+static int add_cow_cache_entry_flush(BlockDriverState *bs,
+                                        AddCowCache *c, int i)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret = 0;
+
+    if (!c->entries[i].dirty || (-1 == c->entries[i].offset)) {
+        return 0;
+    }
+    ret = bdrv_flush(bs->file);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_pwrite(bs->file,
+                    sizeof(AddCowHeader) + c->entries[i].offset,
+                    c->entries[i].table,
+                    s->cluster_size);
+    if (ret < 0) {
+        return ret;
+    }
+
+    c->entries[i].dirty = false;
+
+    return 0;
+}
+
+void add_cow_cache_entry_mark_dirty(AddCowCache *c, void *table)
+{
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].table == table) {
+            goto found;
+        }
+    }
+    abort();
+
+found:
+    c->entries[i].dirty = true;
+}
+
+int add_cow_cache_flush(BlockDriverState *bs, AddCowCache *c)
+{
+    int result = 0;
+    int ret;
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        ret = add_cow_cache_entry_flush(bs, c, i);
+        if (ret < 0 && result != -ENOSPC) {
+            result = ret;
+        }
+    }
+
+    if (result == 0) {
+        ret = bdrv_flush(bs->file);
+        if (ret < 0) {
+            result = ret;
+        }
+    }
+
+    return result;
+}
+
+int add_cow_cache_get(BlockDriverState *bs, AddCowCache *c,
+    uint64_t offset, void **table)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int i;
+    int ret;
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].offset == offset) {
+            goto found;
+        }
+    }
+
+    i = add_cow_cache_find_entry_to_replace(c);
+    if (i < 0) {
+        return i;
+    }
+
+    ret = add_cow_cache_entry_flush(bs, c, i);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_pread(bs->file, sizeof(AddCowHeader) + offset,
+                        c->entries[i].table, s->cluster_size);
+    if (ret < 0) {
+        return ret;
+    }
+    c->entries[i].cache_hits = 32;
+    c->entries[i].offset = offset;
+
+found:
+    c->entries[i].cache_hits++;
+    *table = c->entries[i].table;
+    return 0;
+}
diff --git a/block/add-cow.c b/block/add-cow.c
new file mode 100644
index 0000000..6897a52
--- /dev/null
+++ b/block/add-cow.c
@@ -0,0 +1,402 @@ 
+/*
+ * QEMU ADD-COW Disk Format
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ *  Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include "add-cow.h"
+
+static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const AddCowHeader *header = (const void *)buf;
+
+    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
+        be32_to_cpu(header->version) == ADD_COW_VERSION) {
+        return 100;
+    } else {
+        return 0;
+    }
+}
+
+static int add_cow_open(BlockDriverState *bs, int flags)
+{
+    AddCowHeader        header;
+    char                image_filename[ADD_COW_FILE_LEN];
+    BlockDriver         *image_drv = NULL;
+    int                 ret;
+    BDRVAddCowState     *s = bs->opaque;
+
+    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+    if (ret != sizeof(header)) {
+        goto fail;
+    }
+
+    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC) {
+        ret = -EINVAL;
+        goto fail;
+    }
+    if (be32_to_cpu(header.version) != ADD_COW_VERSION) {
+        char version[64];
+        snprintf(version, sizeof(version), "ADD-COW version %d", header.version);
+        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+            bs->device_name, "add-cow", version);
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    QEMU_BUILD_BUG_ON(sizeof(bs->backing_file) != sizeof(header.backing_file));
+    strncpy(bs->backing_file, header.backing_file,
+            sizeof(bs->backing_file));
+
+    if (header.image_file[0] == '\0') {
+        ret = -ENOENT;
+        goto fail;
+    }
+    s->image_hd = bdrv_new("");
+    if (path_has_protocol(header.image_file)) {
+        strncpy(image_filename, header.image_file, sizeof(image_filename));
+    } else {
+        path_combine(image_filename, sizeof(image_filename),
+                     bs->filename, header.image_file);
+    }
+
+    image_drv = bdrv_find_format("raw");
+    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
+    if (ret < 0) {
+        bdrv_delete(s->image_hd);
+        goto fail;
+    }
+    bs->total_sectors = s->image_hd->total_sectors;
+    s->cluster_size = ADD_COW_CLUSTER_SIZE;
+    s->bitmap_cache = add_cow_cache_create(bs, ADD_COW_CACHE_SIZE);
+    qemu_co_mutex_init(&s->lock);
+    return 0;
+ fail:
+    return ret;
+}
+
+static inline bool is_bit_set(BlockDriverState *bs, int64_t bitnum)
+{
+    BDRVAddCowState *s = bs->opaque;
+    uint64_t offset = bitnum >> 3;
+    uint8_t *bitmap;
+    int ret = add_cow_cache_get(bs, s->bitmap_cache,
+        offset & ~(ADD_COW_CLUSTER_SIZE - 1), (void **)&bitmap);
+    if (ret < 0) {
+        abort();
+    }
+
+    return *(bitmap + (offset & (ADD_COW_CLUSTER_SIZE - 1))) & (1 << (bitnum % 8));
+}
+
+static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *num_same)
+{
+    int changed;
+
+    if (nb_sectors == 0) {
+        *num_same = nb_sectors;
+        return 0;
+    }
+
+    changed = is_bit_set(bs, sector_num);
+    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
+        if (is_bit_set(bs, sector_num + *num_same) != changed) {
+            break;
+        }
+    }
+
+    return changed;
+}
+
+static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
+        int nb_sectors)
+{
+    BDRVAddCowState *s = bs->opaque;
+    uint8_t *bitmap;
+
+    int i, ret = 0;
+    for (i = 0; i < nb_sectors; i++) {
+        int ret = add_cow_cache_get(bs, s->bitmap_cache,
+            (sector_num + i) / 8 & ~(ADD_COW_CLUSTER_SIZE - 1), (void **)&bitmap);
+        if (ret < 0) {
+            abort();
+        }
+        *(bitmap + ((sector_num + i) / 8 & (ADD_COW_CLUSTER_SIZE - 1))) |=
+                    (1 << ((sector_num + i) % 8));
+        add_cow_cache_entry_mark_dirty(s->bitmap_cache, bitmap);
+
+    }
+    ret = add_cow_cache_flush(bs, s->bitmap_cache);
+    if (ret < 0) {
+        abort();
+    }
+    return ret;
+}
+
+static void add_cow_close(BlockDriverState *bs)
+{
+    BDRVAddCowState *s = bs->opaque;
+    add_cow_cache_destroy(bs, s->bitmap_cache);
+    bdrv_delete(s->image_hd);
+}
+static int add_cow_create(const char *filename, QEMUOptionParameter *options)
+{
+    AddCowHeader header;
+    int64_t image_sectors = 0;
+    const char *backing_filename = NULL;
+    const char *image_filename = NULL;
+    int ret;
+    BlockDriverState *bs, *image_bs = NULL, *backing_bs = NULL;
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_filename = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
+            image_filename = options->value.s;
+        }
+        options++;
+    }
+
+    if (!backing_filename || !image_filename) {
+        error_report("Both backing_file and image_file should be given.");
+        return -EINVAL;
+    }
+
+    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
+            | BDRV_O_CACHE_WB);
+    if (ret < 0) {
+        return ret;
+    }
+    image_sectors = image_bs->total_sectors;
+    bdrv_delete(image_bs);
+
+    ret = bdrv_file_open(&backing_bs, backing_filename, BDRV_O_RDWR
+            | BDRV_O_CACHE_WB);
+    if (ret < 0) {
+        return ret;
+    }
+    bdrv_delete(backing_bs);
+
+    ret = bdrv_create_file(filename, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+    if (ret < 0) {
+        return ret;
+    }
+
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be64(ADD_COW_MAGIC);
+    header.version = cpu_to_be32(ADD_COW_VERSION);
+    strncpy(header.backing_file, backing_filename, sizeof(header.backing_file));
+    strncpy(header.image_file, image_filename, sizeof(header.image_file));
+
+    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
+    if (ret < 0) {
+        bdrv_delete(bs);
+        return ret;
+    }
+
+    BlockDriver *drv = bdrv_find_format("add-cow");
+    assert(drv != NULL);
+    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
+    if (ret < 0) {
+        bdrv_delete(bs);
+        return ret;
+    }
+
+    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
+    bdrv_delete(bs);
+    return ret;
+}
+
+static int add_cow_backing_read(BlockDriverState *bs, QEMUIOVector *qiov,
+                  int64_t sector_num, int nb_sectors)
+{
+    int n1;
+    if ((sector_num + nb_sectors) <= bs->total_sectors) {
+        return nb_sectors;
+    }
+    if (sector_num >= bs->total_sectors) {
+        n1 = 0;
+    } else {
+        n1 = bs->total_sectors - sector_num;
+    }
+
+    qemu_iovec_memset_skip(qiov, 0, BDRV_SECTOR_SIZE * (nb_sectors - n1),
+            BDRV_SECTOR_SIZE * n1);
+    return n1;
+}
+
+static coroutine_fn int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
+                         int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int cur_nr_sectors;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+    int n, n1, ret = 0;
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    while (remaining_sectors != 0) {
+        cur_nr_sectors = remaining_sectors;
+        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
+            cur_nr_sectors = n;
+            qemu_iovec_reset(&hd_qiov);
+            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
+            if (ret < 0) {
+                goto fail;
+            }
+        } else {
+            cur_nr_sectors = n;
+            if (bs->backing_hd) {
+                n1 = add_cow_backing_read(bs->backing_hd, &hd_qiov,
+                    sector_num, cur_nr_sectors);
+                if (n1 > 0) {
+                    qemu_iovec_reset(&hd_qiov);
+                    qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                                cur_nr_sectors * BDRV_SECTOR_SIZE);
+                    ret = bdrv_co_readv(bs->backing_hd, sector_num,
+                                        n, &hd_qiov);
+                    if (ret < 0) {
+                        goto fail;
+                    }
+                }
+            } else {
+                qemu_iovec_reset(&hd_qiov);
+            }
+        }
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
+                          int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret = 0;
+    QEMUIOVector hd_qiov;
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    qemu_iovec_reset(&hd_qiov);
+    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
+    ret = bdrv_co_writev(s->image_hd,
+                     sector_num,
+                     remaining_sectors, &hd_qiov);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
+    if (ret < 0) {
+        goto fail;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
+{
+    int ret = 0;
+    int64_t image_sectors = offset / BDRV_SECTOR_SIZE;
+    BDRVAddCowState *s = bs->opaque;
+    int64_t old_image_sector = s->image_hd->total_sectors;
+
+    ret = bdrv_truncate(bs->file, sizeof(AddCowHeader) + ((image_sectors + 7) >> 3));
+    if (ret < 0) {
+        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
+        return ret;
+    }
+    return ret;
+}
+
+static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret = bdrv_co_flush(s->image_hd);
+    if (ret < 0) {
+        return ret;
+    }
+
+    qemu_co_mutex_lock(&s->lock);
+    ret = add_cow_cache_flush(bs, s->bitmap_cache);
+    if (ret < 0) {
+        qemu_co_mutex_unlock(&s->lock);
+        return ret;
+    }
+    qemu_co_mutex_unlock(&s->lock);
+    return bdrv_co_flush(bs->file);
+}
+
+static QEMUOptionParameter add_cow_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    },
+    {
+        .name = BLOCK_OPT_IMAGE_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a image file"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FMT,
+        .type = OPT_STRING,
+        .help = "Image format of the base image"
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_add_cow = {
+    .format_name                = "add-cow",
+    .instance_size              = sizeof(BDRVAddCowState),
+    .bdrv_probe                 = add_cow_probe,
+    .bdrv_open                  = add_cow_open,
+    .bdrv_close                 = add_cow_close,
+    .bdrv_create                = add_cow_create,
+    .bdrv_co_is_allocated       = add_cow_is_allocated,
+
+    .bdrv_co_readv              = add_cow_co_readv,
+    .bdrv_co_writev             = add_cow_co_writev,
+    .bdrv_truncate              = bdrv_add_cow_truncate,
+
+    .create_options             = add_cow_create_options,
+    .bdrv_co_flush_to_disk      = add_cow_co_flush,
+};
+
+static void bdrv_add_cow_init(void)
+{
+    bdrv_register(&bdrv_add_cow);
+}
+
+block_init(bdrv_add_cow_init);
diff --git a/block_int.h b/block_int.h
index b460c36..8126f27 100644
--- a/block_int.h
+++ b/block_int.h
@@ -50,6 +50,7 @@ 
 #define BLOCK_OPT_TABLE_SIZE    "table_size"
 #define BLOCK_OPT_PREALLOC      "preallocation"
 #define BLOCK_OPT_SUBFMT        "subformat"
+#define BLOCK_OPT_IMAGE_FILE    "image_file"
 
 typedef struct BdrvTrackedRequest BdrvTrackedRequest;
 
diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
new file mode 100644
index 0000000..db992a4
--- /dev/null
+++ b/docs/specs/add-cow.txt
@@ -0,0 +1,68 @@ 
+== General ==
+
+Raw file format does not support backing_file and copy on write feature.
+The add-cow image format makes it possible to use backing files with raw
+image by keeping a separate .add-cow metadata file.  Once all sectors
+have been written to in the raw image it is safe to discard the .add-cow
+and backing files and instead use the raw image directly.
+
+When using add-cow, procedures may like this:
+(ubuntu.img is a disk image which has been installed OS.)
+    1)  Create a raw image with the same size of ubuntu.img
+            qemu-img create -f raw test.raw 8G
+    2)  Create a add-cow image which will store dirty bitmap
+            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
+    3)  Run qemu with add-cow image
+            qemu -drive if=virtio,file=test.add-cow
+
+=Specification=
+
+The file format looks like this:
+
+ +---------------+--------------------------+
+ |     Header    |           Data           |
+ +---------------+--------------------------+
+
+All numbers in add-cow are stored in Big Endian byte order.
+
+== Header ==
+
+The Header is included in the first bytes:
+
+    Byte  0 -  7:       magic
+                        add-cow magic string ("ADD_COW\xff")
+
+          8 -  11:      version
+                        Version number (only valid value is 1 now)
+
+          12 - 1035:    backing_file
+                        backing_file file name related to add-cow file. All
+                        unused bytes are padded with zeros. Must not be longer
+                        than 1023 bytes.
+
+         1036 - 2059:   image_file
+                        image_file is a raw file. All unused bytes are padded
+                        with zeros. Must not be longer than 1023 bytes.
+
+         2060  - 2559:   The Reserved field is used to make sure Data field starts
+                        at the multiple of 512, not used currently. All bytes are
+                        filled with 0.
+
+== Data ==
+
+The Data field starts at the 2560th byte, stores a bitmap related to backing_file
+and image_file. The bitmap will track whether the sector in backing_file is dirty
+or not.
+
+
+Each bit in the bitmap indicates one sector's status. So the size of bitmap is
+calculated according to virtual size of backing_file. In each byte, bit 0 to 7
+will track the 1st to 7th sector in sequence, bit orders in one byte look like:
+ +----+----+----+----+----+----+----+----+
+ | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
+ +----+----+----+----+----+----+----+----+
+
+If the bit is 0, indicates the sector has not been allocated in image_file, data
+should be loaded from backing_file while reading; if the bit is 1,  indicates the
+related sector has been dirty, should be loaded from image_file while reading.
+Writing to a sector causes the corresponding bit to be set to 1.