diff mbox

[1/3,v9] add-cow file format

Message ID 1336412058-13751-1-git-send-email-wdongxu@linux.vnet.ibm.com
State New
Headers show

Commit Message

Robert Wang May 7, 2012, 5:34 p.m. UTC
Provide a new file format: add-cow. The usage can be found in add-cow.txt of
this patch.

CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
---
 Makefile.objs          |    1 +
 block.c                |    2 +-
 block.h                |    1 +
 block/add-cow-cache.c  |  193 +++++++++++++++++++++
 block/add-cow.c        |  446 ++++++++++++++++++++++++++++++++++++++++++++++++
 block/add-cow.h        |   83 +++++++++
 block_int.h            |    1 +
 docs/specs/add-cow.txt |   68 ++++++++
 qemu-img.c             |   39 +++++
 9 files changed, 833 insertions(+), 1 deletion(-)
 create mode 100644 block/add-cow-cache.c
 create mode 100644 block/add-cow.c
 create mode 100644 block/add-cow.h
 create mode 100644 docs/specs/add-cow.txt

Comments

Stefan Hajnoczi May 29, 2012, 3:50 p.m. UTC | #1
> +    image_sectors = image_bs->total_sectors;

Please use bdrv_getlength() instead of accessing total_sectors directly.

> +    image_drv = bdrv_find_format("raw");
> +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
> +    if (ret < 0) {
> +        bdrv_delete(s->image_hd);
> +        goto fail;
> +    }
> +    bs->total_sectors = s->image_hd->total_sectors;

bdrv_getlength()

> +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
> +        int64_t sector_num, int nb_sectors, int *num_same)
> +{
> +    int changed;
> +    int64_t cluster_num;
> +
> +    if (nb_sectors == 0) {
> +        *num_same = 0;
> +        return 0;
> +    }
> +
> +    cluster_num = sector_num / SECTORS_PER_CLUSTER;
> +    changed = is_allocated(bs, sector_num);

Do we need to hold the lock before (indirectly) accessing the cache?

> +    *num_same =
> +        MIN(nb_sectors, (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num);
> +
> +    for (cluster_num = sector_num / SECTORS_PER_CLUSTER + 1;
> +        cluster_num <= (sector_num + nb_sectors - 1) / SECTORS_PER_CLUSTER;
> +        cluster_num++) {
> +        if (is_allocated(bs, cluster_num * SECTORS_PER_CLUSTER) != changed) {
> +            break;
> +        }
> +        *num_same = MIN(nb_sectors,
> +            (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num);
> +    }

I think this makes sense but it would be easier to use a loop counter
that is in sector units instead of clusters.  Then you can calculate
*num_name after the loop simply by subtracting the starting sector_num
from the final cur_sector value.  And it saves you from constantly
converting back and forth between clusters and sectors.

> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs,
> +        int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = 0, i;
> +    QEMUIOVector hd_qiov;
> +    uint8_t *table;
> +    bool changed = false;
> +
> +    qemu_co_mutex_lock(&s->lock);
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    ret = bdrv_co_writev(s->image_hd,
> +                     sector_num,
> +                     remaining_sectors, qiov);

We don't need to lock across all writes.

lock()
if write requires allocation:
    ...do allocation stuff under lock...
unlock()
write data

Internal data structure (cache, metadata, and copying unmodified
sectors) access needs to be locked during allocating writes.  The
guest's data can be written without the lock held.

This means that most writes will only lock for a short time to check
that the clusters are already allocated.  Then they will be able to
write in parallel.

If any cluster is not yet allocated then the allocation needs to
happen under lock.  This ensures that we don't copy backing file
sectors while processing another write request that touches those
sectors.

> +
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    /* copy content of unmodified sectors */
> +    if (!is_cluster_head(sector_num) && !is_allocated(bs, sector_num)) {
> +        qemu_co_mutex_unlock(&s->lock);
> +        ret = copy_sectors(bs, sector_num & ~(SECTORS_PER_CLUSTER - 1),
> +            sector_num);

As mentioned above, I think we need to lock during copy_sectors() so
that other requests cannot race with this.  (The guest's writes must
always take priority over copying unmodified sectors.)

> +        qemu_co_mutex_lock(&s->lock);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +    }
> +
> +    if (!is_cluster_tail(sector_num + remaining_sectors - 1)
> +        && !is_allocated(bs, sector_num + remaining_sectors - 1)) {
> +        qemu_co_mutex_unlock(&s->lock);
> +        ret = copy_sectors(bs, sector_num + remaining_sectors,
> +            ((sector_num + remaining_sectors) | (SECTORS_PER_CLUSTER - 1)) + 1);
> +        qemu_co_mutex_lock(&s->lock);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +    }
> +
> +    for (i = sector_num / SECTORS_PER_CLUSTER;
> +        i <= (sector_num + remaining_sectors - 1) / SECTORS_PER_CLUSTER;
> +        i++) {
> +        ret = add_cow_cache_get(bs, s->bitmap_cache,
> +            i * SECTORS_PER_CLUSTER, (void **)&table);
> +        if (ret < 0) {
> +            return 0;

return ret?

> +        }
> +        if ((table[i / 8] & (1 << (i % 8))) == 0) {
> +            table[i / 8] |= (1 << (i % 8));
> +            changed = true;
> +            add_cow_cache_entry_mark_dirty(s->bitmap_cache, table);
> +        }
> +
> +    }
> +    ret = 0;
> +fail:
> +    if (changed) {
> +        ret = add_cow_cache_flush(bs, s->bitmap_cache);
> +    }
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t size)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int sector_per_byte = SECTORS_PER_CLUSTER * 8;
> +    int ret;
> +    int64_t old_image_sector = s->image_hd->total_sectors;
> +    int64_t bitmap_size =
> +        (size / BDRV_SECTOR_SIZE + sector_per_byte - 1) / sector_per_byte;
> +
> +    ret = bdrv_truncate(bs->file,
> +        sizeof(AddCowHeader) + bitmap_size);
> +    if (ret < 0) {
> +        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);

Why truncate image_hd on failure?  We never touch the image_hd size on
success either.  I think we can just leave it alone.

> +typedef struct AddCowHeader {
> +    uint64_t        magic;
> +    uint32_t        version;
> +    char            backing_file[ADD_COW_FILE_LEN];
> +    char            image_file[ADD_COW_FILE_LEN];
> +    char            reserved[500];
> +} QEMU_PACKED AddCowHeader;

I'd be tempted to start the bitmap at 4096 bytes into the file.  On 4
KB block size disks that would be a nice alignment and it doesn't
waste much additional space (the disk images we're trying to manage
are many GB :)).

> @@ -828,6 +832,41 @@ static int img_convert(int argc, char **argv)
>     }
>
>     /* Create the new image */
> +
> +    if (0 == strcmp(out_fmt, "add-cow")) {
> +        image_drv = bdrv_find_format("raw");
> +        if (!drv) {
> +            ret = -1;
> +            goto out;
> +        }
> +        snprintf(image_filename, sizeof(image_filename),
> +            "%s"".ct.raw", out_filename);
> +        ret = bdrv_create(image_drv, image_filename, image_param);
> +        if (ret < 0) {
> +            error_report("%s: error while creating image_file: %s",
> +                     image_filename, strerror(-ret));
> +            goto out;
> +        }
> +        set_option_parameter(param, BLOCK_OPT_IMAGE_FILE, image_filename);
> +
> +        if (!out_baseimg) {
> +            backing_drv = bdrv_find_format("qcow2");
> +            if (!drv) {
> +                ret = -1;
> +                goto out;
> +            }
> +            snprintf(backing_filename, sizeof(backing_filename),
> +                "%s"".ct.qcow2", out_filename);
> +            ret = bdrv_create(backing_drv, backing_filename, image_param);
> +            if (ret < 0) {
> +                error_report("%s: error while creating backing_file: %s",
> +                         backing_filename, strerror(-ret));
> +                goto out;
> +            }
> +            set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
> +                backing_filename);
> +        }
> +    }

If this diff hunk is dropped then the user needs to manually create
the raw file before running qemu-img convert?

qemu-img convert -O add-cow seems like a very rare case.  I'm not sure
we should add special user-friend hacks for this.

I'm not sure I understand why you create a qcow2 file either.

Stefan
Robert Wang May 30, 2012, 1:50 a.m. UTC | #2
On Tue, May 29, 2012 at 11:50 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> +    image_sectors = image_bs->total_sectors;
>
> Please use bdrv_getlength() instead of accessing total_sectors directly.
>
>> +    image_drv = bdrv_find_format("raw");
>> +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
>> +    if (ret < 0) {
>> +        bdrv_delete(s->image_hd);
>> +        goto fail;
>> +    }
>> +    bs->total_sectors = s->image_hd->total_sectors;
>
> bdrv_getlength()
Okay.
>
>> +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
>> +        int64_t sector_num, int nb_sectors, int *num_same)
>> +{
>> +    int changed;
>> +    int64_t cluster_num;
>> +
>> +    if (nb_sectors == 0) {
>> +        *num_same = 0;
>> +        return 0;
>> +    }
>> +
>> +    cluster_num = sector_num / SECTORS_PER_CLUSTER;
>> +    changed = is_allocated(bs, sector_num);
>
> Do we need to hold the lock before (indirectly) accessing the cache?
>
>> +    *num_same =
>> +        MIN(nb_sectors, (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num);
>> +
>> +    for (cluster_num = sector_num / SECTORS_PER_CLUSTER + 1;
>> +        cluster_num <= (sector_num + nb_sectors - 1) / SECTORS_PER_CLUSTER;
>> +        cluster_num++) {
>> +        if (is_allocated(bs, cluster_num * SECTORS_PER_CLUSTER) != changed) {
>> +            break;
>> +        }
>> +        *num_same = MIN(nb_sectors,
>> +            (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num);
>> +    }
>
> I think this makes sense but it would be easier to use a loop counter
> that is in sector units instead of clusters.  Then you can calculate
> *num_name after the loop simply by subtracting the starting sector_num
> from the final cur_sector value.  And it saves you from constantly
> converting back and forth between clusters and sectors.
>
>> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs,
>> +        int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int ret = 0, i;
>> +    QEMUIOVector hd_qiov;
>> +    uint8_t *table;
>> +    bool changed = false;
>> +
>> +    qemu_co_mutex_lock(&s->lock);
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +    ret = bdrv_co_writev(s->image_hd,
>> +                     sector_num,
>> +                     remaining_sectors, qiov);
>
> We don't need to lock across all writes.
>
> lock()
> if write requires allocation:
>    ...do allocation stuff under lock...
> unlock()
> write data
>
> Internal data structure (cache, metadata, and copying unmodified
> sectors) access needs to be locked during allocating writes.  The
> guest's data can be written without the lock held.
>
> This means that most writes will only lock for a short time to check
> that the clusters are already allocated.  Then they will be able to
> write in parallel.
>
> If any cluster is not yet allocated then the allocation needs to
> happen under lock.  This ensures that we don't copy backing file
> sectors while processing another write request that touches those
> sectors.
>
>> +
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +    /* copy content of unmodified sectors */
>> +    if (!is_cluster_head(sector_num) && !is_allocated(bs, sector_num)) {
>> +        qemu_co_mutex_unlock(&s->lock);
>> +        ret = copy_sectors(bs, sector_num & ~(SECTORS_PER_CLUSTER - 1),
>> +            sector_num);
>
> As mentioned above, I think we need to lock during copy_sectors() so
> that other requests cannot race with this.  (The guest's writes must
> always take priority over copying unmodified sectors.)
>
>> +        qemu_co_mutex_lock(&s->lock);
>> +        if (ret < 0) {
>> +            goto fail;
>> +        }
>> +    }
>> +
>> +    if (!is_cluster_tail(sector_num + remaining_sectors - 1)
>> +        && !is_allocated(bs, sector_num + remaining_sectors - 1)) {
>> +        qemu_co_mutex_unlock(&s->lock);
>> +        ret = copy_sectors(bs, sector_num + remaining_sectors,
>> +            ((sector_num + remaining_sectors) | (SECTORS_PER_CLUSTER - 1)) + 1);
>> +        qemu_co_mutex_lock(&s->lock);
>> +        if (ret < 0) {
>> +            goto fail;
>> +        }
>> +    }
>> +
>> +    for (i = sector_num / SECTORS_PER_CLUSTER;
>> +        i <= (sector_num + remaining_sectors - 1) / SECTORS_PER_CLUSTER;
>> +        i++) {
>> +        ret = add_cow_cache_get(bs, s->bitmap_cache,
>> +            i * SECTORS_PER_CLUSTER, (void **)&table);
>> +        if (ret < 0) {
>> +            return 0;
>
> return ret?
Yes..
>
>> +        }
>> +        if ((table[i / 8] & (1 << (i % 8))) == 0) {
>> +            table[i / 8] |= (1 << (i % 8));
>> +            changed = true;
>> +            add_cow_cache_entry_mark_dirty(s->bitmap_cache, table);
>> +        }
>> +
>> +    }
>> +    ret = 0;
>> +fail:
>> +    if (changed) {
>> +        ret = add_cow_cache_flush(bs, s->bitmap_cache);
>> +    }
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t size)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int sector_per_byte = SECTORS_PER_CLUSTER * 8;
>> +    int ret;
>> +    int64_t old_image_sector = s->image_hd->total_sectors;
>> +    int64_t bitmap_size =
>> +        (size / BDRV_SECTOR_SIZE + sector_per_byte - 1) / sector_per_byte;
>> +
>> +    ret = bdrv_truncate(bs->file,
>> +        sizeof(AddCowHeader) + bitmap_size);
>> +    if (ret < 0) {
>> +        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
>
> Why truncate image_hd on failure?  We never touch the image_hd size on
> success either.  I think we can just leave it alone.

That means whether we truncate add-cow fails or not ,we should not never touch
image_hd size?

>
>> +typedef struct AddCowHeader {
>> +    uint64_t        magic;
>> +    uint32_t        version;
>> +    char            backing_file[ADD_COW_FILE_LEN];
>> +    char            image_file[ADD_COW_FILE_LEN];
>> +    char            reserved[500];
>> +} QEMU_PACKED AddCowHeader;
>
> I'd be tempted to start the bitmap at 4096 bytes into the file.  On 4
> KB block size disks that would be a nice alignment and it doesn't
> waste much additional space (the disk images we're trying to manage
> are many GB :)).

Okay.
>
>> @@ -828,6 +832,41 @@ static int img_convert(int argc, char **argv)
>>     }
>>
>>     /* Create the new image */
>> +
>> +    if (0 == strcmp(out_fmt, "add-cow")) {
>> +        image_drv = bdrv_find_format("raw");
>> +        if (!drv) {
>> +            ret = -1;
>> +            goto out;
>> +        }
>> +        snprintf(image_filename, sizeof(image_filename),
>> +            "%s"".ct.raw", out_filename);
>> +        ret = bdrv_create(image_drv, image_filename, image_param);
>> +        if (ret < 0) {
>> +            error_report("%s: error while creating image_file: %s",
>> +                     image_filename, strerror(-ret));
>> +            goto out;
>> +        }
>> +        set_option_parameter(param, BLOCK_OPT_IMAGE_FILE, image_filename);
>> +
>> +        if (!out_baseimg) {
>> +            backing_drv = bdrv_find_format("qcow2");
>> +            if (!drv) {
>> +                ret = -1;
>> +                goto out;
>> +            }
>> +            snprintf(backing_filename, sizeof(backing_filename),
>> +                "%s"".ct.qcow2", out_filename);
>> +            ret = bdrv_create(backing_drv, backing_filename, image_param);
>> +            if (ret < 0) {
>> +                error_report("%s: error while creating backing_file: %s",
>> +                         backing_filename, strerror(-ret));
>> +                goto out;
>> +            }
>> +            set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
>> +                backing_filename);
>> +        }
>> +    }
>
> If this diff hunk is dropped then the user needs to manually create
> the raw file before running qemu-img convert?
>
> qemu-img convert -O add-cow seems like a very rare case.  I'm not sure
> we should add special user-friend hacks for this.
>
> I'm not sure I understand why you create a qcow2 file either.
>
Yes, if we use "qemu-img convert -O add-cow", we should create 2 other files,
raw file and qcow2(I just picked  up qcow2, other formats is also okay) file,
as image_file and backing_file, without the two files, .add-cow file can not
work properly.

Although it will occour in very rare cases, I wish to pass all qemu-iotests
cases, so I added these code.

Do you think these are not necessary? And some qemu-iotests cases are
using "convert" operation, If I do not write previous code, these cases will
fail. Can I let these cases do not support add-cow?

Really thanks for your review, Stefan.
> Stefan
>
Anthony Liguori May 30, 2012, 2:10 a.m. UTC | #3
On 05/08/2012 01:34 AM, Dong Xu Wang wrote:
> Provide a new file format: add-cow. The usage can be found in add-cow.txt of
> this patch.
>
> CC: Kevin Wolf<kwolf@redhat.com>
> CC: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com>
> Signed-off-by: Dong Xu Wang<wdongxu@linux.vnet.ibm.com>

You should split out the spec to be the first patch.  That makes it easier for 
people to review the specification independent of the code and also makes 
subsequent code review easier.

> ---
>   Makefile.objs          |    1 +
>   block.c                |    2 +-
>   block.h                |    1 +
>   block/add-cow-cache.c  |  193 +++++++++++++++++++++
>   block/add-cow.c        |  446 ++++++++++++++++++++++++++++++++++++++++++++++++
>   block/add-cow.h        |   83 +++++++++
>   block_int.h            |    1 +
>   docs/specs/add-cow.txt |   68 ++++++++
>   qemu-img.c             |   39 +++++
>   9 files changed, 833 insertions(+), 1 deletion(-)
>   create mode 100644 block/add-cow-cache.c
>   create mode 100644 block/add-cow.c
>   create mode 100644 block/add-cow.h
>   create mode 100644 docs/specs/add-cow.txt
>
> diff --git a/Makefile.objs b/Makefile.objs
> index 70c5c79..10c5c52 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -52,6 +52,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
>   block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>   block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>   block-nested-y += qed-check.o
> +block-nested-y += add-cow.o add-cow-cache.o
>   block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>   block-nested-y += stream.o
>   block-nested-$(CONFIG_WIN32) += raw-win32.o
> diff --git a/block.c b/block.c
> index 43c794c..206860c 100644
> --- a/block.c
> +++ b/block.c
> @@ -196,7 +196,7 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
>   }
>
>   /* check if the path starts with "<protocol>:" */
> -static int path_has_protocol(const char *path)
> +int path_has_protocol(const char *path)
>   {
>   #ifdef _WIN32
>       if (is_windows_drive(path) ||
> diff --git a/block.h b/block.h
> index f163e54..f74c79e 100644
> --- a/block.h
> +++ b/block.h
> @@ -319,6 +319,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
>
>   char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>   int path_is_absolute(const char *path);
> +int path_has_protocol(const char *path);
>   void path_combine(char *dest, int dest_size,
>                     const char *base_path,
>                     const char *filename);
> diff --git a/block/add-cow-cache.c b/block/add-cow-cache.c
> new file mode 100644
> index 0000000..3ae23c1
> --- /dev/null
> +++ b/block/add-cow-cache.c
> @@ -0,0 +1,193 @@
> +/*
> + * Cache For QEMU ADD-COW Disk Format
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
>

This is not a valid copyright block.  Look at hw/virtio.c for an example.  If 
you want to do LGPL instead of GPL, it should also be 2.1 or later.

> +#include "block_int.h"
> +#include "qemu-common.h"
> +#include "add-cow.h"
> +
> +/* Based on qcow2-cache.c */

If the code is based on qcow2, then you need to preserve the qcow2 copyrights in 
this file.

> +AddCowCache *add_cow_cache_create(BlockDriverState *bs, int num_tables,
> +    bool writethrough)
> +{
> +    AddCowCache *c;
> +    int i;
> +
> +    c = g_malloc0(sizeof(*c));
> +    c->size = num_tables;
> +    c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
> +    c->writethrough = writethrough;
> +    c->entry_size = ADD_COW_CACHE_ENTRY_SIZE;
> +
> +    for (i = 0; i<  c->size; i++) {
> +        c->entries[i].table = qemu_blockalign(bs, c->entry_size);
> +        c->entries[i].offset = -1;
> +    }
> +
> +    return c;
> +}
> +
> +int add_cow_cache_destroy(BlockDriverState *bs, AddCowCache *c)
> +{
> +    int i;
> +
> +    for (i = 0; i<  c->size; i++) {
> +        qemu_vfree(c->entries[i].table);
> +    }
> +
> +    g_free(c->entries);
> +    g_free(c);
> +
> +    return 0;
> +}
> +
> +static int add_cow_cache_entry_flush(BlockDriverState *bs,
> +    AddCowCache *c,
> +    int i)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = 0;
> +
> +    if (!c->entries[i].dirty || -1 == c->entries[i].offset) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + c->entries[i].offset,
> +        c->entries[i].table,
> +        MIN(c->entry_size, s->bitmap_size - c->entries[i].offset));

This is a synchronous I/O function.  We shouldn't introduce new formats that 
have *any* synchronous I/O in them.

Honestly, I think using coroutines for this format is a mistake.  It's simple 
enough that doing a state machine would be straight forward enough.

> +    if (ret<  0) {
> +        return ret;
> +    }
> +
> +    c->entries[i].dirty = false;
> +
> +    return 0;
> +}
> +
> +int add_cow_cache_flush(BlockDriverState *bs, AddCowCache *c)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int result = 0;
> +    int ret;
> +    int i;
> +
> +    ret = bdrv_flush(s->image_hd);
> +    if (ret<  0) {
> +        return result;
> +    }
> +
> +    for (i = 0; i<  c->size; i++) {
> +        ret = add_cow_cache_entry_flush(bs, c, i);
> +        if (ret<  0&&  result != -ENOSPC) {
> +            result = ret;
> +        }
> +    }
> +
> +    if (result == 0) {
> +        ret = bdrv_flush(bs->file);
> +        if (ret<  0) {
> +            result = ret;
> +        }
> +    }
> +
> +    return result;
> +}
> +
> +static int add_cow_cache_find_entry_to_replace(AddCowCache *c)
> +{
> +    int i;
> +    int min_count = INT_MAX;
> +    int min_index = -1;
> +
> +
> +    for (i = 0; i<  c->size; i++) {
> +        if (c->entries[i].cache_hits<  min_count) {
> +            min_index = i;
> +            min_count = c->entries[i].cache_hits;
> +        }
> +
> +        c->entries[i].cache_hits /= 2;
> +    }
> +
> +    if (min_index == -1) {
> +        abort();
> +    }
> +    return min_index;
> +}
> +
> +static int add_cow_cache_do_get(BlockDriverState *bs, AddCowCache *c,
> +    uint64_t offset, void **table)
> +{
> +    int i, ret;
> +
> +    for (i = 0; i<  c->size; i++) {
> +        if (c->entries[i].offset == offset) {
> +            goto found;
> +        }
> +    }
> +
> +    i = add_cow_cache_find_entry_to_replace(c);
> +    if (i<  0) {
> +        return i;
> +    }
> +
> +    ret = add_cow_cache_entry_flush(bs, c, i);
> +    if (ret<  0) {
> +        return ret;
> +    }
> +    c->entries[i].offset = -1;
> +    ret = bdrv_pread(bs->file, sizeof(AddCowHeader) + offset,
> +        c->entries[i].table, c->entry_size);
> +    if (ret<  0) {
> +        return ret;
> +    }
> +
> +    c->entries[i].cache_hits = 32;
> +    c->entries[i].offset = offset;
> +
> +found:
> +    c->entries[i].cache_hits++;
> +    *table = c->entries[i].table;
> +
> +    return 0;
> +}
> +
> +int add_cow_cache_get(BlockDriverState *bs, AddCowCache *c, uint64_t sector_num,
> +    void **table)
> +{
> +    /* each byte in bitmap indicates 8 * SECTORS_PER_CLUSTER clusters */
> +    uint64_t offset = offset_in_bitmap(sector_num)&  (~(c->entry_size - 1));
> +    return add_cow_cache_do_get(bs, c, offset, table);
> +}
> +
> +void add_cow_cache_entry_mark_dirty(AddCowCache *c, void *table)
> +{
> +    int i;
> +
> +    for (i = 0; i<  c->size; i++) {
> +        if (c->entries[i].table == table) {
> +            goto found;
> +        }
> +    }
> +    abort();
> +
> +found:
> +    c->entries[i].dirty = true;
> +}
> +
> +bool add_cow_cache_set_writethrough(BlockDriverState *bs, AddCowCache *c,
> +    bool enable)
> +{
> +    bool old = c->writethrough;
> +
> +    if (!old&&  enable) {
> +        add_cow_cache_flush(bs, c);
> +    }
> +
> +    c->writethrough = enable;
> +    return old;
> +}

This cache confuses me a bit.  It's an LRU?  Couldn't we share some code here 
with the QED cache?

> diff --git a/block/add-cow.c b/block/add-cow.c
> new file mode 100644
> index 0000000..8bf27ab
> --- /dev/null
> +++ b/block/add-cow.c
> @@ -0,0 +1,446 @@
> +/*
> + * QEMU ADD-COW Disk Format
> + *
> + * Copyright IBM, Corp. 2012
> + *
> + * Authors:
> + *  Dong Xu Wang<wdongxu@linux.vnet.ibm.com>
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "module.h"
> +#include "add-cow.h"
> +
> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
> +{
> +    const AddCowHeader *header = (const AddCowHeader *)buf;
> +
> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC&&
> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
> +        return 100;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static int add_cow_create(const char *filename, QEMUOptionParameter *options)
> +{
> +    AddCowHeader header;
> +    int64_t image_sectors = 0;
> +    const char *backing_filename = NULL;
> +    const char *image_filename = NULL;
> +    int ret;
> +    BlockDriverState *bs, *image_bs = NULL;
> +
> +    while (options&&  options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
> +            backing_filename = options->value.s;
> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
> +            image_filename = options->value.s;
> +        }
> +        options++;
> +    }
> +
> +    if (!backing_filename || !image_filename) {
> +        error_report("Both backing_file and image_file should be given.");
> +        return -EINVAL;
> +    }
> +
> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
> +            | BDRV_O_CACHE_WB);
> +    if (ret<  0) {
> +        return ret;
> +    }
> +    image_sectors = image_bs->total_sectors;
> +    bdrv_delete(image_bs);
> +
> +    ret = bdrv_create_file(filename, NULL);
> +    if (ret<  0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
> +    if (ret<  0) {
> +        return ret;
> +    }
> +
> +    memset(&header, 0, sizeof(header));
> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
> +    header.version = cpu_to_be32(ADD_COW_VERSION);
> +    pstrcpy(header.backing_file, sizeof(header.backing_file), backing_filename);
> +    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
> +
> +    ret = bdrv_pwrite(bs, 0,&header, sizeof(header));
> +    if (ret<  0) {
> +        bdrv_delete(bs);
> +        return ret;
> +    }
> +
> +    BlockDriver *drv = bdrv_find_format("add-cow");

Please make all declarations at the top level of a function.  While this allowed 
by C99, it shouldn't been IMHO.  We are pretty consistent about this in QEMU.

> +    assert(drv != NULL);
> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
> +    if (ret<  0) {
> +        bdrv_delete(bs);
> +        return ret;
> +    }
> +
> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
> +    bdrv_delete(bs);
> +    return ret;
> +}
> +
> +static int add_cow_open(BlockDriverState *bs, int flags)
> +{
> +    AddCowHeader        header;
> +    char                image_filename[ADD_COW_FILE_LEN];
> +    BlockDriver         *image_drv = NULL;
> +    int                 ret;
> +    bool                writethrough;
> +    int                 sector_per_byte;
> +    BDRVAddCowState     *s = bs->opaque;
> +
> +    ret = bdrv_pread(bs->file, 0,&header, sizeof(header));
> +    if (ret != sizeof(header)) {
> +        goto fail;
> +    }
> +
> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC) {
> +        ret = -EINVAL;
> +        goto fail;
> +    }
> +    if (be32_to_cpu(header.version) != ADD_COW_VERSION) {
> +        char version[64];
> +        snprintf(version, sizeof(version), "ADD-COW version %d",
> +            be32_to_cpu(header.version));
> +        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
> +            bs->device_name, "add-cow", version);
> +        ret = -ENOTSUP;
> +        goto fail;
> +    }
> +
> +    QEMU_BUILD_BUG_ON(sizeof(bs->backing_file) != sizeof(header.backing_file));
> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file), header.backing_file);
> +
> +    if (header.image_file[0] == '\0') {
> +        ret = -ENOENT;
> +        goto fail;
> +    }
> +    header.image_file[ADD_COW_FILE_LEN - 1] = '\0';
> +    s->image_hd = bdrv_new("");
> +    if (path_has_protocol(header.image_file)) {
> +        pstrcpy(image_filename, sizeof(image_filename), header.image_file);
> +    } else {
> +        path_combine(image_filename, sizeof(image_filename),
> +                     bs->filename, header.image_file);
> +    }
> +
> +    image_drv = bdrv_find_format("raw");
> +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
> +    if (ret<  0) {
> +        bdrv_delete(s->image_hd);
> +        goto fail;
> +    }
> +    bs->total_sectors = s->image_hd->total_sectors;
> +    s->cluster_size = ADD_COW_CLUSTER_SIZE;
> +    sector_per_byte = SECTORS_PER_CLUSTER * 8;
> +    s->bitmap_size =
> +        (bs->total_sectors + sector_per_byte - 1) / sector_per_byte;
> +    writethrough = ((flags&  BDRV_O_CACHE_WB) == 0);
> +    s->bitmap_cache =
> +        add_cow_cache_create(bs, ADD_COW_CACHE_SIZE, writethrough);
> +
> +    qemu_co_mutex_init(&s->lock);
> +    return 0;
> +fail:
> +    if (s->bitmap_cache) {
> +        add_cow_cache_destroy(bs, s->bitmap_cache);
> +    }
> +    return ret;
> +}
> +
> +static void add_cow_close(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    add_cow_cache_destroy(bs, s->bitmap_cache);
> +    bdrv_delete(s->image_hd);
> +}
> +
> +static bool is_allocated(BlockDriverState *bs, int64_t sector_num)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int64_t cluster_num = sector_num / SECTORS_PER_CLUSTER;
> +    uint8_t *table = NULL;
> +    int ret = add_cow_cache_get(bs, s->bitmap_cache,
> +        sector_num, (void **)&table);
> +
> +    if (ret<  0) {
> +        return 0;
> +    }
> +    return table[cluster_num / 8 % ADD_COW_CACHE_ENTRY_SIZE]
> +&  (1<<  (cluster_num % 8));
> +}
> +
> +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
> +        int64_t sector_num, int nb_sectors, int *num_same)
> +{
> +    int changed;
> +    int64_t cluster_num;
> +
> +    if (nb_sectors == 0) {
> +        *num_same = 0;
> +        return 0;
> +    }
> +
> +    cluster_num = sector_num / SECTORS_PER_CLUSTER;
> +    changed = is_allocated(bs, sector_num);
> +    *num_same =
> +        MIN(nb_sectors, (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num);
> +
> +    for (cluster_num = sector_num / SECTORS_PER_CLUSTER + 1;
> +        cluster_num<= (sector_num + nb_sectors - 1) / SECTORS_PER_CLUSTER;
> +        cluster_num++) {
> +        if (is_allocated(bs, cluster_num * SECTORS_PER_CLUSTER) != changed) {
> +            break;
> +        }
> +        *num_same = MIN(nb_sectors,
> +            (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num);
> +    }
> +
> +    return changed;
> +}
> +
> +static coroutine_fn int add_cow_co_readv(BlockDriverState *bs,
> +    int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int cur_nr_sectors;
> +    uint64_t bytes_done = 0;
> +    QEMUIOVector hd_qiov;
> +    int n, ret = 0;
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    qemu_co_mutex_lock(&s->lock);
> +    while (remaining_sectors != 0) {
> +        cur_nr_sectors = remaining_sectors;
> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors,&n)) {
> +            cur_nr_sectors = n;
> +            qemu_iovec_reset(&hd_qiov);
> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +            qemu_co_mutex_unlock(&s->lock);
> +            ret = bdrv_co_readv(s->image_hd, sector_num, n,&hd_qiov);
> +            qemu_co_mutex_lock(&s->lock);
> +            if (ret<  0) {
> +                goto fail;
> +            }
> +        } else {
> +            cur_nr_sectors = n;
> +            if (bs->backing_hd) {
> +                qemu_iovec_reset(&hd_qiov);
> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
> +                qemu_co_mutex_unlock(&s->lock);
> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
> +                                    n,&hd_qiov);
> +                qemu_co_mutex_lock(&s->lock);
> +                if (ret<  0) {
> +                    goto fail;
> +                }
> +            } else {
> +                qemu_iovec_memset(&hd_qiov, 0,
> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);
> +            }
> +        }
> +        remaining_sectors -= cur_nr_sectors;
> +        sector_num += cur_nr_sectors;
> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
> +    }
> +fail:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int coroutine_fn copy_sectors(BlockDriverState *bs,
> +                                     int n_start, int n_end)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    QEMUIOVector qiov;
> +    struct iovec iov;
> +    int n, ret;
> +
> +    n = n_end - n_start;
> +    if (n<= 0) {
> +        return 0;
> +    }
> +
> +    iov.iov_len = n * BDRV_SECTOR_SIZE;
> +    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
> +
> +    qemu_iovec_init_external(&qiov,&iov, 1);
> +
> +    ret = bdrv_co_readv(bs->backing_hd, n_start, n,&qiov);
> +    if (ret<  0) {
> +        goto out;
> +    }
> +    ret = bdrv_co_writev(s->image_hd, n_start, n,&qiov);
> +    if (ret<  0) {
> +        goto out;
> +    }
> +
> +    ret = 0;
> +out:
> +    qemu_vfree(iov.iov_base);
> +    return ret;
> +}
> +
> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs,
> +        int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = 0, i;
> +    QEMUIOVector hd_qiov;
> +    uint8_t *table;
> +    bool changed = false;
> +
> +    qemu_co_mutex_lock(&s->lock);
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    ret = bdrv_co_writev(s->image_hd,
> +                     sector_num,
> +                     remaining_sectors, qiov);
> +
> +    if (ret<  0) {
> +        goto fail;
> +    }
> +    /* copy content of unmodified sectors */
> +    if (!is_cluster_head(sector_num)&&  !is_allocated(bs, sector_num)) {
> +        qemu_co_mutex_unlock(&s->lock);
> +        ret = copy_sectors(bs, sector_num&  ~(SECTORS_PER_CLUSTER - 1),
> +            sector_num);
> +        qemu_co_mutex_lock(&s->lock);
> +        if (ret<  0) {
> +            goto fail;
> +        }
> +    }
> +
> +    if (!is_cluster_tail(sector_num + remaining_sectors - 1)
> +&&  !is_allocated(bs, sector_num + remaining_sectors - 1)) {
> +        qemu_co_mutex_unlock(&s->lock);
> +        ret = copy_sectors(bs, sector_num + remaining_sectors,
> +            ((sector_num + remaining_sectors) | (SECTORS_PER_CLUSTER - 1)) + 1);
> +        qemu_co_mutex_lock(&s->lock);
> +        if (ret<  0) {
> +            goto fail;
> +        }
> +    }
> +
> +    for (i = sector_num / SECTORS_PER_CLUSTER;
> +        i<= (sector_num + remaining_sectors - 1) / SECTORS_PER_CLUSTER;
> +        i++) {
> +        ret = add_cow_cache_get(bs, s->bitmap_cache,
> +            i * SECTORS_PER_CLUSTER, (void **)&table);
> +        if (ret<  0) {
> +            return 0;
> +        }
> +        if ((table[i / 8]&  (1<<  (i % 8))) == 0) {
> +            table[i / 8] |= (1<<  (i % 8));
> +            changed = true;
> +            add_cow_cache_entry_mark_dirty(s->bitmap_cache, table);
> +        }
> +
> +    }
> +    ret = 0;
> +fail:
> +    if (changed) {
> +        ret = add_cow_cache_flush(bs, s->bitmap_cache);
> +    }
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t size)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int sector_per_byte = SECTORS_PER_CLUSTER * 8;
> +    int ret;
> +    int64_t old_image_sector = s->image_hd->total_sectors;
> +    int64_t bitmap_size =
> +        (size / BDRV_SECTOR_SIZE + sector_per_byte - 1) / sector_per_byte;
> +
> +    ret = bdrv_truncate(bs->file,
> +        sizeof(AddCowHeader) + bitmap_size);
> +    if (ret<  0) {
> +        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
> +        return ret;
> +    }
> +    return 0;
> +}
> +
> +static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret;
> +
> +    qemu_co_mutex_lock(&s->lock);
> +    ret = add_cow_cache_flush(bs, s->bitmap_cache);
> +    qemu_co_mutex_unlock(&s->lock);
> +    if (ret<  0) {
> +        return ret;
> +    }
> +
> +    return bdrv_co_flush(bs->file);
> +}
> +
> +static QEMUOptionParameter add_cow_create_options[] = {
> +    {
> +        .name = BLOCK_OPT_SIZE,
> +        .type = OPT_SIZE,
> +        .help = "Virtual disk size"
> +    },
> +    {
> +        .name = BLOCK_OPT_BACKING_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a base image"
> +    },
> +    {
> +        .name = BLOCK_OPT_IMAGE_FILE,
> +        .type = OPT_STRING,
> +        .help = "File name of a image file"
> +    },
> +    {
> +        .name = BLOCK_OPT_BACKING_FMT,
> +        .type = OPT_STRING,
> +        .help = "Image format of the base image"
> +    },
> +    { NULL }
> +};
> +
> +static BlockDriver bdrv_add_cow = {
> +    .format_name                = "add-cow",
> +    .instance_size              = sizeof(BDRVAddCowState),
> +    .bdrv_probe                 = add_cow_probe,
> +    .bdrv_open                  = add_cow_open,
> +    .bdrv_close                 = add_cow_close,
> +    .bdrv_create                = add_cow_create,
> +    .bdrv_co_readv              = add_cow_co_readv,
> +    .bdrv_co_writev             = add_cow_co_writev,
> +    .bdrv_truncate              = bdrv_add_cow_truncate,
> +    .bdrv_co_is_allocated       = add_cow_is_allocated,
> +
> +    .create_options             = add_cow_create_options,
> +    .bdrv_co_flush_to_disk      = add_cow_co_flush,
> +};
> +
> +static void bdrv_add_cow_init(void)
> +{
> +    bdrv_register(&bdrv_add_cow);
> +}
> +
> +block_init(bdrv_add_cow_init);
> diff --git a/block/add-cow.h b/block/add-cow.h
> new file mode 100644
> index 0000000..46567bb
> --- /dev/null
> +++ b/block/add-cow.h
> @@ -0,0 +1,83 @@
> +/*
> + * QEMU ADD-COW Disk Format
> + *
> + * Copyright IBM, Corp. 2012
> + *
> + * Authors:
> + *  Dong Xu Wang<wdongxu@linux.vnet.ibm.com>
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#ifndef BLOCK_ADD_COW_H
> +#define BLOCK_ADD_COW_H
> +
> +#define ADD_COW_MAGIC       (((uint64_t)'A'<<  56) | ((uint64_t)'D'<<  48) | \
> +                            ((uint64_t)'D'<<  40) | ((uint64_t)'_'<<  32) | \
> +                            ((uint64_t)'C'<<  24) | ((uint64_t)'O'<<  16) | \
> +                            ((uint64_t)'W'<<  8) | 0xFF)
> +#define ADD_COW_VERSION            1
> +#define ADD_COW_FILE_LEN           1024
> +#define ADD_COW_CACHE_SIZE         16
> +#define ADD_COW_CACHE_ENTRY_SIZE   65536
> +#define ADD_COW_CLUSTER_SIZE       65536
> +#define SECTORS_PER_CLUSTER         (ADD_COW_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
> +
> +typedef struct AddCowHeader {
> +    uint64_t        magic;
> +    uint32_t        version;
> +    char            backing_file[ADD_COW_FILE_LEN];
> +    char            image_file[ADD_COW_FILE_LEN];
> +    char            reserved[500];
> +} QEMU_PACKED AddCowHeader;
> +
> +typedef struct AddCowCachedTable {
> +    void    *table;
> +    int64_t offset;
> +    bool    dirty;
> +    int     cache_hits;
> +} AddCowCachedTable;
> +
> +typedef struct AddCowCache {
> +    AddCowCachedTable       *entries;
> +    int                     entry_size;
> +    int                     size;
> +    bool                    writethrough;
> +} AddCowCache;
> +
> +typedef struct BDRVAddCowState {
> +    BlockDriverState    *image_hd;
> +    CoMutex             lock;
> +    int                 cluster_size;
> +    AddCowCache         *bitmap_cache;
> +    uint64_t            bitmap_size;
> +} BDRVAddCowState;
> +
> +/* convert sector_num to offset in bitmap */
> +static inline int64_t offset_in_bitmap(int64_t sector_num)
> +{
> +    int64_t cluster_num = sector_num / SECTORS_PER_CLUSTER;
> +    return cluster_num / 8;
> +}
> +
> +static inline bool is_cluster_head(int64_t sector_num)
> +{
> +    return sector_num % SECTORS_PER_CLUSTER == 0;
> +}
> +
> +static inline bool is_cluster_tail(int64_t sector_num)
> +{
> +    return (sector_num + 1) % SECTORS_PER_CLUSTER == 0;
> +}
> +
> +AddCowCache *add_cow_cache_create(BlockDriverState *bs, int num_tables,
> +    bool writethrough);
> +int add_cow_cache_destroy(BlockDriverState *bs, AddCowCache *c);
> +void add_cow_cache_entry_mark_dirty(AddCowCache *c, void *table);
> +int add_cow_cache_get(BlockDriverState *bs, AddCowCache *c, uint64_t offset,
> +    void **table);
> +int add_cow_cache_flush(BlockDriverState *bs, AddCowCache *c);
> +bool add_cow_cache_set_writethrough(BlockDriverState *bs, AddCowCache *c,
> +    bool enable);
> +#endif
> diff --git a/block_int.h b/block_int.h
> index 086832a..83e75ea 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -51,6 +51,7 @@
>   #define BLOCK_OPT_PREALLOC      "preallocation"
>   #define BLOCK_OPT_SUBFMT        "subformat"
>   #define BLOCK_OPT_COMPAT_LEVEL  "compat"
> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>
>   typedef struct BdrvTrackedRequest BdrvTrackedRequest;
>
> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
> new file mode 100644
> index 0000000..28085d4
> --- /dev/null
> +++ b/docs/specs/add-cow.txt
> @@ -0,0 +1,68 @@
> +== General ==
> +
> +Raw file format does not support backing_file and copy on write feature.
> +The add-cow image format makes it possible to use backing files with raw
> +image by keeping a separate .add-cow metadata file.  Once all sectors
> +have been written to in the raw image it is safe to discard the .add-cow
> +and backing files and instead use the raw image directly.
> +
> +When using add-cow, procedures may like this:
> +(ubuntu.img is a disk image which has been installed OS.)
> +    1)  Create a raw image with the same size of ubuntu.img
> +            qemu-img create -f raw test.raw 8G
> +    2)  Create a add-cow image which will store dirty bitmap
> +            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
> +    3)  Run qemu with add-cow image
> +            qemu -drive if=virtio,file=test.add-cow
> +
> +=Specification=
> +
> +The file format looks like this:
> +
> + +---------------+--------------------------+
> + |     Header    |         COW bitmap       |
> + +---------------+--------------------------+
> +
> +All numbers in add-cow are stored in Big Endian byte order.
> +
> +== Header ==
> +
> +The Header is included in the first bytes:
> +
> +    Byte  0 -  7:       magic
> +                        add-cow magic string ("ADD_COW\xff")
> +
> +          8 -  11:      version
> +                        Version number (only valid value is 1 now)
> +
> +          12 - 1035:    backing_file
> +                        backing_file file name related to add-cow file. All
> +                        unused bytes are padded with zeros. Must not be longer
> +                        than 1023 bytes.

It's far better to shrink the size of the header and use an offset/len pointer 
to the backing file string.  Limiting backing files to 1023 is unacceptable.

> +
> +         1036 - 2059:   image_file
> +                        image_file is a raw file. All unused bytes are padded
> +                        with zeros. Must not be longer than 1023 bytes.

Likewise, you should use a combination of offset/len.

I think you should have a feature bitmap here too along with an offset to the 
bitmap.

Regards,

Anthony Liguori


> +
> +         2060  - 2559:   The Reserved field is used to make sure Data field starts
> +                        at the multiple of 512, not used currently. All bytes are
> +                        filled with 0.
> +
> +== COW bitmap ==
> +
> +The "COW bitmap" field starts at the 2560th byte, stores a bitmap related to
> +backing_file and image_file. The bitmap will track whether the sector in
> +backing_file is dirty or not.
> +
> +Each bit in the bitmap indicates one cluster's status. One cluster includes 128
> +sectors, then each bit indicates 512 * 128 = 64k bytes, So the size of bitmap is
> +calculated according to virtual size of image_file. In each byte, bit 0 to 7
> +will track the 1st to 7th cluster in sequence, bit orders in one byte look like:
> + +----+----+----+----+----+----+----+----+
> + | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
> + +----+----+----+----+----+----+----+----+
> +
> +If the bit is 0, indicates the sector has not been allocated in image_file, data
> +should be loaded from backing_file while reading; if the bit is 1,  indicates the
> +related sector has been dirty, should be loaded from image_file while reading.
> +Writing to a sector causes the corresponding bit to be set to 1.
> diff --git a/qemu-img.c b/qemu-img.c
> index 0ae543c..29865b5 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c
> @@ -628,7 +628,9 @@ static int img_convert(int argc, char **argv)
>       int c, ret = 0, n, n1, bs_n, bs_i, compress, cluster_size, cluster_sectors;
>       int progress = 0, flags;
>       const char *fmt, *out_fmt, *cache, *out_baseimg, *out_filename;
> +    char image_filename[1024], backing_filename[1024];
>       BlockDriver *drv, *proto_drv;
> +    BlockDriver *image_drv, *backing_drv;
>       BlockDriverState **bs = NULL, *out_bs = NULL;
>       int64_t total_sectors, nb_sectors, sector_num, bs_offset;
>       uint64_t bs_sectors;
> @@ -637,6 +639,7 @@ static int img_convert(int argc, char **argv)
>       BlockDriverInfo bdi;
>       QEMUOptionParameter *param = NULL, *create_options = NULL;
>       QEMUOptionParameter *out_baseimg_param;
> +    QEMUOptionParameter *image_param = NULL;
>       char *options = NULL;
>       const char *snapshot_name = NULL;
>       float local_progress;
> @@ -797,6 +800,7 @@ static int img_convert(int argc, char **argv)
>           out_baseimg = out_baseimg_param->value.s;
>       }
>
> +    image_param = get_option_parameter(param, BLOCK_OPT_SIZE);
>       /* Check if compression is supported */
>       if (compress) {
>           QEMUOptionParameter *encryption =
> @@ -828,6 +832,41 @@ static int img_convert(int argc, char **argv)
>       }
>
>       /* Create the new image */
> +
> +    if (0 == strcmp(out_fmt, "add-cow")) {
> +        image_drv = bdrv_find_format("raw");
> +        if (!drv) {
> +            ret = -1;
> +            goto out;
> +        }
> +        snprintf(image_filename, sizeof(image_filename),
> +            "%s"".ct.raw", out_filename);
> +        ret = bdrv_create(image_drv, image_filename, image_param);
> +        if (ret<  0) {
> +            error_report("%s: error while creating image_file: %s",
> +                     image_filename, strerror(-ret));
> +            goto out;
> +        }
> +        set_option_parameter(param, BLOCK_OPT_IMAGE_FILE, image_filename);
> +
> +        if (!out_baseimg) {
> +            backing_drv = bdrv_find_format("qcow2");
> +            if (!drv) {
> +                ret = -1;
> +                goto out;
> +            }
> +            snprintf(backing_filename, sizeof(backing_filename),
> +                "%s"".ct.qcow2", out_filename);
> +            ret = bdrv_create(backing_drv, backing_filename, image_param);
> +            if (ret<  0) {
> +                error_report("%s: error while creating backing_file: %s",
> +                         backing_filename, strerror(-ret));
> +                goto out;
> +            }
> +            set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
> +                backing_filename);
> +        }
> +    }
>       ret = bdrv_create(drv, out_filename, param);
>       if (ret<  0) {
>           if (ret == -ENOTSUP) {
Kevin Wolf May 30, 2012, 7:33 a.m. UTC | #4
Am 30.05.2012 04:10, schrieb Anthony Liguori:
> On 05/08/2012 01:34 AM, Dong Xu Wang wrote:
>> Provide a new file format: add-cow. The usage can be found in add-cow.txt of
>> this patch.
>>
>> CC: Kevin Wolf<kwolf@redhat.com>
>> CC: Stefan Hajnoczi<stefanha@linux.vnet.ibm.com>
>> Signed-off-by: Dong Xu Wang<wdongxu@linux.vnet.ibm.com>
> 
> You should split out the spec to be the first patch.  That makes it easier for 
> people to review the specification independent of the code and also makes 
> subsequent code review easier.

Yes, please.

>> ---
>>   Makefile.objs          |    1 +
>>   block.c                |    2 +-
>>   block.h                |    1 +
>>   block/add-cow-cache.c  |  193 +++++++++++++++++++++
>>   block/add-cow.c        |  446 ++++++++++++++++++++++++++++++++++++++++++++++++
>>   block/add-cow.h        |   83 +++++++++
>>   block_int.h            |    1 +
>>   docs/specs/add-cow.txt |   68 ++++++++
>>   qemu-img.c             |   39 +++++
>>   9 files changed, 833 insertions(+), 1 deletion(-)
>>   create mode 100644 block/add-cow-cache.c
>>   create mode 100644 block/add-cow.c
>>   create mode 100644 block/add-cow.h
>>   create mode 100644 docs/specs/add-cow.txt
>>
>> diff --git a/Makefile.objs b/Makefile.objs
>> index 70c5c79..10c5c52 100644
>> --- a/Makefile.objs
>> +++ b/Makefile.objs
>> @@ -52,6 +52,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
>>   block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>>   block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>   block-nested-y += qed-check.o
>> +block-nested-y += add-cow.o add-cow-cache.o
>>   block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>>   block-nested-y += stream.o
>>   block-nested-$(CONFIG_WIN32) += raw-win32.o
>> diff --git a/block.c b/block.c
>> index 43c794c..206860c 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -196,7 +196,7 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
>>   }
>>
>>   /* check if the path starts with "<protocol>:" */
>> -static int path_has_protocol(const char *path)
>> +int path_has_protocol(const char *path)
>>   {
>>   #ifdef _WIN32
>>       if (is_windows_drive(path) ||
>> diff --git a/block.h b/block.h
>> index f163e54..f74c79e 100644
>> --- a/block.h
>> +++ b/block.h
>> @@ -319,6 +319,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
>>
>>   char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>>   int path_is_absolute(const char *path);
>> +int path_has_protocol(const char *path);
>>   void path_combine(char *dest, int dest_size,
>>                     const char *base_path,
>>                     const char *filename);
>> diff --git a/block/add-cow-cache.c b/block/add-cow-cache.c
>> new file mode 100644
>> index 0000000..3ae23c1
>> --- /dev/null
>> +++ b/block/add-cow-cache.c
>> @@ -0,0 +1,193 @@
>> +/*
>> + * Cache For QEMU ADD-COW Disk Format
>> + *
>> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
>> + * See the COPYING.LIB file in the top-level directory.
>> + *
>> + */
>>
> 
> This is not a valid copyright block.  Look at hw/virtio.c for an example.  If 
> you want to do LGPL instead of GPL, it should also be 2.1 or later.

You mean because the name of the copyright holder is missing? Would be
nice to include indeed, but I don't think it makes a legal difference.
And the vast majority of source files doesn't have the names of all
copyright holders there anyway.

>> +#include "block_int.h"
>> +#include "qemu-common.h"
>> +#include "add-cow.h"
>> +
>> +/* Based on qcow2-cache.c */
> 
> If the code is based on qcow2, then you need to preserve the qcow2 copyrights in 
> this file.

Or ideally we would generalise qcow2-cache.c so that it can be used by
both. Not sure how easy it would be, though.

>> +AddCowCache *add_cow_cache_create(BlockDriverState *bs, int num_tables,
>> +    bool writethrough)
>> +{
>> +    AddCowCache *c;
>> +    int i;
>> +
>> +    c = g_malloc0(sizeof(*c));
>> +    c->size = num_tables;
>> +    c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
>> +    c->writethrough = writethrough;
>> +    c->entry_size = ADD_COW_CACHE_ENTRY_SIZE;
>> +
>> +    for (i = 0; i<  c->size; i++) {
>> +        c->entries[i].table = qemu_blockalign(bs, c->entry_size);
>> +        c->entries[i].offset = -1;
>> +    }
>> +
>> +    return c;
>> +}
>> +
>> +int add_cow_cache_destroy(BlockDriverState *bs, AddCowCache *c)
>> +{
>> +    int i;
>> +
>> +    for (i = 0; i<  c->size; i++) {
>> +        qemu_vfree(c->entries[i].table);
>> +    }
>> +
>> +    g_free(c->entries);
>> +    g_free(c);
>> +
>> +    return 0;
>> +}
>> +
>> +static int add_cow_cache_entry_flush(BlockDriverState *bs,
>> +    AddCowCache *c,
>> +    int i)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int ret = 0;
>> +
>> +    if (!c->entries[i].dirty || -1 == c->entries[i].offset) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + c->entries[i].offset,
>> +        c->entries[i].table,
>> +        MIN(c->entry_size, s->bitmap_size - c->entries[i].offset));
> 
> This is a synchronous I/O function.  We shouldn't introduce new formats that 
> have *any* synchronous I/O in them.

No, this is fine. We're inside a coroutine, this doesn't block anything.

> Honestly, I think using coroutines for this format is a mistake.  It's simple 
> enough that doing a state machine would be straight forward enough.

Introducing new state machines would be backwards. Eventually we want to
have a threaded block layer and state machines just don't fit there.
Coroutine based code is much closer.

Kevin
Stefan Hajnoczi May 30, 2012, 8:20 a.m. UTC | #5
On Wed, May 30, 2012 at 2:50 AM, Dong Xu Wang
<wdongxu@linux.vnet.ibm.com> wrote:
> On Tue, May 29, 2012 at 11:50 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:

I thought a bit more about locking.  Because the metadata is simple
not much locking is necessary except when fetching new bitmap clusters
from the image file into the cache and when populating untouched
sectors during data cluster allocation.  Those are the two cases where
parallel requests could put the block driver or image file into a bad
state if allowed to run without any locking.

Another way of describing the consequences of parallelism:
1. Coroutines must not duplicate the same add-cow bitmap cluster into
the cache if they run at the same time.
2. Coroutines must not hold bitmap tables across blocking operations
since the cache entry has no reference count and might be evicted from
the cache.
3. Coroutines must not allocate the same data cluster simultaneously
because untouched head/tail sectors must never race with guest writes.

>>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t size)
>>> +{
>>> +    BDRVAddCowState *s = bs->opaque;
>>> +    int sector_per_byte = SECTORS_PER_CLUSTER * 8;
>>> +    int ret;
>>> +    int64_t old_image_sector = s->image_hd->total_sectors;
>>> +    int64_t bitmap_size =
>>> +        (size / BDRV_SECTOR_SIZE + sector_per_byte - 1) / sector_per_byte;
>>> +
>>> +    ret = bdrv_truncate(bs->file,
>>> +        sizeof(AddCowHeader) + bitmap_size);
>>> +    if (ret < 0) {
>>> +        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
>>
>> Why truncate image_hd on failure?  We never touch the image_hd size on
>> success either.  I think we can just leave it alone.
>
> That means whether we truncate add-cow fails or not ,we should not never touch
> image_hd size?

I thought about this more and I think we should truncate image_hd in
the success case only.  In order to resize the image we need to resize
the cow bitmap and then resize image_hd.  If resizing the add-cow file
failed, then we haven't changed the cow bitmap and we don't need to
truncate image_hd.  Do you agree with this or have I missed something?

>>> @@ -828,6 +832,41 @@ static int img_convert(int argc, char **argv)
>>>     }
>>>
>>>     /* Create the new image */
>>> +
>>> +    if (0 == strcmp(out_fmt, "add-cow")) {
>>> +        image_drv = bdrv_find_format("raw");
>>> +        if (!drv) {
>>> +            ret = -1;
>>> +            goto out;
>>> +        }
>>> +        snprintf(image_filename, sizeof(image_filename),
>>> +            "%s"".ct.raw", out_filename);
>>> +        ret = bdrv_create(image_drv, image_filename, image_param);
>>> +        if (ret < 0) {
>>> +            error_report("%s: error while creating image_file: %s",
>>> +                     image_filename, strerror(-ret));
>>> +            goto out;
>>> +        }
>>> +        set_option_parameter(param, BLOCK_OPT_IMAGE_FILE, image_filename);
>>> +
>>> +        if (!out_baseimg) {
>>> +            backing_drv = bdrv_find_format("qcow2");
>>> +            if (!drv) {
>>> +                ret = -1;
>>> +                goto out;
>>> +            }
>>> +            snprintf(backing_filename, sizeof(backing_filename),
>>> +                "%s"".ct.qcow2", out_filename);
>>> +            ret = bdrv_create(backing_drv, backing_filename, image_param);
>>> +            if (ret < 0) {
>>> +                error_report("%s: error while creating backing_file: %s",
>>> +                         backing_filename, strerror(-ret));
>>> +                goto out;
>>> +            }
>>> +            set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
>>> +                backing_filename);
>>> +        }
>>> +    }
>>
>> If this diff hunk is dropped then the user needs to manually create
>> the raw file before running qemu-img convert?
>>
>> qemu-img convert -O add-cow seems like a very rare case.  I'm not sure
>> we should add special user-friend hacks for this.
>>
>> I'm not sure I understand why you create a qcow2 file either.
>>
> Yes, if we use "qemu-img convert -O add-cow", we should create 2 other files,
> raw file and qcow2(I just picked  up qcow2, other formats is also okay) file,
> as image_file and backing_file, without the two files, .add-cow file can not
> work properly.
>
> Although it will occour in very rare cases, I wish to pass all qemu-iotests
> cases, so I added these code.
>
> Do you think these are not necessary? And some qemu-iotests cases are
> using "convert" operation, If I do not write previous code, these cases will
> fail. Can I let these cases do not support add-cow?

If a test uses qemu-img convert then it's probably not that
interesting for add-cow.  Converting is not a useful operation because
add-cow is an "add-on" block driver that adds a feature on top of raw,
rather than a format like vmdk or qcow2 which is used to share disk
images.  I see why you did this to make qemu-iotests work, but
personally I would drop this special case code and skip those tests.

Stefan
Robert Wang June 4, 2012, 8:15 a.m. UTC | #6
Okay, thanks all of your comments, if no other comments, I will write
next version.


On Wed, May 30, 2012 at 4:20 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Wed, May 30, 2012 at 2:50 AM, Dong Xu Wang
> <wdongxu@linux.vnet.ibm.com> wrote:
>> On Tue, May 29, 2012 at 11:50 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>
> I thought a bit more about locking.  Because the metadata is simple
> not much locking is necessary except when fetching new bitmap clusters
> from the image file into the cache and when populating untouched
> sectors during data cluster allocation.  Those are the two cases where
> parallel requests could put the block driver or image file into a bad
> state if allowed to run without any locking.
>
> Another way of describing the consequences of parallelism:
> 1. Coroutines must not duplicate the same add-cow bitmap cluster into
> the cache if they run at the same time.
> 2. Coroutines must not hold bitmap tables across blocking operations
> since the cache entry has no reference count and might be evicted from
> the cache.
> 3. Coroutines must not allocate the same data cluster simultaneously
> because untouched head/tail sectors must never race with guest writes.
>
>>>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t size)
>>>> +{
>>>> +    BDRVAddCowState *s = bs->opaque;
>>>> +    int sector_per_byte = SECTORS_PER_CLUSTER * 8;
>>>> +    int ret;
>>>> +    int64_t old_image_sector = s->image_hd->total_sectors;
>>>> +    int64_t bitmap_size =
>>>> +        (size / BDRV_SECTOR_SIZE + sector_per_byte - 1) / sector_per_byte;
>>>> +
>>>> +    ret = bdrv_truncate(bs->file,
>>>> +        sizeof(AddCowHeader) + bitmap_size);
>>>> +    if (ret < 0) {
>>>> +        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
>>>
>>> Why truncate image_hd on failure?  We never touch the image_hd size on
>>> success either.  I think we can just leave it alone.
>>
>> That means whether we truncate add-cow fails or not ,we should not never touch
>> image_hd size?
>
> I thought about this more and I think we should truncate image_hd in
> the success case only.  In order to resize the image we need to resize
> the cow bitmap and then resize image_hd.  If resizing the add-cow file
> failed, then we haven't changed the cow bitmap and we don't need to
> truncate image_hd.  Do you agree with this or have I missed something?
>
>>>> @@ -828,6 +832,41 @@ static int img_convert(int argc, char **argv)
>>>>     }
>>>>
>>>>     /* Create the new image */
>>>> +
>>>> +    if (0 == strcmp(out_fmt, "add-cow")) {
>>>> +        image_drv = bdrv_find_format("raw");
>>>> +        if (!drv) {
>>>> +            ret = -1;
>>>> +            goto out;
>>>> +        }
>>>> +        snprintf(image_filename, sizeof(image_filename),
>>>> +            "%s"".ct.raw", out_filename);
>>>> +        ret = bdrv_create(image_drv, image_filename, image_param);
>>>> +        if (ret < 0) {
>>>> +            error_report("%s: error while creating image_file: %s",
>>>> +                     image_filename, strerror(-ret));
>>>> +            goto out;
>>>> +        }
>>>> +        set_option_parameter(param, BLOCK_OPT_IMAGE_FILE, image_filename);
>>>> +
>>>> +        if (!out_baseimg) {
>>>> +            backing_drv = bdrv_find_format("qcow2");
>>>> +            if (!drv) {
>>>> +                ret = -1;
>>>> +                goto out;
>>>> +            }
>>>> +            snprintf(backing_filename, sizeof(backing_filename),
>>>> +                "%s"".ct.qcow2", out_filename);
>>>> +            ret = bdrv_create(backing_drv, backing_filename, image_param);
>>>> +            if (ret < 0) {
>>>> +                error_report("%s: error while creating backing_file: %s",
>>>> +                         backing_filename, strerror(-ret));
>>>> +                goto out;
>>>> +            }
>>>> +            set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
>>>> +                backing_filename);
>>>> +        }
>>>> +    }
>>>
>>> If this diff hunk is dropped then the user needs to manually create
>>> the raw file before running qemu-img convert?
>>>
>>> qemu-img convert -O add-cow seems like a very rare case.  I'm not sure
>>> we should add special user-friend hacks for this.
>>>
>>> I'm not sure I understand why you create a qcow2 file either.
>>>
>> Yes, if we use "qemu-img convert -O add-cow", we should create 2 other files,
>> raw file and qcow2(I just picked  up qcow2, other formats is also okay) file,
>> as image_file and backing_file, without the two files, .add-cow file can not
>> work properly.
>>
>> Although it will occour in very rare cases, I wish to pass all qemu-iotests
>> cases, so I added these code.
>>
>> Do you think these are not necessary? And some qemu-iotests cases are
>> using "convert" operation, If I do not write previous code, these cases will
>> fail. Can I let these cases do not support add-cow?
>
> If a test uses qemu-img convert then it's probably not that
> interesting for add-cow.  Converting is not a useful operation because
> add-cow is an "add-on" block driver that adds a feature on top of raw,
> rather than a format like vmdk or qcow2 which is used to share disk
> images.  I see why you did this to make qemu-iotests work, but
> personally I would drop this special case code and skip those tests.
>
> Stefan
>
diff mbox

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 70c5c79..10c5c52 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -52,6 +52,7 @@  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
+block-nested-y += add-cow.o add-cow-cache.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
 block-nested-y += stream.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
diff --git a/block.c b/block.c
index 43c794c..206860c 100644
--- a/block.c
+++ b/block.c
@@ -196,7 +196,7 @@  static void bdrv_io_limits_intercept(BlockDriverState *bs,
 }
 
 /* check if the path starts with "<protocol>:" */
-static int path_has_protocol(const char *path)
+int path_has_protocol(const char *path)
 {
 #ifdef _WIN32
     if (is_windows_drive(path) ||
diff --git a/block.h b/block.h
index f163e54..f74c79e 100644
--- a/block.h
+++ b/block.h
@@ -319,6 +319,7 @@  char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
 
 char *get_human_readable_size(char *buf, int buf_size, int64_t size);
 int path_is_absolute(const char *path);
+int path_has_protocol(const char *path);
 void path_combine(char *dest, int dest_size,
                   const char *base_path,
                   const char *filename);
diff --git a/block/add-cow-cache.c b/block/add-cow-cache.c
new file mode 100644
index 0000000..3ae23c1
--- /dev/null
+++ b/block/add-cow-cache.c
@@ -0,0 +1,193 @@ 
+/*
+ * Cache For QEMU ADD-COW Disk Format
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "block_int.h"
+#include "qemu-common.h"
+#include "add-cow.h"
+
+/* Based on qcow2-cache.c */
+AddCowCache *add_cow_cache_create(BlockDriverState *bs, int num_tables,
+    bool writethrough)
+{
+    AddCowCache *c;
+    int i;
+
+    c = g_malloc0(sizeof(*c));
+    c->size = num_tables;
+    c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
+    c->writethrough = writethrough;
+    c->entry_size = ADD_COW_CACHE_ENTRY_SIZE;
+
+    for (i = 0; i < c->size; i++) {
+        c->entries[i].table = qemu_blockalign(bs, c->entry_size);
+        c->entries[i].offset = -1;
+    }
+
+    return c;
+}
+
+int add_cow_cache_destroy(BlockDriverState *bs, AddCowCache *c)
+{
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        qemu_vfree(c->entries[i].table);
+    }
+
+    g_free(c->entries);
+    g_free(c);
+
+    return 0;
+}
+
+static int add_cow_cache_entry_flush(BlockDriverState *bs,
+    AddCowCache *c,
+    int i)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret = 0;
+
+    if (!c->entries[i].dirty || -1 == c->entries[i].offset) {
+        return ret;
+    }
+
+    ret = bdrv_pwrite(bs->file, sizeof(AddCowHeader) + c->entries[i].offset,
+        c->entries[i].table,
+        MIN(c->entry_size, s->bitmap_size - c->entries[i].offset));
+    if (ret < 0) {
+        return ret;
+    }
+
+    c->entries[i].dirty = false;
+
+    return 0;
+}
+
+int add_cow_cache_flush(BlockDriverState *bs, AddCowCache *c)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int result = 0;
+    int ret;
+    int i;
+
+    ret = bdrv_flush(s->image_hd);
+    if (ret < 0) {
+        return result;
+    }
+
+    for (i = 0; i < c->size; i++) {
+        ret = add_cow_cache_entry_flush(bs, c, i);
+        if (ret < 0 && result != -ENOSPC) {
+            result = ret;
+        }
+    }
+
+    if (result == 0) {
+        ret = bdrv_flush(bs->file);
+        if (ret < 0) {
+            result = ret;
+        }
+    }
+
+    return result;
+}
+
+static int add_cow_cache_find_entry_to_replace(AddCowCache *c)
+{
+    int i;
+    int min_count = INT_MAX;
+    int min_index = -1;
+
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].cache_hits < min_count) {
+            min_index = i;
+            min_count = c->entries[i].cache_hits;
+        }
+
+        c->entries[i].cache_hits /= 2;
+    }
+
+    if (min_index == -1) {
+        abort();
+    }
+    return min_index;
+}
+
+static int add_cow_cache_do_get(BlockDriverState *bs, AddCowCache *c,
+    uint64_t offset, void **table)
+{
+    int i, ret;
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].offset == offset) {
+            goto found;
+        }
+    }
+
+    i = add_cow_cache_find_entry_to_replace(c);
+    if (i < 0) {
+        return i;
+    }
+
+    ret = add_cow_cache_entry_flush(bs, c, i);
+    if (ret < 0) {
+        return ret;
+    }
+    c->entries[i].offset = -1;
+    ret = bdrv_pread(bs->file, sizeof(AddCowHeader) + offset,
+        c->entries[i].table, c->entry_size);
+    if (ret < 0) {
+        return ret;
+    }
+
+    c->entries[i].cache_hits = 32;
+    c->entries[i].offset = offset;
+
+found:
+    c->entries[i].cache_hits++;
+    *table = c->entries[i].table;
+
+    return 0;
+}
+
+int add_cow_cache_get(BlockDriverState *bs, AddCowCache *c, uint64_t sector_num,
+    void **table)
+{
+    /* each byte in bitmap indicates 8 * SECTORS_PER_CLUSTER clusters */
+    uint64_t offset = offset_in_bitmap(sector_num) & (~(c->entry_size - 1));
+    return add_cow_cache_do_get(bs, c, offset, table);
+}
+
+void add_cow_cache_entry_mark_dirty(AddCowCache *c, void *table)
+{
+    int i;
+
+    for (i = 0; i < c->size; i++) {
+        if (c->entries[i].table == table) {
+            goto found;
+        }
+    }
+    abort();
+
+found:
+    c->entries[i].dirty = true;
+}
+
+bool add_cow_cache_set_writethrough(BlockDriverState *bs, AddCowCache *c,
+    bool enable)
+{
+    bool old = c->writethrough;
+
+    if (!old && enable) {
+        add_cow_cache_flush(bs, c);
+    }
+
+    c->writethrough = enable;
+    return old;
+}
diff --git a/block/add-cow.c b/block/add-cow.c
new file mode 100644
index 0000000..8bf27ab
--- /dev/null
+++ b/block/add-cow.c
@@ -0,0 +1,446 @@ 
+/*
+ * QEMU ADD-COW Disk Format
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ *  Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include "add-cow.h"
+
+static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const AddCowHeader *header = (const AddCowHeader *)buf;
+
+    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
+        be32_to_cpu(header->version) == ADD_COW_VERSION) {
+        return 100;
+    } else {
+        return 0;
+    }
+}
+
+static int add_cow_create(const char *filename, QEMUOptionParameter *options)
+{
+    AddCowHeader header;
+    int64_t image_sectors = 0;
+    const char *backing_filename = NULL;
+    const char *image_filename = NULL;
+    int ret;
+    BlockDriverState *bs, *image_bs = NULL;
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_filename = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
+            image_filename = options->value.s;
+        }
+        options++;
+    }
+
+    if (!backing_filename || !image_filename) {
+        error_report("Both backing_file and image_file should be given.");
+        return -EINVAL;
+    }
+
+    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
+            | BDRV_O_CACHE_WB);
+    if (ret < 0) {
+        return ret;
+    }
+    image_sectors = image_bs->total_sectors;
+    bdrv_delete(image_bs);
+
+    ret = bdrv_create_file(filename, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+    if (ret < 0) {
+        return ret;
+    }
+
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be64(ADD_COW_MAGIC);
+    header.version = cpu_to_be32(ADD_COW_VERSION);
+    pstrcpy(header.backing_file, sizeof(header.backing_file), backing_filename);
+    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
+
+    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
+    if (ret < 0) {
+        bdrv_delete(bs);
+        return ret;
+    }
+
+    BlockDriver *drv = bdrv_find_format("add-cow");
+    assert(drv != NULL);
+    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
+    if (ret < 0) {
+        bdrv_delete(bs);
+        return ret;
+    }
+
+    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
+    bdrv_delete(bs);
+    return ret;
+}
+
+static int add_cow_open(BlockDriverState *bs, int flags)
+{
+    AddCowHeader        header;
+    char                image_filename[ADD_COW_FILE_LEN];
+    BlockDriver         *image_drv = NULL;
+    int                 ret;
+    bool                writethrough;
+    int                 sector_per_byte;
+    BDRVAddCowState     *s = bs->opaque;
+
+    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+    if (ret != sizeof(header)) {
+        goto fail;
+    }
+
+    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC) {
+        ret = -EINVAL;
+        goto fail;
+    }
+    if (be32_to_cpu(header.version) != ADD_COW_VERSION) {
+        char version[64];
+        snprintf(version, sizeof(version), "ADD-COW version %d",
+            be32_to_cpu(header.version));
+        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
+            bs->device_name, "add-cow", version);
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    QEMU_BUILD_BUG_ON(sizeof(bs->backing_file) != sizeof(header.backing_file));
+    pstrcpy(bs->backing_file, sizeof(bs->backing_file), header.backing_file);
+
+    if (header.image_file[0] == '\0') {
+        ret = -ENOENT;
+        goto fail;
+    }
+    header.image_file[ADD_COW_FILE_LEN - 1] = '\0';
+    s->image_hd = bdrv_new("");
+    if (path_has_protocol(header.image_file)) {
+        pstrcpy(image_filename, sizeof(image_filename), header.image_file);
+    } else {
+        path_combine(image_filename, sizeof(image_filename),
+                     bs->filename, header.image_file);
+    }
+
+    image_drv = bdrv_find_format("raw");
+    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
+    if (ret < 0) {
+        bdrv_delete(s->image_hd);
+        goto fail;
+    }
+    bs->total_sectors = s->image_hd->total_sectors;
+    s->cluster_size = ADD_COW_CLUSTER_SIZE;
+    sector_per_byte = SECTORS_PER_CLUSTER * 8;
+    s->bitmap_size =
+        (bs->total_sectors + sector_per_byte - 1) / sector_per_byte;
+    writethrough = ((flags & BDRV_O_CACHE_WB) == 0);
+    s->bitmap_cache =
+        add_cow_cache_create(bs, ADD_COW_CACHE_SIZE, writethrough);
+
+    qemu_co_mutex_init(&s->lock);
+    return 0;
+fail:
+    if (s->bitmap_cache) {
+        add_cow_cache_destroy(bs, s->bitmap_cache);
+    }
+    return ret;
+}
+
+static void add_cow_close(BlockDriverState *bs)
+{
+    BDRVAddCowState *s = bs->opaque;
+    add_cow_cache_destroy(bs, s->bitmap_cache);
+    bdrv_delete(s->image_hd);
+}
+
+static bool is_allocated(BlockDriverState *bs, int64_t sector_num)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int64_t cluster_num = sector_num / SECTORS_PER_CLUSTER;
+    uint8_t *table = NULL;
+    int ret = add_cow_cache_get(bs, s->bitmap_cache,
+        sector_num, (void **)&table);
+
+    if (ret < 0) {
+        return 0;
+    }
+    return table[cluster_num / 8 % ADD_COW_CACHE_ENTRY_SIZE]
+        & (1 << (cluster_num % 8));
+}
+
+static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *num_same)
+{
+    int changed;
+    int64_t cluster_num;
+
+    if (nb_sectors == 0) {
+        *num_same = 0;
+        return 0;
+    }
+
+    cluster_num = sector_num / SECTORS_PER_CLUSTER;
+    changed = is_allocated(bs, sector_num);
+    *num_same =
+        MIN(nb_sectors, (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num);
+
+    for (cluster_num = sector_num / SECTORS_PER_CLUSTER + 1;
+        cluster_num <= (sector_num + nb_sectors - 1) / SECTORS_PER_CLUSTER;
+        cluster_num++) {
+        if (is_allocated(bs, cluster_num * SECTORS_PER_CLUSTER) != changed) {
+            break;
+        }
+        *num_same = MIN(nb_sectors,
+            (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num);
+    }
+
+    return changed;
+}
+
+static coroutine_fn int add_cow_co_readv(BlockDriverState *bs,
+    int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int cur_nr_sectors;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+    int n, ret = 0;
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    while (remaining_sectors != 0) {
+        cur_nr_sectors = remaining_sectors;
+        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
+            cur_nr_sectors = n;
+            qemu_iovec_reset(&hd_qiov);
+            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+            qemu_co_mutex_unlock(&s->lock);
+            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
+            qemu_co_mutex_lock(&s->lock);
+            if (ret < 0) {
+                goto fail;
+            }
+        } else {
+            cur_nr_sectors = n;
+            if (bs->backing_hd) {
+                qemu_iovec_reset(&hd_qiov);
+                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+                qemu_co_mutex_unlock(&s->lock);
+                ret = bdrv_co_readv(bs->backing_hd, sector_num,
+                                    n, &hd_qiov);
+                qemu_co_mutex_lock(&s->lock);
+                if (ret < 0) {
+                    goto fail;
+                }
+            } else {
+                qemu_iovec_memset(&hd_qiov, 0,
+                    BDRV_SECTOR_SIZE * cur_nr_sectors);
+            }
+        }
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static int coroutine_fn copy_sectors(BlockDriverState *bs,
+                                     int n_start, int n_end)
+{
+    BDRVAddCowState *s = bs->opaque;
+    QEMUIOVector qiov;
+    struct iovec iov;
+    int n, ret;
+
+    n = n_end - n_start;
+    if (n <= 0) {
+        return 0;
+    }
+
+    iov.iov_len = n * BDRV_SECTOR_SIZE;
+    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    ret = bdrv_co_readv(bs->backing_hd, n_start, n, &qiov);
+    if (ret < 0) {
+        goto out;
+    }
+    ret = bdrv_co_writev(s->image_hd, n_start, n, &qiov);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = 0;
+out:
+    qemu_vfree(iov.iov_base);
+    return ret;
+}
+
+static coroutine_fn int add_cow_co_writev(BlockDriverState *bs,
+        int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret = 0, i;
+    QEMUIOVector hd_qiov;
+    uint8_t *table;
+    bool changed = false;
+
+    qemu_co_mutex_lock(&s->lock);
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    ret = bdrv_co_writev(s->image_hd,
+                     sector_num,
+                     remaining_sectors, qiov);
+
+    if (ret < 0) {
+        goto fail;
+    }
+    /* copy content of unmodified sectors */
+    if (!is_cluster_head(sector_num) && !is_allocated(bs, sector_num)) {
+        qemu_co_mutex_unlock(&s->lock);
+        ret = copy_sectors(bs, sector_num & ~(SECTORS_PER_CLUSTER - 1),
+            sector_num);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
+    if (!is_cluster_tail(sector_num + remaining_sectors - 1)
+        && !is_allocated(bs, sector_num + remaining_sectors - 1)) {
+        qemu_co_mutex_unlock(&s->lock);
+        ret = copy_sectors(bs, sector_num + remaining_sectors,
+            ((sector_num + remaining_sectors) | (SECTORS_PER_CLUSTER - 1)) + 1);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
+    for (i = sector_num / SECTORS_PER_CLUSTER;
+        i <= (sector_num + remaining_sectors - 1) / SECTORS_PER_CLUSTER;
+        i++) {
+        ret = add_cow_cache_get(bs, s->bitmap_cache,
+            i * SECTORS_PER_CLUSTER, (void **)&table);
+        if (ret < 0) {
+            return 0;
+        }
+        if ((table[i / 8] & (1 << (i % 8))) == 0) {
+            table[i / 8] |= (1 << (i % 8));
+            changed = true;
+            add_cow_cache_entry_mark_dirty(s->bitmap_cache, table);
+        }
+
+    }
+    ret = 0;
+fail:
+    if (changed) {
+        ret = add_cow_cache_flush(bs, s->bitmap_cache);
+    }
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t size)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int sector_per_byte = SECTORS_PER_CLUSTER * 8;
+    int ret;
+    int64_t old_image_sector = s->image_hd->total_sectors;
+    int64_t bitmap_size =
+        (size / BDRV_SECTOR_SIZE + sector_per_byte - 1) / sector_per_byte;
+
+    ret = bdrv_truncate(bs->file,
+        sizeof(AddCowHeader) + bitmap_size);
+    if (ret < 0) {
+        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);
+        return ret;
+    }
+    return 0;
+}
+
+static coroutine_fn int add_cow_co_flush(BlockDriverState *bs)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret;
+
+    qemu_co_mutex_lock(&s->lock);
+    ret = add_cow_cache_flush(bs, s->bitmap_cache);
+    qemu_co_mutex_unlock(&s->lock);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return bdrv_co_flush(bs->file);
+}
+
+static QEMUOptionParameter add_cow_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    },
+    {
+        .name = BLOCK_OPT_IMAGE_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a image file"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FMT,
+        .type = OPT_STRING,
+        .help = "Image format of the base image"
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_add_cow = {
+    .format_name                = "add-cow",
+    .instance_size              = sizeof(BDRVAddCowState),
+    .bdrv_probe                 = add_cow_probe,
+    .bdrv_open                  = add_cow_open,
+    .bdrv_close                 = add_cow_close,
+    .bdrv_create                = add_cow_create,
+    .bdrv_co_readv              = add_cow_co_readv,
+    .bdrv_co_writev             = add_cow_co_writev,
+    .bdrv_truncate              = bdrv_add_cow_truncate,
+    .bdrv_co_is_allocated       = add_cow_is_allocated,
+
+    .create_options             = add_cow_create_options,
+    .bdrv_co_flush_to_disk      = add_cow_co_flush,
+};
+
+static void bdrv_add_cow_init(void)
+{
+    bdrv_register(&bdrv_add_cow);
+}
+
+block_init(bdrv_add_cow_init);
diff --git a/block/add-cow.h b/block/add-cow.h
new file mode 100644
index 0000000..46567bb
--- /dev/null
+++ b/block/add-cow.h
@@ -0,0 +1,83 @@ 
+/*
+ * QEMU ADD-COW Disk Format
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ *  Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_ADD_COW_H
+#define BLOCK_ADD_COW_H
+
+#define ADD_COW_MAGIC       (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
+                            ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
+                            ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
+                            ((uint64_t)'W' << 8) | 0xFF)
+#define ADD_COW_VERSION            1
+#define ADD_COW_FILE_LEN           1024
+#define ADD_COW_CACHE_SIZE         16
+#define ADD_COW_CACHE_ENTRY_SIZE   65536
+#define ADD_COW_CLUSTER_SIZE       65536
+#define SECTORS_PER_CLUSTER         (ADD_COW_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
+
+typedef struct AddCowHeader {
+    uint64_t        magic;
+    uint32_t        version;
+    char            backing_file[ADD_COW_FILE_LEN];
+    char            image_file[ADD_COW_FILE_LEN];
+    char            reserved[500];
+} QEMU_PACKED AddCowHeader;
+
+typedef struct AddCowCachedTable {
+    void    *table;
+    int64_t offset;
+    bool    dirty;
+    int     cache_hits;
+} AddCowCachedTable;
+
+typedef struct AddCowCache {
+    AddCowCachedTable       *entries;
+    int                     entry_size;
+    int                     size;
+    bool                    writethrough;
+} AddCowCache;
+
+typedef struct BDRVAddCowState {
+    BlockDriverState    *image_hd;
+    CoMutex             lock;
+    int                 cluster_size;
+    AddCowCache         *bitmap_cache;
+    uint64_t            bitmap_size;
+} BDRVAddCowState;
+
+/* convert sector_num to offset in bitmap */
+static inline int64_t offset_in_bitmap(int64_t sector_num)
+{
+    int64_t cluster_num = sector_num / SECTORS_PER_CLUSTER;
+    return cluster_num / 8;
+}
+
+static inline bool is_cluster_head(int64_t sector_num)
+{
+    return sector_num % SECTORS_PER_CLUSTER == 0;
+}
+
+static inline bool is_cluster_tail(int64_t sector_num)
+{
+    return (sector_num + 1) % SECTORS_PER_CLUSTER == 0;
+}
+
+AddCowCache *add_cow_cache_create(BlockDriverState *bs, int num_tables,
+    bool writethrough);
+int add_cow_cache_destroy(BlockDriverState *bs, AddCowCache *c);
+void add_cow_cache_entry_mark_dirty(AddCowCache *c, void *table);
+int add_cow_cache_get(BlockDriverState *bs, AddCowCache *c, uint64_t offset,
+    void **table);
+int add_cow_cache_flush(BlockDriverState *bs, AddCowCache *c);
+bool add_cow_cache_set_writethrough(BlockDriverState *bs, AddCowCache *c,
+    bool enable);
+#endif
diff --git a/block_int.h b/block_int.h
index 086832a..83e75ea 100644
--- a/block_int.h
+++ b/block_int.h
@@ -51,6 +51,7 @@ 
 #define BLOCK_OPT_PREALLOC      "preallocation"
 #define BLOCK_OPT_SUBFMT        "subformat"
 #define BLOCK_OPT_COMPAT_LEVEL  "compat"
+#define BLOCK_OPT_IMAGE_FILE    "image_file"
 
 typedef struct BdrvTrackedRequest BdrvTrackedRequest;
 
diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
new file mode 100644
index 0000000..28085d4
--- /dev/null
+++ b/docs/specs/add-cow.txt
@@ -0,0 +1,68 @@ 
+== General ==
+
+Raw file format does not support backing_file and copy on write feature.
+The add-cow image format makes it possible to use backing files with raw
+image by keeping a separate .add-cow metadata file.  Once all sectors
+have been written to in the raw image it is safe to discard the .add-cow
+and backing files and instead use the raw image directly.
+
+When using add-cow, procedures may like this:
+(ubuntu.img is a disk image which has been installed OS.)
+    1)  Create a raw image with the same size of ubuntu.img
+            qemu-img create -f raw test.raw 8G
+    2)  Create a add-cow image which will store dirty bitmap
+            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
+    3)  Run qemu with add-cow image
+            qemu -drive if=virtio,file=test.add-cow
+
+=Specification=
+
+The file format looks like this:
+
+ +---------------+--------------------------+
+ |     Header    |         COW bitmap       |
+ +---------------+--------------------------+
+
+All numbers in add-cow are stored in Big Endian byte order.
+
+== Header ==
+
+The Header is included in the first bytes:
+
+    Byte  0 -  7:       magic
+                        add-cow magic string ("ADD_COW\xff")
+
+          8 -  11:      version
+                        Version number (only valid value is 1 now)
+
+          12 - 1035:    backing_file
+                        backing_file file name related to add-cow file. All
+                        unused bytes are padded with zeros. Must not be longer
+                        than 1023 bytes.
+
+         1036 - 2059:   image_file
+                        image_file is a raw file. All unused bytes are padded
+                        with zeros. Must not be longer than 1023 bytes.
+
+         2060  - 2559:   The Reserved field is used to make sure Data field starts
+                        at the multiple of 512, not used currently. All bytes are
+                        filled with 0.
+
+== COW bitmap ==
+
+The "COW bitmap" field starts at the 2560th byte, stores a bitmap related to
+backing_file and image_file. The bitmap will track whether the sector in
+backing_file is dirty or not.
+
+Each bit in the bitmap indicates one cluster's status. One cluster includes 128
+sectors, then each bit indicates 512 * 128 = 64k bytes, So the size of bitmap is
+calculated according to virtual size of image_file. In each byte, bit 0 to 7
+will track the 1st to 7th cluster in sequence, bit orders in one byte look like:
+ +----+----+----+----+----+----+----+----+
+ | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
+ +----+----+----+----+----+----+----+----+
+
+If the bit is 0, indicates the sector has not been allocated in image_file, data
+should be loaded from backing_file while reading; if the bit is 1,  indicates the
+related sector has been dirty, should be loaded from image_file while reading.
+Writing to a sector causes the corresponding bit to be set to 1.
diff --git a/qemu-img.c b/qemu-img.c
index 0ae543c..29865b5 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -628,7 +628,9 @@  static int img_convert(int argc, char **argv)
     int c, ret = 0, n, n1, bs_n, bs_i, compress, cluster_size, cluster_sectors;
     int progress = 0, flags;
     const char *fmt, *out_fmt, *cache, *out_baseimg, *out_filename;
+    char image_filename[1024], backing_filename[1024];
     BlockDriver *drv, *proto_drv;
+    BlockDriver *image_drv, *backing_drv;
     BlockDriverState **bs = NULL, *out_bs = NULL;
     int64_t total_sectors, nb_sectors, sector_num, bs_offset;
     uint64_t bs_sectors;
@@ -637,6 +639,7 @@  static int img_convert(int argc, char **argv)
     BlockDriverInfo bdi;
     QEMUOptionParameter *param = NULL, *create_options = NULL;
     QEMUOptionParameter *out_baseimg_param;
+    QEMUOptionParameter *image_param = NULL;
     char *options = NULL;
     const char *snapshot_name = NULL;
     float local_progress;
@@ -797,6 +800,7 @@  static int img_convert(int argc, char **argv)
         out_baseimg = out_baseimg_param->value.s;
     }
 
+    image_param = get_option_parameter(param, BLOCK_OPT_SIZE);
     /* Check if compression is supported */
     if (compress) {
         QEMUOptionParameter *encryption =
@@ -828,6 +832,41 @@  static int img_convert(int argc, char **argv)
     }
 
     /* Create the new image */
+
+    if (0 == strcmp(out_fmt, "add-cow")) {
+        image_drv = bdrv_find_format("raw");
+        if (!drv) {
+            ret = -1;
+            goto out;
+        }
+        snprintf(image_filename, sizeof(image_filename),
+            "%s"".ct.raw", out_filename);
+        ret = bdrv_create(image_drv, image_filename, image_param);
+        if (ret < 0) {
+            error_report("%s: error while creating image_file: %s",
+                     image_filename, strerror(-ret));
+            goto out;
+        }
+        set_option_parameter(param, BLOCK_OPT_IMAGE_FILE, image_filename);
+
+        if (!out_baseimg) {
+            backing_drv = bdrv_find_format("qcow2");
+            if (!drv) {
+                ret = -1;
+                goto out;
+            }
+            snprintf(backing_filename, sizeof(backing_filename),
+                "%s"".ct.qcow2", out_filename);
+            ret = bdrv_create(backing_drv, backing_filename, image_param);
+            if (ret < 0) {
+                error_report("%s: error while creating backing_file: %s",
+                         backing_filename, strerror(-ret));
+                goto out;
+            }
+            set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
+                backing_filename);
+        }
+    }
     ret = bdrv_create(drv, out_filename, param);
     if (ret < 0) {
         if (ret == -ENOTSUP) {