Patchwork [v3] add add-cow file format

login
register
mail settings
Submitter Robert Wang
Date Oct. 26, 2011, 10:08 a.m.
Message ID <CAGrFBsgGoB4=5eDeyUunzGZOB3JaXpNJrgZNzToLNt2w1LxODw@mail.gmail.com>
Download mbox | patch
Permalink /patch/121869/
State New
Headers show

Comments

Robert Wang - Oct. 26, 2011, 10:08 a.m.
Please find version 4 in the attachment.

2011/10/23 shu ming <shuming@linux.vnet.ibm.com>:
> On 2011-10-13 0:23, Dong Xu Wang wrote:
>>
>> Add add-cow file format
>>
>> Signed-off-by: Dong Xu Wang<wdongxu@linux.vnet.ibm.com>
>> ---
>>  Makefile.objs          |    1 +
>>  block.c                |    2 +-
>>  block.h                |    1 +
>>  block/add-cow.c        |  412
>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>  block_int.h            |    1 +
>>  docs/specs/add-cow.txt |   45 ++++++
>>  6 files changed, 461 insertions(+), 1 deletions(-)
>>  create mode 100644 block/add-cow.c
>>  create mode 100644 docs/specs/add-cow.txt
>>
>> diff --git a/Makefile.objs b/Makefile.objs
>> index c849e51..624c04c 100644
>> --- a/Makefile.objs
>> +++ b/Makefile.objs
>> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>
>>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o
>> vpc.o vvfat.o
>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o
>> qcow2-snapshot.o qcow2-cache.o
>> +block-nested-y += add-cow.o
>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o
>> qed-cluster.o
>>  block-nested-y += qed-check.o
>>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>> diff --git a/block.c b/block.c
>> index e865fab..c25241d 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -106,7 +106,7 @@ int is_windows_drive(const char *filename)
>>  #endif
>>
>>  /* check if the path starts with "<protocol>:" */
>> -static int path_has_protocol(const char *path)
>> +int path_has_protocol(const char *path)
>>  {
>>  #ifdef _WIN32
>>      if (is_windows_drive(path) ||
>> diff --git a/block.h b/block.h
>> index 16bfa0a..8b09f12 100644
>> --- a/block.h
>> +++ b/block.h
>> @@ -256,6 +256,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size,
>> QEMUSnapshotInfo *sn);
>>
>>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>>  int path_is_absolute(const char *path);
>> +int path_has_protocol(const char *path);
>>  void path_combine(char *dest, int dest_size,
>>                    const char *base_path,
>>                    const char *filename);
>> diff --git a/block/add-cow.c b/block/add-cow.c
>> new file mode 100644
>> index 0000000..d2538a2
>> --- /dev/null
>> +++ b/block/add-cow.c
>> @@ -0,0 +1,412 @@
>> +#include "qemu-common.h"
>> +#include "block_int.h"
>> +#include "module.h"
>> +
>> +#define ADD_COW_MAGIC  (((uint64_t)'A'<<  56) | ((uint64_t)'D'<<  48) | \
>> +                        ((uint64_t)'D'<<  40) | ((uint64_t)'_'<<  32) | \
>> +                        ((uint64_t)'C'<<  24) | ((uint64_t)'O'<<  16) | \
>> +                        ((uint64_t)'W'<<  8) | 0xFF)
>> +#define ADD_COW_VERSION 1
>> +
>> +typedef struct AddCowHeader {
>> +    uint64_t magic;
>> +    uint32_t version;
>> +    char backing_file[1024];
>> +    char image_file[1024];
>
> 1024 is a magic number for me.  Can we have a meaningful macro?
>
>
>> +    uint64_t size;
>> +} QEMU_PACKED AddCowHeader;
>> +
>> +typedef struct BDRVAddCowState {
>> +    char image_file[1024];
>> +    BlockDriverState *image_hd;
>> +    uint8_t *bitmap;
>> +    uint64_t bitmap_size;
>> +} BDRVAddCowState;
>> +
>> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char
>> *filename)
>> +{
>> +    const AddCowHeader *header = (const void *)buf;
>> +
>> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC&&
>> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
>> +        return 100;
>> +    } else {
>> +        return 0;
>> +    }
>> +}
>> +
>> +static int add_cow_open(BlockDriverState *bs, int flags)
>> +{
>> +    AddCowHeader header;
>> +    int64_t size;
>> +    char image_filename[1024];
>> +    int image_flags;
>> +    BlockDriver *image_drv = NULL;
>> +    int ret;
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +
>> +    ret = bdrv_pread(bs->file, 0,&header, sizeof(header));
>> +    if (ret != sizeof(header)) {
>> +        goto fail;
>> +    }
>> +
>> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
>> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
>> +        ret = -1;
>> +        goto fail;
>> +    }
>> +
>> +    size = be64_to_cpu(header.size);
>> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
>> +
>> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) !=
>> sizeof(header.image_file));
>> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
>> +            header.backing_file);
>> +    pstrcpy(state->image_file, sizeof(state->image_file),
>> +            header.image_file);
>> +
>> +    state->bitmap_size = ((bs->total_sectors + 7)>>  3);
>> +    state->bitmap = g_malloc0(state->bitmap_size);
>> +
>> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
>> +            state->bitmap_size);
>> +    if (ret != state->bitmap_size) {
>> +        goto fail;
>> +    }
>> +   /* If there is a image_file, must be together with backing_file */
>> +    if (state->image_file[0] != '\0') {
>> +        state->image_hd = bdrv_new("");
>> +        /* Relative to image or working dir, need discussion */
>> +        if (path_has_protocol(state->image_file)) {
>> +            pstrcpy(image_filename, sizeof(image_filename),
>> +                    state->image_file);
>> +        } else {
>> +            path_combine(image_filename, sizeof(image_filename),
>> +                         bs->filename, state->image_file);
>> +        }
>> +
>> +        image_drv = bdrv_find_format("raw");
>> +        image_flags =
>> +             (flags&  (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) |
>> BDRV_O_RDWR;
>> +        state->image_hd->keep_read_only = 0;
>> +
>> +        ret = bdrv_open(state->image_hd, image_filename, image_flags,
>> +                image_drv);
>> +        if (ret<  0) {
>> +            goto fail;
>> +        }
>> +    } else {
>> +        ret = -ENOENT;
>> +        goto fail;
>> +    }
>> +    return 0;
>> + fail:
>> +    if (state->bitmap) {
>> +        g_free(state->bitmap);
>> +        state->bitmap = NULL;
>> +    }
>> +    return ret;
>> +}
>> +
>> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
>> +{
>> +    uint64_t offset = bitnum / 8;
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    state->bitmap[offset] |= (1<<  (bitnum % 8));
>> +}
>> +
>> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
>> +{
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    uint64_t offset = bitnum / 8;
>> +    return !!(state->bitmap[offset]&  (1<<  (bitnum % 8)));
>> +}
>> +
>> +static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
>> +        int nb_sectors, int *num_same)
>> +{
>> +    int changed;
>> +    uint64_t bitmap_size = ((BDRVAddCowState
>> *)(bs->opaque))->bitmap_size;
>> +
>> +    /* Beyond the end of bitmap, return error or read from backing_file?
>> */
>> +    if (((sector_num + nb_sectors + 7) / 8)>  bitmap_size) {
>> +        return 0;
>> +    }
>> +
>> +    if (nb_sectors == 0) {
>> +        *num_same = nb_sectors;
>> +        return 0;
>> +    }
>> +
>> +    changed = is_bit_set(bs, sector_num);
>> +    if (changed<  0) {
>> +        return 0;
>> +    }
>> +
>> +    for (*num_same = 1; *num_same<  nb_sectors; (*num_same)++) {
>> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
>> +            break;
>> +        }
>> +    }
>> +
>> +    return changed;
>> +}
>> +
>> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t
>> sector_num,
>> +        int nb_sectors)
>> +{
>> +    int i, ret = 0;
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    uint64_t start_pos = sector_num / 8;
>> +    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
>> +
>> +    if (start_pos>  state->bitmap_size) {
>> +        return -1;
>> +    }
>> +
>> +    for (i = 0; i<  nb_sectors; i++) {
>> +        add_cow_set_bit(bs, sector_num + i);
>> +    }
>> +    ret = bdrv_pwrite_sync(bs->file,
>> +        sizeof(AddCowHeader) + start_pos,
>> +        state->bitmap + start_pos,
>> +        MIN(((end_pos - start_pos)&  (~512)) + 512, state->bitmap_size -
>> start_pos));
>> +    return ret;
>> +}
>> +
>> +static void add_cow_close(BlockDriverState *bs)
>> +{
>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>> +    if (state->bitmap) {
>> +        g_free(state->bitmap);
>> +        state->bitmap = NULL;
>> +    }
>> +}
>> +
>> +static int add_cow_create(const char *filename, QEMUOptionParameter
>> *options)
>> +{
>> +    AddCowHeader header;
>> +    int64_t image_sectors = 0;
>> +    const char *backing_filename = NULL;
>> +    const char *image_filename = NULL;
>> +    int ret;
>> +    BlockDriverState *bs, *image_bs = NULL;
>> +
>> +    while (options&&  options->name) {
>> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
>> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
>> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
>> +            backing_filename = options->value.s;
>> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
>> +            image_filename = options->value.s;
>> +        }
>> +        options++;
>> +    }
>> +
>> +    if (!backing_filename || !image_filename) {
>> +        return -EINVAL;
>> +    }
>> +    /* Make sure image file exists */
>> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
>> +            | BDRV_O_CACHE_WB);
>> +    if (ret<  0) {
>> +        return ret;
>> +    }
>> +    bdrv_close(image_bs);
>> +    ret = bdrv_create_file(filename, NULL);
>> +    if (ret<  0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
>> +    if (ret<  0) {
>> +        return ret;
>> +    }
>> +
>> +    memset(&header, 0, sizeof(header));
>> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
>> +    header.version = cpu_to_be32(ADD_COW_VERSION);
>> +    pstrcpy(header.backing_file, sizeof(header.backing_file),
>> backing_filename);
>> +    pstrcpy(header.image_file, sizeof(header.image_file),
>> image_filename);
>> +    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
>> +
>> +    ret = bdrv_pwrite(bs, 0,&header, sizeof(header));
>> +    if (ret<  0) {
>> +        bdrv_close(bs);
>> +        return ret;
>> +    }
>> +    bdrv_close(bs);
>> +
>> +    BlockDriver *drv = bdrv_find_format("add-cow");
>> +    assert(drv != NULL);
>> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
>> +    if (ret<  0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
>> +fail:
>
> "fail" label is not used in this function.
>
>
>> +    bdrv_close(bs);
>> +    return ret;
>> +}
>> +
>> +static int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
>> +                         int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int cur_nr_sectors;
>> +    uint64_t bytes_done = 0;
>> +    QEMUIOVector hd_qiov;
>> +    int n, ret = 0;
>> +
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +    while (remaining_sectors != 0) {
>> +        cur_nr_sectors = remaining_sectors;
>> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors,&n)) {
>> +            cur_nr_sectors = n;
>> +            qemu_iovec_reset(&hd_qiov);
>> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>> +            ret = bdrv_co_readv(s->image_hd, sector_num, n,&hd_qiov);
>> +            if (ret<  0) {
>> +                goto fail;
>> +            }
>> +        } else {
>> +            cur_nr_sectors = n;
>> +            if (bs->backing_hd) {
>> +                qemu_iovec_reset(&hd_qiov);
>> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
>> +                                    n,&hd_qiov);
>> +                if (ret<  0) {
>> +                    goto fail;
>> +                }
>> +            } else {
>> +                qemu_iovec_reset(&hd_qiov);
>> +                qemu_iovec_memset(&hd_qiov, 0,
>> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);
>> +            }
>> +        }
>> +        remaining_sectors -= cur_nr_sectors;
>> +        sector_num += cur_nr_sectors;
>> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
>> +    }
>> +fail:
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
>> +                          int remaining_sectors, QEMUIOVector *qiov)
>> +{
>> +    BDRVAddCowState *s = bs->opaque;
>> +    int ret = 0;
>> +    QEMUIOVector hd_qiov;
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> +
>
> It looks like that the memory where "bs" is pointing to should be protected
> by a lock. Because "bs"
> was passed to several BlockDriver entries like ".bdrv_co_writev',
> 'bdrv_co_readv'&etc which can be executed
> in parallel to access the memory where "bs" is pointing to.
>
>> +    qemu_iovec_reset(&hd_qiov);
>> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors *
>> BDRV_SECTOR_SIZE);
>> +    ret = bdrv_co_writev(s->image_hd,
>> +                     sector_num,
>> +                     remaining_sectors,&hd_qiov);
>> +    if (ret<  0) {
>> +        goto fail;
>> +    }
>> +
>> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
>> +    if (ret<  0) {
>> +        goto fail;
>> +    }
>> +fail:
>> +    qemu_iovec_destroy(&hd_qiov);
>> +    return ret;
>> +}
>> +
>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
>> +{
>> +    int ret = 0;
>> +    int64_t image_sectors = offset/BDRV_SECTOR_SIZE;
>> +    int64_t be_offset = cpu_to_be64(offset);
>> +    BDRVAddCowState *state = bs->opaque;
>> +    ret = bdrv_truncate(bs->file, ((image_sectors + 7)>>  3)
>> +        + sizeof(AddCowHeader));
>> +    if (ret<  0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
>> +&be_offset, sizeof(uint64_t));
>> +    if (ret<  0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_truncate(state->image_hd, offset);
>> +    return ret;
>> +}
>> +
>> +static QEMUOptionParameter add_cow_create_options[] = {
>> +    {
>> +        .name = BLOCK_OPT_SIZE,
>> +        .type = OPT_SIZE,
>> +        .help = "Virtual disk size"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_BACKING_FILE,
>> +        .type = OPT_STRING,
>> +        .help = "File name of a base image"
>> +    },
>> +    {
>> +        .name = BLOCK_OPT_IMAGE_FILE,
>> +        .type = OPT_STRING,
>> +        .help = "File name of a image file"
>> +    },
>> +    { NULL }
>> +};
>> +
>> +static int add_cow_flush(BlockDriverState *bs)
>> +{
>> +    BDRVAddCowState *state = bs->opaque;
>> +    int ret = bdrv_flush(state->image_hd);
>> +    if (ret<  0) {
>> +        return ret;
>> +    }
>> +    return bdrv_flush(bs->file);
>> +}
>> +
>> +static BlockDriverAIOCB *add_cow_aio_flush(BlockDriverState *bs,
>> +        BlockDriverCompletionFunc *cb, void *opaque)
>> +{
>> +    BDRVAddCowState *state = bs->opaque;
>> +    int ret = bdrv_flush(state->image_hd);
>> +    if (ret<  0) {
>> +        return NULL;
>> +    }
>> +
>> +    return bdrv_aio_flush(bs->file, cb, opaque);
>> +}
>> +
>> +static BlockDriver bdrv_add_cow = {
>> +    .format_name        = "add-cow",
>> +    .instance_size      = sizeof(BDRVAddCowState),
>> +    .bdrv_probe         = add_cow_probe,
>> +    .bdrv_open          = add_cow_open,
>> +    .bdrv_close         = add_cow_close,
>> +    .bdrv_create        = add_cow_create,
>> +    .bdrv_is_allocated  = add_cow_is_allocated,
>> +
>> +    .bdrv_co_readv      = add_cow_co_readv,
>> +    .bdrv_co_writev     = add_cow_co_writev,
>> +    .bdrv_truncate      = bdrv_add_cow_truncate,
>> +
>> +    .create_options     = add_cow_create_options,
>> +    .bdrv_flush         = add_cow_flush,
>> +    .bdrv_aio_flush     = add_cow_aio_flush,
>> +};
>> +
>> +static void bdrv_add_cow_init(void)
>> +{
>> +    bdrv_register(&bdrv_add_cow);
>> +}
>> +
>> +block_init(bdrv_add_cow_init);
>> diff --git a/block_int.h b/block_int.h
>> index 8c3b863..2b26fdc 100644
>> --- a/block_int.h
>> +++ b/block_int.h
>> @@ -42,6 +42,7 @@
>>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>>  #define BLOCK_OPT_PREALLOC      "preallocation"
>>  #define BLOCK_OPT_SUBFMT        "subformat"
>> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>>
>>  typedef struct AIOPool {
>>      void (*cancel)(BlockDriverAIOCB *acb);
>> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
>> new file mode 100644
>> index 0000000..0443c6c
>> --- /dev/null
>> +++ b/docs/specs/add-cow.txt
>> @@ -0,0 +1,45 @@
>> +== General ==
>> +
>> +Raw file format does not support backing_file and copy on write feature.
>> Then
>> +you can use add-cow file realize these features.
>> +
>> +When using add-cow, procedures may like this:
>> +(ubuntu.img is a disk image which has been installed OS.)
>> +    1)  Create a raw image with the same size of ubuntu.img
>> +            qemu-img create -f raw test.raw 8G
>> +    2)  Create a add-cow image which will store dirty bitmap
>> +            qemu-img create -f add-cow test.add-cow -o
>> backing_file=ubuntu.img,image_file=test.raw
>> +    3)  Run qemu with add-cow image
>> +            qemu -drive if=virtio,file=test.add-cow
>> +
>> +=Specification=
>> +
>> +The file format looks like this:
>> +
>> + +----------+----------+----------+-----+
>> + |  Header  |   Data   |   Data   | ... |
>> + +----------+----------+----------+-----+
>> +
>> + All numbers in add-cow are stored in Big Endian byte order.
>> +
>> +
>> +== Header ==
>> +
>> +The Header is included in the first bytes:
>> +
>> +    Byte  0 -  7:       magic
>> +                        add-cow magic string ("ADD_COW\xff")
>> +
>> +          8 -  11:      version
>> +                        Version number (only valid value is 1 now)
>> +
>> +          12 - 1035:    backing_file
>> +                        backing_file file name related to add-cow file.
>> While
>> +                        using backing_file, must together with
>> image_file.
>> +
>> +         1036 - 2059:   image_file
>> +                        image_file is a raw file, While using image_file,
>> must
>> +                        together with image_file.
>> +
>> +         2060 - 2067:   size
>> +                        Virtual disk size of image_file in bytes.
>
>
>
>
shu ming - Oct. 27, 2011, 5:34 a.m.
Looks good to me.  A nit, it seems that bdrv_flush is not supported 
anymore in upstream.
bdrv_co_flush should be used instead if you update your workspace to 
latest one.
On 2011-10-26 18:08, Robert Wang wrote:
> Please find version 4 in the attachment.
>
> 2011/10/23 shu ming<shuming@linux.vnet.ibm.com>:
>> On 2011-10-13 0:23, Dong Xu Wang wrote:
>>> Add add-cow file format
>>>
>>> Signed-off-by: Dong Xu Wang<wdongxu@linux.vnet.ibm.com>
>>> ---
>>>   Makefile.objs          |    1 +
>>>   block.c                |    2 +-
>>>   block.h                |    1 +
>>>   block/add-cow.c        |  412
>>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>>   block_int.h            |    1 +
>>>   docs/specs/add-cow.txt |   45 ++++++
>>>   6 files changed, 461 insertions(+), 1 deletions(-)
>>>   create mode 100644 block/add-cow.c
>>>   create mode 100644 docs/specs/add-cow.txt
>>>
>>> diff --git a/Makefile.objs b/Makefile.objs
>>> index c849e51..624c04c 100644
>>> --- a/Makefile.objs
>>> +++ b/Makefile.objs
>>> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>>
>>>   block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o
>>> vpc.o vvfat.o
>>>   block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o
>>> qcow2-snapshot.o qcow2-cache.o
>>> +block-nested-y += add-cow.o
>>>   block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o
>>> qed-cluster.o
>>>   block-nested-y += qed-check.o
>>>   block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>>> diff --git a/block.c b/block.c
>>> index e865fab..c25241d 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -106,7 +106,7 @@ int is_windows_drive(const char *filename)
>>>   #endif
>>>
>>>   /* check if the path starts with "<protocol>:" */
>>> -static int path_has_protocol(const char *path)
>>> +int path_has_protocol(const char *path)
>>>   {
>>>   #ifdef _WIN32
>>>       if (is_windows_drive(path) ||
>>> diff --git a/block.h b/block.h
>>> index 16bfa0a..8b09f12 100644
>>> --- a/block.h
>>> +++ b/block.h
>>> @@ -256,6 +256,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size,
>>> QEMUSnapshotInfo *sn);
>>>
>>>   char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>>>   int path_is_absolute(const char *path);
>>> +int path_has_protocol(const char *path);
>>>   void path_combine(char *dest, int dest_size,
>>>                     const char *base_path,
>>>                     const char *filename);
>>> diff --git a/block/add-cow.c b/block/add-cow.c
>>> new file mode 100644
>>> index 0000000..d2538a2
>>> --- /dev/null
>>> +++ b/block/add-cow.c
>>> @@ -0,0 +1,412 @@
>>> +#include "qemu-common.h"
>>> +#include "block_int.h"
>>> +#include "module.h"
>>> +
>>> +#define ADD_COW_MAGIC  (((uint64_t)'A'<<    56) | ((uint64_t)'D'<<    48) | \
>>> +                        ((uint64_t)'D'<<    40) | ((uint64_t)'_'<<    32) | \
>>> +                        ((uint64_t)'C'<<    24) | ((uint64_t)'O'<<    16) | \
>>> +                        ((uint64_t)'W'<<    8) | 0xFF)
>>> +#define ADD_COW_VERSION 1
>>> +
>>> +typedef struct AddCowHeader {
>>> +    uint64_t magic;
>>> +    uint32_t version;
>>> +    char backing_file[1024];
>>> +    char image_file[1024];
>> 1024 is a magic number for me.  Can we have a meaningful macro?
>>
>>
>>> +    uint64_t size;
>>> +} QEMU_PACKED AddCowHeader;
>>> +
>>> +typedef struct BDRVAddCowState {
>>> +    char image_file[1024];
>>> +    BlockDriverState *image_hd;
>>> +    uint8_t *bitmap;
>>> +    uint64_t bitmap_size;
>>> +} BDRVAddCowState;
>>> +
>>> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char
>>> *filename)
>>> +{
>>> +    const AddCowHeader *header = (const void *)buf;
>>> +
>>> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC&&
>>> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
>>> +        return 100;
>>> +    } else {
>>> +        return 0;
>>> +    }
>>> +}
>>> +
>>> +static int add_cow_open(BlockDriverState *bs, int flags)
>>> +{
>>> +    AddCowHeader header;
>>> +    int64_t size;
>>> +    char image_filename[1024];
>>> +    int image_flags;
>>> +    BlockDriver *image_drv = NULL;
>>> +    int ret;
>>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>>> +
>>> +    ret = bdrv_pread(bs->file, 0,&header, sizeof(header));
>>> +    if (ret != sizeof(header)) {
>>> +        goto fail;
>>> +    }
>>> +
>>> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
>>> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
>>> +        ret = -1;
>>> +        goto fail;
>>> +    }
>>> +
>>> +    size = be64_to_cpu(header.size);
>>> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
>>> +
>>> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) !=
>>> sizeof(header.image_file));
>>> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
>>> +            header.backing_file);
>>> +    pstrcpy(state->image_file, sizeof(state->image_file),
>>> +            header.image_file);
>>> +
>>> +    state->bitmap_size = ((bs->total_sectors + 7)>>    3);
>>> +    state->bitmap = g_malloc0(state->bitmap_size);
>>> +
>>> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
>>> +            state->bitmap_size);
>>> +    if (ret != state->bitmap_size) {
>>> +        goto fail;
>>> +    }
>>> +   /* If there is a image_file, must be together with backing_file */
>>> +    if (state->image_file[0] != '\0') {
>>> +        state->image_hd = bdrv_new("");
>>> +        /* Relative to image or working dir, need discussion */
>>> +        if (path_has_protocol(state->image_file)) {
>>> +            pstrcpy(image_filename, sizeof(image_filename),
>>> +                    state->image_file);
>>> +        } else {
>>> +            path_combine(image_filename, sizeof(image_filename),
>>> +                         bs->filename, state->image_file);
>>> +        }
>>> +
>>> +        image_drv = bdrv_find_format("raw");
>>> +        image_flags =
>>> +             (flags&    (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) |
>>> BDRV_O_RDWR;
>>> +        state->image_hd->keep_read_only = 0;
>>> +
>>> +        ret = bdrv_open(state->image_hd, image_filename, image_flags,
>>> +                image_drv);
>>> +        if (ret<    0) {
>>> +            goto fail;
>>> +        }
>>> +    } else {
>>> +        ret = -ENOENT;
>>> +        goto fail;
>>> +    }
>>> +    return 0;
>>> + fail:
>>> +    if (state->bitmap) {
>>> +        g_free(state->bitmap);
>>> +        state->bitmap = NULL;
>>> +    }
>>> +    return ret;
>>> +}
>>> +
>>> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
>>> +{
>>> +    uint64_t offset = bitnum / 8;
>>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>>> +    state->bitmap[offset] |= (1<<    (bitnum % 8));
>>> +}
>>> +
>>> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
>>> +{
>>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>>> +    uint64_t offset = bitnum / 8;
>>> +    return !!(state->bitmap[offset]&    (1<<    (bitnum % 8)));
>>> +}
>>> +
>>> +static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
>>> +        int nb_sectors, int *num_same)
>>> +{
>>> +    int changed;
>>> +    uint64_t bitmap_size = ((BDRVAddCowState
>>> *)(bs->opaque))->bitmap_size;
>>> +
>>> +    /* Beyond the end of bitmap, return error or read from backing_file?
>>> */
>>> +    if (((sector_num + nb_sectors + 7) / 8)>    bitmap_size) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    if (nb_sectors == 0) {
>>> +        *num_same = nb_sectors;
>>> +        return 0;
>>> +    }
>>> +
>>> +    changed = is_bit_set(bs, sector_num);
>>> +    if (changed<    0) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    for (*num_same = 1; *num_same<    nb_sectors; (*num_same)++) {
>>> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    return changed;
>>> +}
>>> +
>>> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t
>>> sector_num,
>>> +        int nb_sectors)
>>> +{
>>> +    int i, ret = 0;
>>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>>> +    uint64_t start_pos = sector_num / 8;
>>> +    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
>>> +
>>> +    if (start_pos>    state->bitmap_size) {
>>> +        return -1;
>>> +    }
>>> +
>>> +    for (i = 0; i<    nb_sectors; i++) {
>>> +        add_cow_set_bit(bs, sector_num + i);
>>> +    }
>>> +    ret = bdrv_pwrite_sync(bs->file,
>>> +        sizeof(AddCowHeader) + start_pos,
>>> +        state->bitmap + start_pos,
>>> +        MIN(((end_pos - start_pos)&    (~512)) + 512, state->bitmap_size -
>>> start_pos));
>>> +    return ret;
>>> +}
>>> +
>>> +static void add_cow_close(BlockDriverState *bs)
>>> +{
>>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>>> +    if (state->bitmap) {
>>> +        g_free(state->bitmap);
>>> +        state->bitmap = NULL;
>>> +    }
>>> +}
>>> +
>>> +static int add_cow_create(const char *filename, QEMUOptionParameter
>>> *options)
>>> +{
>>> +    AddCowHeader header;
>>> +    int64_t image_sectors = 0;
>>> +    const char *backing_filename = NULL;
>>> +    const char *image_filename = NULL;
>>> +    int ret;
>>> +    BlockDriverState *bs, *image_bs = NULL;
>>> +
>>> +    while (options&&    options->name) {
>>> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
>>> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
>>> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
>>> +            backing_filename = options->value.s;
>>> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
>>> +            image_filename = options->value.s;
>>> +        }
>>> +        options++;
>>> +    }
>>> +
>>> +    if (!backing_filename || !image_filename) {
>>> +        return -EINVAL;
>>> +    }
>>> +    /* Make sure image file exists */
>>> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
>>> +            | BDRV_O_CACHE_WB);
>>> +    if (ret<    0) {
>>> +        return ret;
>>> +    }
>>> +    bdrv_close(image_bs);
>>> +    ret = bdrv_create_file(filename, NULL);
>>> +    if (ret<    0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
>>> +    if (ret<    0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    memset(&header, 0, sizeof(header));
>>> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
>>> +    header.version = cpu_to_be32(ADD_COW_VERSION);
>>> +    pstrcpy(header.backing_file, sizeof(header.backing_file),
>>> backing_filename);
>>> +    pstrcpy(header.image_file, sizeof(header.image_file),
>>> image_filename);
>>> +    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
>>> +
>>> +    ret = bdrv_pwrite(bs, 0,&header, sizeof(header));
>>> +    if (ret<    0) {
>>> +        bdrv_close(bs);
>>> +        return ret;
>>> +    }
>>> +    bdrv_close(bs);
>>> +
>>> +    BlockDriver *drv = bdrv_find_format("add-cow");
>>> +    assert(drv != NULL);
>>> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
>>> +    if (ret<    0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
>>> +fail:
>> "fail" label is not used in this function.
>>
>>
>>> +    bdrv_close(bs);
>>> +    return ret;
>>> +}
>>> +
>>> +static int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
>>> +                         int remaining_sectors, QEMUIOVector *qiov)
>>> +{
>>> +    BDRVAddCowState *s = bs->opaque;
>>> +    int cur_nr_sectors;
>>> +    uint64_t bytes_done = 0;
>>> +    QEMUIOVector hd_qiov;
>>> +    int n, ret = 0;
>>> +
>>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>>> +    while (remaining_sectors != 0) {
>>> +        cur_nr_sectors = remaining_sectors;
>>> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors,&n)) {
>>> +            cur_nr_sectors = n;
>>> +            qemu_iovec_reset(&hd_qiov);
>>> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>>> +            ret = bdrv_co_readv(s->image_hd, sector_num, n,&hd_qiov);
>>> +            if (ret<    0) {
>>> +                goto fail;
>>> +            }
>>> +        } else {
>>> +            cur_nr_sectors = n;
>>> +            if (bs->backing_hd) {
>>> +                qemu_iovec_reset(&hd_qiov);
>>> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>>> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
>>> +                                    n,&hd_qiov);
>>> +                if (ret<    0) {
>>> +                    goto fail;
>>> +                }
>>> +            } else {
>>> +                qemu_iovec_reset(&hd_qiov);
>>> +                qemu_iovec_memset(&hd_qiov, 0,
>>> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);
>>> +            }
>>> +        }
>>> +        remaining_sectors -= cur_nr_sectors;
>>> +        sector_num += cur_nr_sectors;
>>> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
>>> +    }
>>> +fail:
>>> +    qemu_iovec_destroy(&hd_qiov);
>>> +    return ret;
>>> +}
>>> +
>>> +static int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
>>> +                          int remaining_sectors, QEMUIOVector *qiov)
>>> +{
>>> +    BDRVAddCowState *s = bs->opaque;
>>> +    int ret = 0;
>>> +    QEMUIOVector hd_qiov;
>>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>>> +
>> It looks like that the memory where "bs" is pointing to should be protected
>> by a lock. Because "bs"
>> was passed to several BlockDriver entries like ".bdrv_co_writev',
>> 'bdrv_co_readv'&etc which can be executed
>> in parallel to access the memory where "bs" is pointing to.
>>
>>> +    qemu_iovec_reset(&hd_qiov);
>>> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors *
>>> BDRV_SECTOR_SIZE);
>>> +    ret = bdrv_co_writev(s->image_hd,
>>> +                     sector_num,
>>> +                     remaining_sectors,&hd_qiov);
>>> +    if (ret<    0) {
>>> +        goto fail;
>>> +    }
>>> +
>>> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
>>> +    if (ret<    0) {
>>> +        goto fail;
>>> +    }
>>> +fail:
>>> +    qemu_iovec_destroy(&hd_qiov);
>>> +    return ret;
>>> +}
>>> +
>>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
>>> +{
>>> +    int ret = 0;
>>> +    int64_t image_sectors = offset/BDRV_SECTOR_SIZE;
>>> +    int64_t be_offset = cpu_to_be64(offset);
>>> +    BDRVAddCowState *state = bs->opaque;
>>> +    ret = bdrv_truncate(bs->file, ((image_sectors + 7)>>    3)
>>> +        + sizeof(AddCowHeader));
>>> +    if (ret<    0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
>>> +&be_offset, sizeof(uint64_t));
>>> +    if (ret<    0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    ret = bdrv_truncate(state->image_hd, offset);
>>> +    return ret;
>>> +}
>>> +
>>> +static QEMUOptionParameter add_cow_create_options[] = {
>>> +    {
>>> +        .name = BLOCK_OPT_SIZE,
>>> +        .type = OPT_SIZE,
>>> +        .help = "Virtual disk size"
>>> +    },
>>> +    {
>>> +        .name = BLOCK_OPT_BACKING_FILE,
>>> +        .type = OPT_STRING,
>>> +        .help = "File name of a base image"
>>> +    },
>>> +    {
>>> +        .name = BLOCK_OPT_IMAGE_FILE,
>>> +        .type = OPT_STRING,
>>> +        .help = "File name of a image file"
>>> +    },
>>> +    { NULL }
>>> +};
>>> +
>>> +static int add_cow_flush(BlockDriverState *bs)
>>> +{
>>> +    BDRVAddCowState *state = bs->opaque;
>>> +    int ret = bdrv_flush(state->image_hd);
>>> +    if (ret<    0) {
>>> +        return ret;
>>> +    }
>>> +    return bdrv_flush(bs->file);
>>> +}
>>> +
>>> +static BlockDriverAIOCB *add_cow_aio_flush(BlockDriverState *bs,
>>> +        BlockDriverCompletionFunc *cb, void *opaque)
>>> +{
>>> +    BDRVAddCowState *state = bs->opaque;
>>> +    int ret = bdrv_flush(state->image_hd);
>>> +    if (ret<    0) {
>>> +        return NULL;
>>> +    }
>>> +
>>> +    return bdrv_aio_flush(bs->file, cb, opaque);
>>> +}
>>> +
>>> +static BlockDriver bdrv_add_cow = {
>>> +    .format_name        = "add-cow",
>>> +    .instance_size      = sizeof(BDRVAddCowState),
>>> +    .bdrv_probe         = add_cow_probe,
>>> +    .bdrv_open          = add_cow_open,
>>> +    .bdrv_close         = add_cow_close,
>>> +    .bdrv_create        = add_cow_create,
>>> +    .bdrv_is_allocated  = add_cow_is_allocated,
>>> +
>>> +    .bdrv_co_readv      = add_cow_co_readv,
>>> +    .bdrv_co_writev     = add_cow_co_writev,
>>> +    .bdrv_truncate      = bdrv_add_cow_truncate,
>>> +
>>> +    .create_options     = add_cow_create_options,
>>> +    .bdrv_flush         = add_cow_flush,
>>> +    .bdrv_aio_flush     = add_cow_aio_flush,
>>> +};
>>> +
>>> +static void bdrv_add_cow_init(void)
>>> +{
>>> +    bdrv_register(&bdrv_add_cow);
>>> +}
>>> +
>>> +block_init(bdrv_add_cow_init);
>>> diff --git a/block_int.h b/block_int.h
>>> index 8c3b863..2b26fdc 100644
>>> --- a/block_int.h
>>> +++ b/block_int.h
>>> @@ -42,6 +42,7 @@
>>>   #define BLOCK_OPT_TABLE_SIZE    "table_size"
>>>   #define BLOCK_OPT_PREALLOC      "preallocation"
>>>   #define BLOCK_OPT_SUBFMT        "subformat"
>>> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>>>
>>>   typedef struct AIOPool {
>>>       void (*cancel)(BlockDriverAIOCB *acb);
>>> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
>>> new file mode 100644
>>> index 0000000..0443c6c
>>> --- /dev/null
>>> +++ b/docs/specs/add-cow.txt
>>> @@ -0,0 +1,45 @@
>>> +== General ==
>>> +
>>> +Raw file format does not support backing_file and copy on write feature.
>>> Then
>>> +you can use add-cow file realize these features.
>>> +
>>> +When using add-cow, procedures may like this:
>>> +(ubuntu.img is a disk image which has been installed OS.)
>>> +    1)  Create a raw image with the same size of ubuntu.img
>>> +            qemu-img create -f raw test.raw 8G
>>> +    2)  Create a add-cow image which will store dirty bitmap
>>> +            qemu-img create -f add-cow test.add-cow -o
>>> backing_file=ubuntu.img,image_file=test.raw
>>> +    3)  Run qemu with add-cow image
>>> +            qemu -drive if=virtio,file=test.add-cow
>>> +
>>> +=Specification=
>>> +
>>> +The file format looks like this:
>>> +
>>> + +----------+----------+----------+-----+
>>> + |  Header  |   Data   |   Data   | ... |
>>> + +----------+----------+----------+-----+
>>> +
>>> + All numbers in add-cow are stored in Big Endian byte order.
>>> +
>>> +
>>> +== Header ==
>>> +
>>> +The Header is included in the first bytes:
>>> +
>>> +    Byte  0 -  7:       magic
>>> +                        add-cow magic string ("ADD_COW\xff")
>>> +
>>> +          8 -  11:      version
>>> +                        Version number (only valid value is 1 now)
>>> +
>>> +          12 - 1035:    backing_file
>>> +                        backing_file file name related to add-cow file.
>>> While
>>> +                        using backing_file, must together with
>>> image_file.
>>> +
>>> +         1036 - 2059:   image_file
>>> +                        image_file is a raw file, While using image_file,
>>> must
>>> +                        together with image_file.
>>> +
>>> +         2060 - 2067:   size
>>> +                        Virtual disk size of image_file in bytes.
>>
>>
>>
>
>
Stefan Hajnoczi - Oct. 27, 2011, 6:59 a.m.
On Wed, Oct 26, 2011 at 06:08:03PM +0800, Robert Wang wrote:
> Please find version 4 in the attachment.

Please send patches inline instead of attaching them.  It makes it
easier to reply with feedback.

If you want to add extra comments that should not be included in the git
commit you can add it below the '---' line in the patch.

Stefan
Robert Wang - Oct. 27, 2011, 8:54 a.m.
Re-sent it, thanks.

2011/10/27 Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>:
> On Wed, Oct 26, 2011 at 06:08:03PM +0800, Robert Wang wrote:
>> Please find version 4 in the attachment.
>
> Please send patches inline instead of attaching them.  It makes it
> easier to reply with feedback.
>
> If you want to add extra comments that should not be included in the git
> commit you can add it below the '---' line in the patch.
>
> Stefan
>
Dong xu Wang - Oct. 27, 2011, 8:56 a.m.
I re-sent the patch and use bdrv_co_flush now, thanks.

2011/10/27 shu ming <shuming@linux.vnet.ibm.com>:
> Looks good to me.  A nit, it seems that bdrv_flush is not supported anymore
> in upstream.
> bdrv_co_flush should be used instead if you update your workspace to latest
> one.
> On 2011-10-26 18:08, Robert Wang wrote:
>>
>> Please find version 4 in the attachment.
>>
>> 2011/10/23 shu ming<shuming@linux.vnet.ibm.com>:
>>>
>>> On 2011-10-13 0:23, Dong Xu Wang wrote:
>>>>
>>>> Add add-cow file format
>>>>
>>>> Signed-off-by: Dong Xu Wang<wdongxu@linux.vnet.ibm.com>
>>>> ---
>>>>  Makefile.objs          |    1 +
>>>>  block.c                |    2 +-
>>>>  block.h                |    1 +
>>>>  block/add-cow.c        |  412
>>>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  block_int.h            |    1 +
>>>>  docs/specs/add-cow.txt |   45 ++++++
>>>>  6 files changed, 461 insertions(+), 1 deletions(-)
>>>>  create mode 100644 block/add-cow.c
>>>>  create mode 100644 docs/specs/add-cow.txt
>>>>
>>>> diff --git a/Makefile.objs b/Makefile.objs
>>>> index c849e51..624c04c 100644
>>>> --- a/Makefile.objs
>>>> +++ b/Makefile.objs
>>>> @@ -31,6 +31,7 @@ block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>>>>
>>>>  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o
>>>> vpc.o vvfat.o
>>>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o
>>>> qcow2-snapshot.o qcow2-cache.o
>>>> +block-nested-y += add-cow.o
>>>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o
>>>> qed-cluster.o
>>>>  block-nested-y += qed-check.o
>>>>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>>>> diff --git a/block.c b/block.c
>>>> index e865fab..c25241d 100644
>>>> --- a/block.c
>>>> +++ b/block.c
>>>> @@ -106,7 +106,7 @@ int is_windows_drive(const char *filename)
>>>>  #endif
>>>>
>>>>  /* check if the path starts with "<protocol>:" */
>>>> -static int path_has_protocol(const char *path)
>>>> +int path_has_protocol(const char *path)
>>>>  {
>>>>  #ifdef _WIN32
>>>>      if (is_windows_drive(path) ||
>>>> diff --git a/block.h b/block.h
>>>> index 16bfa0a..8b09f12 100644
>>>> --- a/block.h
>>>> +++ b/block.h
>>>> @@ -256,6 +256,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size,
>>>> QEMUSnapshotInfo *sn);
>>>>
>>>>  char *get_human_readable_size(char *buf, int buf_size, int64_t size);
>>>>  int path_is_absolute(const char *path);
>>>> +int path_has_protocol(const char *path);
>>>>  void path_combine(char *dest, int dest_size,
>>>>                    const char *base_path,
>>>>                    const char *filename);
>>>> diff --git a/block/add-cow.c b/block/add-cow.c
>>>> new file mode 100644
>>>> index 0000000..d2538a2
>>>> --- /dev/null
>>>> +++ b/block/add-cow.c
>>>> @@ -0,0 +1,412 @@
>>>> +#include "qemu-common.h"
>>>> +#include "block_int.h"
>>>> +#include "module.h"
>>>> +
>>>> +#define ADD_COW_MAGIC  (((uint64_t)'A'<<    56) | ((uint64_t)'D'<<
>>>>  48) | \
>>>> +                        ((uint64_t)'D'<<    40) | ((uint64_t)'_'<<
>>>>  32) | \
>>>> +                        ((uint64_t)'C'<<    24) | ((uint64_t)'O'<<
>>>>  16) | \
>>>> +                        ((uint64_t)'W'<<    8) | 0xFF)
>>>> +#define ADD_COW_VERSION 1
>>>> +
>>>> +typedef struct AddCowHeader {
>>>> +    uint64_t magic;
>>>> +    uint32_t version;
>>>> +    char backing_file[1024];
>>>> +    char image_file[1024];
>>>
>>> 1024 is a magic number for me.  Can we have a meaningful macro?
>>>
>>>
>>>> +    uint64_t size;
>>>> +} QEMU_PACKED AddCowHeader;
>>>> +
>>>> +typedef struct BDRVAddCowState {
>>>> +    char image_file[1024];
>>>> +    BlockDriverState *image_hd;
>>>> +    uint8_t *bitmap;
>>>> +    uint64_t bitmap_size;
>>>> +} BDRVAddCowState;
>>>> +
>>>> +static int add_cow_probe(const uint8_t *buf, int buf_size, const char
>>>> *filename)
>>>> +{
>>>> +    const AddCowHeader *header = (const void *)buf;
>>>> +
>>>> +    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC&&
>>>> +        be32_to_cpu(header->version) == ADD_COW_VERSION) {
>>>> +        return 100;
>>>> +    } else {
>>>> +        return 0;
>>>> +    }
>>>> +}
>>>> +
>>>> +static int add_cow_open(BlockDriverState *bs, int flags)
>>>> +{
>>>> +    AddCowHeader header;
>>>> +    int64_t size;
>>>> +    char image_filename[1024];
>>>> +    int image_flags;
>>>> +    BlockDriver *image_drv = NULL;
>>>> +    int ret;
>>>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>>>> +
>>>> +    ret = bdrv_pread(bs->file, 0,&header, sizeof(header));
>>>> +    if (ret != sizeof(header)) {
>>>> +        goto fail;
>>>> +    }
>>>> +
>>>> +    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
>>>> +        be32_to_cpu(header.version) != ADD_COW_VERSION) {
>>>> +        ret = -1;
>>>> +        goto fail;
>>>> +    }
>>>> +
>>>> +    size = be64_to_cpu(header.size);
>>>> +    bs->total_sectors = size / BDRV_SECTOR_SIZE;
>>>> +
>>>> +    QEMU_BUILD_BUG_ON(sizeof(state->image_file) !=
>>>> sizeof(header.image_file));
>>>> +    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
>>>> +            header.backing_file);
>>>> +    pstrcpy(state->image_file, sizeof(state->image_file),
>>>> +            header.image_file);
>>>> +
>>>> +    state->bitmap_size = ((bs->total_sectors + 7)>>    3);
>>>> +    state->bitmap = g_malloc0(state->bitmap_size);
>>>> +
>>>> +    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
>>>> +            state->bitmap_size);
>>>> +    if (ret != state->bitmap_size) {
>>>> +        goto fail;
>>>> +    }
>>>> +   /* If there is a image_file, must be together with backing_file */
>>>> +    if (state->image_file[0] != '\0') {
>>>> +        state->image_hd = bdrv_new("");
>>>> +        /* Relative to image or working dir, need discussion */
>>>> +        if (path_has_protocol(state->image_file)) {
>>>> +            pstrcpy(image_filename, sizeof(image_filename),
>>>> +                    state->image_file);
>>>> +        } else {
>>>> +            path_combine(image_filename, sizeof(image_filename),
>>>> +                         bs->filename, state->image_file);
>>>> +        }
>>>> +
>>>> +        image_drv = bdrv_find_format("raw");
>>>> +        image_flags =
>>>> +             (flags&    (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) |
>>>> BDRV_O_RDWR;
>>>> +        state->image_hd->keep_read_only = 0;
>>>> +
>>>> +        ret = bdrv_open(state->image_hd, image_filename, image_flags,
>>>> +                image_drv);
>>>> +        if (ret<    0) {
>>>> +            goto fail;
>>>> +        }
>>>> +    } else {
>>>> +        ret = -ENOENT;
>>>> +        goto fail;
>>>> +    }
>>>> +    return 0;
>>>> + fail:
>>>> +    if (state->bitmap) {
>>>> +        g_free(state->bitmap);
>>>> +        state->bitmap = NULL;
>>>> +    }
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static inline void add_cow_set_bit(BlockDriverState *bs, int64_t
>>>> bitnum)
>>>> +{
>>>> +    uint64_t offset = bitnum / 8;
>>>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>>>> +    state->bitmap[offset] |= (1<<    (bitnum % 8));
>>>> +}
>>>> +
>>>> +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
>>>> +{
>>>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>>>> +    uint64_t offset = bitnum / 8;
>>>> +    return !!(state->bitmap[offset]&    (1<<    (bitnum % 8)));
>>>> +}
>>>> +
>>>> +static int add_cow_is_allocated(BlockDriverState *bs, int64_t
>>>> sector_num,
>>>> +        int nb_sectors, int *num_same)
>>>> +{
>>>> +    int changed;
>>>> +    uint64_t bitmap_size = ((BDRVAddCowState
>>>> *)(bs->opaque))->bitmap_size;
>>>> +
>>>> +    /* Beyond the end of bitmap, return error or read from
>>>> backing_file?
>>>> */
>>>> +    if (((sector_num + nb_sectors + 7) / 8)>    bitmap_size) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    if (nb_sectors == 0) {
>>>> +        *num_same = nb_sectors;
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    changed = is_bit_set(bs, sector_num);
>>>> +    if (changed<    0) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    for (*num_same = 1; *num_same<    nb_sectors; (*num_same)++) {
>>>> +        if (is_bit_set(bs, sector_num + *num_same) != changed) {
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return changed;
>>>> +}
>>>> +
>>>> +static int add_cow_update_bitmap(BlockDriverState *bs, int64_t
>>>> sector_num,
>>>> +        int nb_sectors)
>>>> +{
>>>> +    int i, ret = 0;
>>>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>>>> +    uint64_t start_pos = sector_num / 8;
>>>> +    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
>>>> +
>>>> +    if (start_pos>    state->bitmap_size) {
>>>> +        return -1;
>>>> +    }
>>>> +
>>>> +    for (i = 0; i<    nb_sectors; i++) {
>>>> +        add_cow_set_bit(bs, sector_num + i);
>>>> +    }
>>>> +    ret = bdrv_pwrite_sync(bs->file,
>>>> +        sizeof(AddCowHeader) + start_pos,
>>>> +        state->bitmap + start_pos,
>>>> +        MIN(((end_pos - start_pos)&    (~512)) + 512,
>>>> state->bitmap_size -
>>>> start_pos));
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static void add_cow_close(BlockDriverState *bs)
>>>> +{
>>>> +    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
>>>> +    if (state->bitmap) {
>>>> +        g_free(state->bitmap);
>>>> +        state->bitmap = NULL;
>>>> +    }
>>>> +}
>>>> +
>>>> +static int add_cow_create(const char *filename, QEMUOptionParameter
>>>> *options)
>>>> +{
>>>> +    AddCowHeader header;
>>>> +    int64_t image_sectors = 0;
>>>> +    const char *backing_filename = NULL;
>>>> +    const char *image_filename = NULL;
>>>> +    int ret;
>>>> +    BlockDriverState *bs, *image_bs = NULL;
>>>> +
>>>> +    while (options&&    options->name) {
>>>> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
>>>> +            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
>>>> +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
>>>> +            backing_filename = options->value.s;
>>>> +        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
>>>> +            image_filename = options->value.s;
>>>> +        }
>>>> +        options++;
>>>> +    }
>>>> +
>>>> +    if (!backing_filename || !image_filename) {
>>>> +        return -EINVAL;
>>>> +    }
>>>> +    /* Make sure image file exists */
>>>> +    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
>>>> +            | BDRV_O_CACHE_WB);
>>>> +    if (ret<    0) {
>>>> +        return ret;
>>>> +    }
>>>> +    bdrv_close(image_bs);
>>>> +    ret = bdrv_create_file(filename, NULL);
>>>> +    if (ret<    0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
>>>> +    if (ret<    0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    memset(&header, 0, sizeof(header));
>>>> +    header.magic = cpu_to_be64(ADD_COW_MAGIC);
>>>> +    header.version = cpu_to_be32(ADD_COW_VERSION);
>>>> +    pstrcpy(header.backing_file, sizeof(header.backing_file),
>>>> backing_filename);
>>>> +    pstrcpy(header.image_file, sizeof(header.image_file),
>>>> image_filename);
>>>> +    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
>>>> +
>>>> +    ret = bdrv_pwrite(bs, 0,&header, sizeof(header));
>>>> +    if (ret<    0) {
>>>> +        bdrv_close(bs);
>>>> +        return ret;
>>>> +    }
>>>> +    bdrv_close(bs);
>>>> +
>>>> +    BlockDriver *drv = bdrv_find_format("add-cow");
>>>> +    assert(drv != NULL);
>>>> +    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
>>>> +    if (ret<    0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
>>>> +fail:
>>>
>>> "fail" label is not used in this function.
>>>
>>>
>>>> +    bdrv_close(bs);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
>>>> +                         int remaining_sectors, QEMUIOVector *qiov)
>>>> +{
>>>> +    BDRVAddCowState *s = bs->opaque;
>>>> +    int cur_nr_sectors;
>>>> +    uint64_t bytes_done = 0;
>>>> +    QEMUIOVector hd_qiov;
>>>> +    int n, ret = 0;
>>>> +
>>>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>>>> +    while (remaining_sectors != 0) {
>>>> +        cur_nr_sectors = remaining_sectors;
>>>> +        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors,&n)) {
>>>> +            cur_nr_sectors = n;
>>>> +            qemu_iovec_reset(&hd_qiov);
>>>> +            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>>>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>>>> +            ret = bdrv_co_readv(s->image_hd, sector_num, n,&hd_qiov);
>>>> +            if (ret<    0) {
>>>> +                goto fail;
>>>> +            }
>>>> +        } else {
>>>> +            cur_nr_sectors = n;
>>>> +            if (bs->backing_hd) {
>>>> +                qemu_iovec_reset(&hd_qiov);
>>>> +                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
>>>> +                            cur_nr_sectors * BDRV_SECTOR_SIZE);
>>>> +                ret = bdrv_co_readv(bs->backing_hd, sector_num,
>>>> +                                    n,&hd_qiov);
>>>> +                if (ret<    0) {
>>>> +                    goto fail;
>>>> +                }
>>>> +            } else {
>>>> +                qemu_iovec_reset(&hd_qiov);
>>>> +                qemu_iovec_memset(&hd_qiov, 0,
>>>> +                    BDRV_SECTOR_SIZE * cur_nr_sectors);
>>>> +            }
>>>> +        }
>>>> +        remaining_sectors -= cur_nr_sectors;
>>>> +        sector_num += cur_nr_sectors;
>>>> +        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
>>>> +    }
>>>> +fail:
>>>> +    qemu_iovec_destroy(&hd_qiov);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
>>>> +                          int remaining_sectors, QEMUIOVector *qiov)
>>>> +{
>>>> +    BDRVAddCowState *s = bs->opaque;
>>>> +    int ret = 0;
>>>> +    QEMUIOVector hd_qiov;
>>>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>>>> +
>>>
>>> It looks like that the memory where "bs" is pointing to should be
>>> protected
>>> by a lock. Because "bs"
>>> was passed to several BlockDriver entries like ".bdrv_co_writev',
>>> 'bdrv_co_readv'&etc which can be executed
>>> in parallel to access the memory where "bs" is pointing to.
>>>
>>>> +    qemu_iovec_reset(&hd_qiov);
>>>> +    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors *
>>>> BDRV_SECTOR_SIZE);
>>>> +    ret = bdrv_co_writev(s->image_hd,
>>>> +                     sector_num,
>>>> +                     remaining_sectors,&hd_qiov);
>>>> +    if (ret<    0) {
>>>> +        goto fail;
>>>> +    }
>>>> +
>>>> +    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
>>>> +    if (ret<    0) {
>>>> +        goto fail;
>>>> +    }
>>>> +fail:
>>>> +    qemu_iovec_destroy(&hd_qiov);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
>>>> +{
>>>> +    int ret = 0;
>>>> +    int64_t image_sectors = offset/BDRV_SECTOR_SIZE;
>>>> +    int64_t be_offset = cpu_to_be64(offset);
>>>> +    BDRVAddCowState *state = bs->opaque;
>>>> +    ret = bdrv_truncate(bs->file, ((image_sectors + 7)>>    3)
>>>> +        + sizeof(AddCowHeader));
>>>> +    if (ret<    0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
>>>> +&be_offset, sizeof(uint64_t));
>>>> +    if (ret<    0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    ret = bdrv_truncate(state->image_hd, offset);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static QEMUOptionParameter add_cow_create_options[] = {
>>>> +    {
>>>> +        .name = BLOCK_OPT_SIZE,
>>>> +        .type = OPT_SIZE,
>>>> +        .help = "Virtual disk size"
>>>> +    },
>>>> +    {
>>>> +        .name = BLOCK_OPT_BACKING_FILE,
>>>> +        .type = OPT_STRING,
>>>> +        .help = "File name of a base image"
>>>> +    },
>>>> +    {
>>>> +        .name = BLOCK_OPT_IMAGE_FILE,
>>>> +        .type = OPT_STRING,
>>>> +        .help = "File name of a image file"
>>>> +    },
>>>> +    { NULL }
>>>> +};
>>>> +
>>>> +static int add_cow_flush(BlockDriverState *bs)
>>>> +{
>>>> +    BDRVAddCowState *state = bs->opaque;
>>>> +    int ret = bdrv_flush(state->image_hd);
>>>> +    if (ret<    0) {
>>>> +        return ret;
>>>> +    }
>>>> +    return bdrv_flush(bs->file);
>>>> +}
>>>> +
>>>> +static BlockDriverAIOCB *add_cow_aio_flush(BlockDriverState *bs,
>>>> +        BlockDriverCompletionFunc *cb, void *opaque)
>>>> +{
>>>> +    BDRVAddCowState *state = bs->opaque;
>>>> +    int ret = bdrv_flush(state->image_hd);
>>>> +    if (ret<    0) {
>>>> +        return NULL;
>>>> +    }
>>>> +
>>>> +    return bdrv_aio_flush(bs->file, cb, opaque);
>>>> +}
>>>> +
>>>> +static BlockDriver bdrv_add_cow = {
>>>> +    .format_name        = "add-cow",
>>>> +    .instance_size      = sizeof(BDRVAddCowState),
>>>> +    .bdrv_probe         = add_cow_probe,
>>>> +    .bdrv_open          = add_cow_open,
>>>> +    .bdrv_close         = add_cow_close,
>>>> +    .bdrv_create        = add_cow_create,
>>>> +    .bdrv_is_allocated  = add_cow_is_allocated,
>>>> +
>>>> +    .bdrv_co_readv      = add_cow_co_readv,
>>>> +    .bdrv_co_writev     = add_cow_co_writev,
>>>> +    .bdrv_truncate      = bdrv_add_cow_truncate,
>>>> +
>>>> +    .create_options     = add_cow_create_options,
>>>> +    .bdrv_flush         = add_cow_flush,
>>>> +    .bdrv_aio_flush     = add_cow_aio_flush,
>>>> +};
>>>> +
>>>> +static void bdrv_add_cow_init(void)
>>>> +{
>>>> +    bdrv_register(&bdrv_add_cow);
>>>> +}
>>>> +
>>>> +block_init(bdrv_add_cow_init);
>>>> diff --git a/block_int.h b/block_int.h
>>>> index 8c3b863..2b26fdc 100644
>>>> --- a/block_int.h
>>>> +++ b/block_int.h
>>>> @@ -42,6 +42,7 @@
>>>>  #define BLOCK_OPT_TABLE_SIZE    "table_size"
>>>>  #define BLOCK_OPT_PREALLOC      "preallocation"
>>>>  #define BLOCK_OPT_SUBFMT        "subformat"
>>>> +#define BLOCK_OPT_IMAGE_FILE    "image_file"
>>>>
>>>>  typedef struct AIOPool {
>>>>      void (*cancel)(BlockDriverAIOCB *acb);
>>>> diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
>>>> new file mode 100644
>>>> index 0000000..0443c6c
>>>> --- /dev/null
>>>> +++ b/docs/specs/add-cow.txt
>>>> @@ -0,0 +1,45 @@
>>>> +== General ==
>>>> +
>>>> +Raw file format does not support backing_file and copy on write
>>>> feature.
>>>> Then
>>>> +you can use add-cow file realize these features.
>>>> +
>>>> +When using add-cow, procedures may like this:
>>>> +(ubuntu.img is a disk image which has been installed OS.)
>>>> +    1)  Create a raw image with the same size of ubuntu.img
>>>> +            qemu-img create -f raw test.raw 8G
>>>> +    2)  Create a add-cow image which will store dirty bitmap
>>>> +            qemu-img create -f add-cow test.add-cow -o
>>>> backing_file=ubuntu.img,image_file=test.raw
>>>> +    3)  Run qemu with add-cow image
>>>> +            qemu -drive if=virtio,file=test.add-cow
>>>> +
>>>> +=Specification=
>>>> +
>>>> +The file format looks like this:
>>>> +
>>>> + +----------+----------+----------+-----+
>>>> + |  Header  |   Data   |   Data   | ... |
>>>> + +----------+----------+----------+-----+
>>>> +
>>>> + All numbers in add-cow are stored in Big Endian byte order.
>>>> +
>>>> +
>>>> +== Header ==
>>>> +
>>>> +The Header is included in the first bytes:
>>>> +
>>>> +    Byte  0 -  7:       magic
>>>> +                        add-cow magic string ("ADD_COW\xff")
>>>> +
>>>> +          8 -  11:      version
>>>> +                        Version number (only valid value is 1 now)
>>>> +
>>>> +          12 - 1035:    backing_file
>>>> +                        backing_file file name related to add-cow file.
>>>> While
>>>> +                        using backing_file, must together with
>>>> image_file.
>>>> +
>>>> +         1036 - 2059:   image_file
>>>> +                        image_file is a raw file, While using
>>>> image_file,
>>>> must
>>>> +                        together with image_file.
>>>> +
>>>> +         2060 - 2067:   size
>>>> +                        Virtual disk size of image_file in bytes.
>>>
>>>
>>>
>>
>>
>
>
>

Patch

From 2e91b1e7fbb5be88942d86309dfdf0af602cc88c Mon Sep 17 00:00:00 2001
From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
Date: Wed, 26 Oct 2011 17:41:53 +0800
Subject: [PATCH v4] block:add_cow file format

add-cow file format.

Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
---
 Makefile.objs          |    1 +
 block.c                |    2 +-
 block.h                |    1 +
 block/add-cow.c        |  417 ++++++++++++++++++++++++++++++++++++++++++++++++
 block_int.h            |    1 +
 docs/specs/add-cow.txt |   45 +++++
 6 files changed, 466 insertions(+), 1 deletions(-)
 create mode 100644 block/add-cow.c
 create mode 100644 docs/specs/add-cow.txt

diff --git a/Makefile.objs b/Makefile.objs
index 01587c8..208c12c 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -31,6 +31,7 @@  block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-nested-y += add-cow.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
diff --git a/block.c b/block.c
index 9873b57..d9470ed 100644
--- a/block.c
+++ b/block.c
@@ -110,7 +110,7 @@  int is_windows_drive(const char *filename)
 #endif
 
 /* check if the path starts with "<protocol>:" */
-static int path_has_protocol(const char *path)
+int path_has_protocol(const char *path)
 {
 #ifdef _WIN32
     if (is_windows_drive(path) ||
diff --git a/block.h b/block.h
index e77988e..a5a4f3a 100644
--- a/block.h
+++ b/block.h
@@ -266,6 +266,7 @@  char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn);
 
 char *get_human_readable_size(char *buf, int buf_size, int64_t size);
 int path_is_absolute(const char *path);
+int path_has_protocol(const char *path);
 void path_combine(char *dest, int dest_size,
                   const char *base_path,
                   const char *filename);
diff --git a/block/add-cow.c b/block/add-cow.c
new file mode 100644
index 0000000..51b0f5a
--- /dev/null
+++ b/block/add-cow.c
@@ -0,0 +1,417 @@ 
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+#define ADD_COW_MAGIC  (((uint64_t)'A' << 56) | ((uint64_t)'D' << 48) | \
+                        ((uint64_t)'D' << 40) | ((uint64_t)'_' << 32) | \
+                        ((uint64_t)'C' << 24) | ((uint64_t)'O' << 16) | \
+                        ((uint64_t)'W' << 8) | 0xFF)
+#define ADD_COW_VERSION 1
+#define ADD_COW_FILE_LEN 1024
+
+typedef struct AddCowHeader {
+    uint64_t magic;
+    uint32_t version;
+    char backing_file[ADD_COW_FILE_LEN];
+    char image_file[ADD_COW_FILE_LEN];
+    uint64_t size;
+} QEMU_PACKED AddCowHeader;
+
+typedef struct BDRVAddCowState {
+    char image_file[ADD_COW_FILE_LEN];
+    BlockDriverState *image_hd;
+    uint8_t *bitmap;
+    uint64_t bitmap_size;
+    CoMutex lock;
+} BDRVAddCowState;
+
+static int add_cow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const AddCowHeader *header = (const void *)buf;
+
+    if (be64_to_cpu(header->magic) == ADD_COW_MAGIC &&
+        be32_to_cpu(header->version) == ADD_COW_VERSION) {
+        return 100;
+    } else {
+        return 0;
+    }
+}
+
+static int add_cow_open(BlockDriverState *bs, int flags)
+{
+    AddCowHeader header;
+    int64_t size;
+    char image_filename[ADD_COW_FILE_LEN];
+    int image_flags;
+    BlockDriver *image_drv = NULL;
+    int ret;
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+
+    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
+    if (ret != sizeof(header)) {
+        goto fail;
+    }
+
+    if (be64_to_cpu(header.magic) != ADD_COW_MAGIC ||
+        be32_to_cpu(header.version) != ADD_COW_VERSION) {
+        ret = -1;
+        goto fail;
+    }
+
+    size = be64_to_cpu(header.size);
+    bs->total_sectors = size / BDRV_SECTOR_SIZE;
+
+    QEMU_BUILD_BUG_ON(sizeof(state->image_file) != sizeof(header.image_file));
+    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
+            header.backing_file);
+    pstrcpy(state->image_file, sizeof(state->image_file),
+            header.image_file);
+
+    state->bitmap_size = ((bs->total_sectors + 7) >> 3);
+    state->bitmap = g_malloc0(state->bitmap_size);
+
+    ret = bdrv_pread(bs->file, sizeof(header), state->bitmap,
+            state->bitmap_size);
+    if (ret != state->bitmap_size) {
+        goto fail;
+    }
+   /* If there is a image_file, must be together with backing_file */
+    if (state->image_file[0] != '\0') {
+        state->image_hd = bdrv_new("");
+        /* Relative to image or working dir, need discussion */
+        if (path_has_protocol(state->image_file)) {
+            pstrcpy(image_filename, sizeof(image_filename),
+                    state->image_file);
+        } else {
+            path_combine(image_filename, sizeof(image_filename),
+                         bs->filename, state->image_file);
+        }
+
+        image_drv = bdrv_find_format("raw");
+        image_flags =
+             (flags & (~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING))) | BDRV_O_RDWR;
+        state->image_hd->keep_read_only = 0;
+
+        ret = bdrv_open(state->image_hd, image_filename, image_flags,
+                image_drv);
+        if (ret < 0) {
+            goto fail;
+        }
+    } else {
+        ret = -ENOENT;
+        goto fail;
+    }
+    qemu_co_mutex_init(&state->lock);
+    return 0;
+ fail:
+    if (state->bitmap) {
+        g_free(state->bitmap);
+        state->bitmap = NULL;
+    }
+    return ret;
+}
+
+static inline void add_cow_set_bit(BlockDriverState *bs, int64_t bitnum)
+{
+    uint64_t offset = bitnum / 8;
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    state->bitmap[offset] |= (1 << (bitnum % 8));
+}
+
+static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
+{
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    uint64_t offset = bitnum / 8;
+    return !!(state->bitmap[offset] & (1 << (bitnum % 8)));
+}
+
+static int add_cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+        int nb_sectors, int *num_same)
+{
+    int changed;
+    uint64_t bitmap_size = ((BDRVAddCowState *)(bs->opaque))->bitmap_size;
+
+    /* Beyond the end of bitmap, return error or read from backing_file? */
+    if (((sector_num + nb_sectors + 7) / 8) > bitmap_size) {
+        return 0;
+    }
+
+    if (nb_sectors == 0) {
+        *num_same = nb_sectors;
+        return 0;
+    }
+
+    changed = is_bit_set(bs, sector_num);
+    if (changed < 0) {
+        return 0;
+    }
+
+    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
+        if (is_bit_set(bs, sector_num + *num_same) != changed) {
+            break;
+        }
+    }
+
+    return changed;
+}
+
+static int add_cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
+        int nb_sectors)
+{
+    int i, ret = 0;
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    uint64_t start_pos = sector_num / 8;
+    uint64_t end_pos = (sector_num + nb_sectors - 1) / 8;
+
+    if (start_pos > state->bitmap_size) {
+        return -1;
+    }
+
+    for (i = 0; i < nb_sectors; i++) {
+        add_cow_set_bit(bs, sector_num + i);
+    }
+    ret = bdrv_pwrite_sync(bs->file,
+        sizeof(AddCowHeader) + start_pos,
+        state->bitmap + start_pos,
+        MIN(((end_pos - start_pos) & (~512)) + 512, state->bitmap_size - start_pos));
+    return ret;
+}
+
+static void add_cow_close(BlockDriverState *bs)
+{
+    BDRVAddCowState *state = (BDRVAddCowState *)(bs->opaque);
+    if (state->bitmap) {
+        g_free(state->bitmap);
+        state->bitmap = NULL;
+    }
+}
+
+static int add_cow_create(const char *filename, QEMUOptionParameter *options)
+{
+    AddCowHeader header;
+    int64_t image_sectors = 0;
+    const char *backing_filename = NULL;
+    const char *image_filename = NULL;
+    int ret;
+    BlockDriverState *bs, *image_bs = NULL;
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            image_sectors = options->value.n / BDRV_SECTOR_SIZE;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_filename = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_IMAGE_FILE)) {
+            image_filename = options->value.s;
+        }
+        options++;
+    }
+
+    if (!backing_filename || !image_filename) {
+        return -EINVAL;
+    }
+    /* Make sure image file exists */
+    ret = bdrv_file_open(&image_bs, image_filename, BDRV_O_RDWR
+            | BDRV_O_CACHE_WB);
+    if (ret < 0) {
+        return ret;
+    }
+    bdrv_close(image_bs);
+    ret = bdrv_create_file(filename, NULL);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
+    if (ret < 0) {
+        return ret;
+    }
+
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be64(ADD_COW_MAGIC);
+    header.version = cpu_to_be32(ADD_COW_VERSION);
+    pstrcpy(header.backing_file, sizeof(header.backing_file), backing_filename);
+    pstrcpy(header.image_file, sizeof(header.image_file), image_filename);
+    header.size = cpu_to_be64(image_sectors * BDRV_SECTOR_SIZE);
+
+    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
+    if (ret < 0) {
+        bdrv_close(bs);
+        return ret;
+    }
+    bdrv_close(bs);
+
+    BlockDriver *drv = bdrv_find_format("add-cow");
+    assert(drv != NULL);
+    ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_truncate(bs, image_sectors * BDRV_SECTOR_SIZE);
+    bdrv_close(bs);
+    return ret;
+}
+
+static int add_cow_co_readv(BlockDriverState *bs, int64_t sector_num,
+                         int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int cur_nr_sectors;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+    int n, ret = 0;
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    while (remaining_sectors != 0) {
+        cur_nr_sectors = remaining_sectors;
+        if (add_cow_is_allocated(bs, sector_num, cur_nr_sectors, &n)) {
+            cur_nr_sectors = n;
+            qemu_iovec_reset(&hd_qiov);
+            qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+            ret = bdrv_co_readv(s->image_hd, sector_num, n, &hd_qiov);
+            if (ret < 0) {
+                goto fail;
+            }
+        } else {
+            cur_nr_sectors = n;
+            if (bs->backing_hd) {
+                qemu_iovec_reset(&hd_qiov);
+                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+                            cur_nr_sectors * BDRV_SECTOR_SIZE);
+                ret = bdrv_co_readv(bs->backing_hd, sector_num,
+                                    n, &hd_qiov);
+                if (ret < 0) {
+                    goto fail;
+                }
+            } else {
+                qemu_iovec_reset(&hd_qiov);
+                qemu_iovec_memset(&hd_qiov, 0,
+                    BDRV_SECTOR_SIZE * cur_nr_sectors);
+            }
+        }
+        remaining_sectors -= cur_nr_sectors;
+        sector_num += cur_nr_sectors;
+        bytes_done += cur_nr_sectors * BDRV_SECTOR_SIZE;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static int add_cow_co_writev(BlockDriverState *bs, int64_t sector_num,
+                          int remaining_sectors, QEMUIOVector *qiov)
+{
+    BDRVAddCowState *s = bs->opaque;
+    int ret = 0;
+    QEMUIOVector hd_qiov;
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+    qemu_co_mutex_lock(&s->lock);
+    qemu_iovec_reset(&hd_qiov);
+    qemu_iovec_copy(&hd_qiov, qiov, 0, remaining_sectors * BDRV_SECTOR_SIZE);
+    ret = bdrv_co_writev(s->image_hd,
+                     sector_num,
+                     remaining_sectors, &hd_qiov);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = add_cow_update_bitmap(bs, sector_num, remaining_sectors);
+    if (ret < 0) {
+        goto fail;
+    }
+fail:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
+static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t offset)
+{
+    int ret = 0;
+    int64_t image_sectors = offset/BDRV_SECTOR_SIZE;
+    int64_t be_offset = cpu_to_be64(offset);
+    BDRVAddCowState *state = bs->opaque;
+    ret = bdrv_truncate(bs->file, ((image_sectors + 7) >> 3)
+        + sizeof(AddCowHeader));
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_pwrite_sync(bs->file, offsetof(AddCowHeader, size),
+        &be_offset, sizeof(uint64_t));
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_truncate(state->image_hd, offset);
+    return ret;
+}
+
+static QEMUOptionParameter add_cow_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    },
+    {
+        .name = BLOCK_OPT_IMAGE_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a image file"
+    },
+    { NULL }
+};
+
+static int add_cow_flush(BlockDriverState *bs)
+{
+    BDRVAddCowState *state = bs->opaque;
+    int ret = bdrv_flush(state->image_hd);
+    if (ret < 0) {
+        return ret;
+    }
+    return bdrv_flush(bs->file);
+}
+
+static BlockDriverAIOCB *add_cow_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVAddCowState *state = bs->opaque;
+    int ret = bdrv_flush(state->image_hd);
+    if (ret < 0) {
+        return NULL;
+    }
+
+    return bdrv_aio_flush(bs->file, cb, opaque);
+}
+
+static BlockDriver bdrv_add_cow = {
+    .format_name        = "add-cow",
+    .instance_size      = sizeof(BDRVAddCowState),
+    .bdrv_probe         = add_cow_probe,
+    .bdrv_open          = add_cow_open,
+    .bdrv_close         = add_cow_close,
+    .bdrv_create        = add_cow_create,
+    .bdrv_is_allocated  = add_cow_is_allocated,
+
+    .bdrv_co_readv      = add_cow_co_readv,
+    .bdrv_co_writev     = add_cow_co_writev,
+    .bdrv_truncate      = bdrv_add_cow_truncate,
+
+    .create_options     = add_cow_create_options,
+    .bdrv_flush         = add_cow_flush,
+    .bdrv_aio_flush     = add_cow_aio_flush,
+};
+
+static void bdrv_add_cow_init(void)
+{
+    bdrv_register(&bdrv_add_cow);
+}
+
+block_init(bdrv_add_cow_init);
diff --git a/block_int.h b/block_int.h
index f2f4f2d..b581fe3 100644
--- a/block_int.h
+++ b/block_int.h
@@ -42,6 +42,7 @@ 
 #define BLOCK_OPT_TABLE_SIZE    "table_size"
 #define BLOCK_OPT_PREALLOC      "preallocation"
 #define BLOCK_OPT_SUBFMT        "subformat"
+#define BLOCK_OPT_IMAGE_FILE    "image_file"
 
 typedef struct AIOPool {
     void (*cancel)(BlockDriverAIOCB *acb);
diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
new file mode 100644
index 0000000..0443c6c
--- /dev/null
+++ b/docs/specs/add-cow.txt
@@ -0,0 +1,45 @@ 
+== General ==
+
+Raw file format does not support backing_file and copy on write feature. Then
+you can use add-cow file realize these features.
+
+When using add-cow, procedures may like this:
+(ubuntu.img is a disk image which has been installed OS.)
+    1)  Create a raw image with the same size of ubuntu.img
+            qemu-img create -f raw test.raw 8G
+    2)  Create a add-cow image which will store dirty bitmap
+            qemu-img create -f add-cow test.add-cow -o backing_file=ubuntu.img,image_file=test.raw
+    3)  Run qemu with add-cow image
+            qemu -drive if=virtio,file=test.add-cow
+
+=Specification=
+
+The file format looks like this:
+
+ +----------+----------+----------+-----+
+ |  Header  |   Data   |   Data   | ... |
+ +----------+----------+----------+-----+
+
+ All numbers in add-cow are stored in Big Endian byte order.
+
+
+== Header ==
+
+The Header is included in the first bytes:
+
+    Byte  0 -  7:       magic
+                        add-cow magic string ("ADD_COW\xff")
+
+          8 -  11:      version
+                        Version number (only valid value is 1 now)
+
+          12 - 1035:    backing_file
+                        backing_file file name related to add-cow file. While
+                        using backing_file, must together with image_file.
+
+         1036 - 2059:   image_file
+                        image_file is a raw file, While using image_file, must
+                        together with image_file.
+
+         2060 - 2067:   size
+                        Virtual disk size of image_file in bytes.
-- 
1.7.5.4