Patchwork [2/7] Add blkmirror block driver

login
register
mail settings
Submitter Marcelo Tosatti
Date June 6, 2011, 4:55 p.m.
Message ID <20110606165823.588925767@amt.cnet>
Download mbox | patch
Permalink /patch/99046/
State New
Headers show

Comments

Marcelo Tosatti - June 6, 2011, 4:55 p.m.
Mirrored writes are used by live block copy.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
malc - June 6, 2011, 9:52 p.m.
On Mon, 6 Jun 2011, Marcelo Tosatti wrote:

> Mirrored writes are used by live block copy.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Index: qemu-block-copy/block/blkmirror.c
> ===================================================================
> --- /dev/null
> +++ qemu-block-copy/block/blkmirror.c
> @@ -0,0 +1,277 @@
> +/*
> + * Block driver for mirrored writes.
> + *
> + * Copyright (C) 2011 Red Hat, Inc.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include <stdarg.h>
> +#include "block_int.h"
> +
> +typedef struct {
> +    BlockDriverState *bs[2];
> +} BdrvMirrorState;
> +
> +typedef struct DupAIOCB DupAIOCB;
> +
> +typedef struct SingleAIOCB {
> +    BlockDriverAIOCB *aiocb;
> +    int finished;
> +    DupAIOCB *parent;
> +} SingleAIOCB;
> +
> +struct DupAIOCB {
> +    BlockDriverAIOCB common;
> +    int count;
> +
> +    BlockDriverCompletionFunc *cb;
> +    SingleAIOCB aios[2];
> +    int ret;
> +};
> +
> +/* Valid blkmirror filenames look like
> + * blkmirror:path/to/image1:path/to/image2 */
> +static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    int ret, escape, i, n;
> +    char *raw;
> +
> +    /* Parse the blkmirror: prefix */
> +    if (strncmp(filename, "blkmirror:", strlen("blkmirror:"))) {
> +        return -EINVAL;
> +    }
> +    filename += strlen("blkmirror:");
> +
> +    /* Parse the raw image filename */
> +    raw = malloc(strlen(filename));
> +    escape = 0;
> +    for (i = n = 0; i < strlen(filename); i++) {
> +        if (!escape && filename[i] == ':') {
> +            break;
> +        }
> +        if (!escape && filename[i] == '\\') {
> +            escape = 1;
> +        } else {
> +            escape = 0;
> +        }
> +
> +        if (!escape) {
> +            raw[n++] = filename[i];
> +        }
> +    }
> +    raw[n] = '\0';

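This potentially writes past the memory allocated for raw:
malloc(strlen(filename)) leaves no room for the terminating '\0', so
when filename contains neither an unescaped ':' nor a backslash the
loop copies every character and raw[n] lands one byte past the end of
the buffer.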

> +
> +    m->bs[0] = bdrv_new("");
> +    if (m->bs[0] == NULL) {
> +        return -ENOMEM;
> +    }
> +    ret = bdrv_open(m->bs[0], raw, flags, NULL);
> +    free(raw);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    filename += i + 1;
> +
> +    m->bs[1] = bdrv_new("");
> +    if (m->bs[1] == NULL) {
> +        return -ENOMEM;
> +    }
> +    ret = bdrv_open(m->bs[1], filename, flags, NULL);
> +    if (ret < 0) {
> +        bdrv_delete(m->bs[0]);
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +static void blkmirror_close(BlockDriverState *bs)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    int i;
> +
> +    for (i = 0; i < 2; i++) {
> +        bdrv_delete(m->bs[i]);
> +        m->bs[i] = NULL;
> +    }
> +}
> +
> +static int blkmirror_flush(BlockDriverState *bs)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +
> +    bdrv_flush(m->bs[0]);
> +    bdrv_flush(m->bs[1]);
> +
> +    return 0;
> +}
> +
> +static int64_t blkmirror_getlength(BlockDriverState *bs)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +
> +    return bdrv_getlength(m->bs[0]);
> +}
> +
> +static BlockDriverAIOCB *blkmirror_aio_readv(BlockDriverState *bs,
> +                                             int64_t sector_num,
> +                                             QEMUIOVector *qiov,
> +                                             int nb_sectors,
> +                                             BlockDriverCompletionFunc *cb,
> +                                             void *opaque)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    return bdrv_aio_readv(m->bs[0], sector_num, qiov, nb_sectors, cb, opaque);
> +}
> +
> +static void dup_aio_cancel(BlockDriverAIOCB *blockacb)
> +{
> +    DupAIOCB *acb = container_of(blockacb, DupAIOCB, common);
> +    int i;
> +
> +    for (i = 0 ; i < 2; i++) {
> +        if (!acb->aios[i].finished) {
> +            bdrv_aio_cancel(acb->aios[i].aiocb);
> +        }
> +    }
> +    qemu_aio_release(acb);
> +}
> +
> +static AIOPool dup_aio_pool = {
> +    .aiocb_size         = sizeof(DupAIOCB),
> +    .cancel             = dup_aio_cancel,
> +};
> +
> +static void blkmirror_aio_cb(void *opaque, int ret)
> +{
> +    SingleAIOCB *scb = opaque;
> +    DupAIOCB *dcb = scb->parent;
> +
> +    scb->finished = 1;
> +    dcb->count--;
> +    assert(dcb->count >= 0);
> +    if (ret < 0) {
> +        dcb->ret = ret;
> +    }
> +    if (dcb->count == 0) {
> +        dcb->common.cb(dcb->common.opaque, dcb->ret);
> +        qemu_aio_release(dcb);
> +    }
> +}
> +
> +static DupAIOCB *dup_aio_get(BlockDriverState *bs,
> +                             BlockDriverCompletionFunc *cb,
> +                             void *opaque)
> +{
> +    DupAIOCB *dcb;
> +    int i;
> +
> +    dcb = qemu_aio_get(&dup_aio_pool, bs, cb, opaque);
> +    if (!dcb) {
> +        return NULL;
> +    }
> +    dcb->count = 2;
> +    for (i = 0; i < 2; i++) {
> +        dcb->aios[i].parent = dcb;
> +        dcb->aios[i].finished = 0;
> +    }
> +    dcb->ret = 0;
> +
> +    return dcb;
> +}
> +
> +static BlockDriverAIOCB *blkmirror_aio_writev(BlockDriverState *bs,
> +                                              int64_t sector_num,
> +                                              QEMUIOVector *qiov,
> +                                              int nb_sectors,
> +                                              BlockDriverCompletionFunc *cb,
> +                                              void *opaque)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
> +    int i;
> +
> +    for (i = 0; i < 2; i++) {
> +        dcb->aios[i].aiocb = bdrv_aio_writev(m->bs[i], sector_num, qiov,
> +                                             nb_sectors, &blkmirror_aio_cb,
> +                                             &dcb->aios[i]);
> +        if (!dcb->aios[i].aiocb) {
> +            int a;
> +
> +            for (a = 0; a < i; a++) {
> +                bdrv_aio_cancel(dcb->aios[i].aiocb);
> +            }
> +            qemu_aio_release(dcb);
> +            return NULL;
> +        }
> +    }
> +
> +    return &dcb->common;
> +}
> +
> +static BlockDriverAIOCB *blkmirror_aio_flush(BlockDriverState *bs,
> +                                             BlockDriverCompletionFunc *cb,
> +                                             void *opaque)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
> +    int i;
> +
> +    for (i = 0; i < 2; i++) {
> +        dcb->aios[i].aiocb = bdrv_aio_flush(m->bs[i], &blkmirror_aio_cb,
> +                                            &dcb->aios[i]);
> +        if (!dcb->aios[i].aiocb) {
> +            int a;
> +
> +            for (a = 0; a < i; a++) {
> +                bdrv_aio_cancel(dcb->aios[i].aiocb);
> +            }
> +            qemu_aio_release(dcb);
> +            return NULL;
> +        }
> +    }
> +
> +    return &dcb->common;
> +}
> +
> +static int blkmirror_discard(BlockDriverState *bs, int64_t sector_num,
> +                             int nb_sectors)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    int ret;
> +
> +    ret = bdrv_discard(m->bs[0], sector_num, nb_sectors);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return bdrv_discard(m->bs[1], sector_num, nb_sectors);
> +}
> +
> +
> +static BlockDriver bdrv_blkmirror = {
> +    .format_name        = "blkmirror",
> +    .protocol_name      = "blkmirror",
> +    .instance_size      = sizeof(BdrvMirrorState),
> +
> +    .bdrv_getlength     = blkmirror_getlength,
> +
> +    .bdrv_file_open     = blkmirror_open,
> +    .bdrv_close         = blkmirror_close,
> +    .bdrv_flush         = blkmirror_flush,
> +    .bdrv_discard       = blkmirror_discard,
> +
> +    .bdrv_aio_readv     = blkmirror_aio_readv,
> +    .bdrv_aio_writev    = blkmirror_aio_writev,
> +    .bdrv_aio_flush     = blkmirror_aio_flush,
> +};
> +
> +static void bdrv_blkmirror_init(void)
> +{
> +    bdrv_register(&bdrv_blkmirror);
> +}
> +
> +block_init(bdrv_blkmirror_init);
> Index: qemu-block-copy/Makefile.objs
> ===================================================================
> --- qemu-block-copy.orig/Makefile.objs
> +++ qemu-block-copy/Makefile.objs
> @@ -22,7 +22,7 @@ block-nested-y += raw.o cow.o qcow.o vdi
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-nested-y += qed-check.o
> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blkmirror.o
>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>  block-nested-$(CONFIG_CURL) += curl.o
> Index: qemu-block-copy/docs/blkmirror.txt
> ===================================================================
> --- /dev/null
> +++ qemu-block-copy/docs/blkmirror.txt
> @@ -0,0 +1,15 @@
> +Block mirror driver
> +-------------------
> +
> +This driver will mirror writes to two distinct images.
> +Its used internally by live block copy.
> +
> +Format
> +------
> +
> +blkmirror:/image1.img:/image2.img
> +
> +'\' (backslash) can be used to escape colon processing
> +as a separator.
> +
> +
> 
> 
>
Stefan Hajnoczi - June 7, 2011, 10:25 a.m.
On Mon, Jun 6, 2011 at 5:55 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> +/* Valid blkmirror filenames look like
> + * blkmirror:path/to/image1:path/to/image2 */
> +static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    int ret, escape, i, n;
> +    char *raw;
> +
> +    /* Parse the blkmirror: prefix */
> +    if (strncmp(filename, "blkmirror:", strlen("blkmirror:"))) {
> +        return -EINVAL;
> +    }
> +    filename += strlen("blkmirror:");
> +
> +    /* Parse the raw image filename */
> +    raw = malloc(strlen(filename));

Please use qemu_malloc()/qemu_strdup()/qemu_free() instead of the
system library versions.

I'm guilty of this in blkverify :(.

> +    escape = 0;
> +    for (i = n = 0; i < strlen(filename); i++) {
> +        if (!escape && filename[i] == ':') {
> +            break;
> +        }
> +        if (!escape && filename[i] == '\\') {
> +            escape = 1;
> +        } else {
> +            escape = 0;
> +        }
> +
> +        if (!escape) {
> +            raw[n++] = filename[i];
> +        }
> +    }
> +    raw[n] = '\0';
> +
> +    m->bs[0] = bdrv_new("");
> +    if (m->bs[0] == NULL) {
> +        return -ENOMEM;

raw is leaked.

> +    }
> +    ret = bdrv_open(m->bs[0], raw, flags, NULL);

This isn't necessarily a "raw" file.  filename0 and filename1 would be
clearer IMO.

> +    free(raw);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    filename += i + 1;

Please document that escaping only takes effect in filename0.  After
the second ':' you may no longer escape.

For sanity perhaps the whole string should be unescaped.

> +
> +    m->bs[1] = bdrv_new("");
> +    if (m->bs[1] == NULL) {

bs[0] is leaked.

> +        return -ENOMEM;
> +    }
> +    ret = bdrv_open(m->bs[1], filename, flags, NULL);
> +    if (ret < 0) {
> +        bdrv_delete(m->bs[0]);
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +static void blkmirror_close(BlockDriverState *bs)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +    int i;
> +
> +    for (i = 0; i < 2; i++) {
> +        bdrv_delete(m->bs[i]);
> +        m->bs[i] = NULL;
> +    }
> +}
> +
> +static int blkmirror_flush(BlockDriverState *bs)
> +{
> +    BdrvMirrorState *m = bs->opaque;
> +
> +    bdrv_flush(m->bs[0]);
> +    bdrv_flush(m->bs[1]);

Return values should be checked.

Stefan

Patch

Index: qemu-block-copy/block/blkmirror.c
===================================================================
--- /dev/null
+++ qemu-block-copy/block/blkmirror.c
@@ -0,0 +1,277 @@ 
+/*
+ * Block driver for mirrored writes.
+ *
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <stdarg.h>
+#include "block_int.h"
+
+typedef struct {
+    BlockDriverState *bs[2];
+} BdrvMirrorState;
+
+typedef struct DupAIOCB DupAIOCB;
+
+typedef struct SingleAIOCB {
+    BlockDriverAIOCB *aiocb;
+    int finished;
+    DupAIOCB *parent;
+} SingleAIOCB;
+
+struct DupAIOCB {
+    BlockDriverAIOCB common;
+    int count;
+
+    BlockDriverCompletionFunc *cb;
+    SingleAIOCB aios[2];
+    int ret;
+};
+
+/* Valid blkmirror filenames look like
+ * blkmirror:path/to/image1:path/to/image2 */
+static int blkmirror_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BdrvMirrorState *m = bs->opaque;
+    int ret, escape, i, n;
+    char *raw;
+
+    /* Parse the blkmirror: prefix */
+    if (strncmp(filename, "blkmirror:", strlen("blkmirror:"))) {
+        return -EINVAL;
+    }
+    filename += strlen("blkmirror:");
+
+    /* Parse the raw image filename */
+    raw = malloc(strlen(filename));
+    escape = 0;
+    for (i = n = 0; i < strlen(filename); i++) {
+        if (!escape && filename[i] == ':') {
+            break;
+        }
+        if (!escape && filename[i] == '\\') {
+            escape = 1;
+        } else {
+            escape = 0;
+        }
+
+        if (!escape) {
+            raw[n++] = filename[i];
+        }
+    }
+    raw[n] = '\0';
+
+    m->bs[0] = bdrv_new("");
+    if (m->bs[0] == NULL) {
+        return -ENOMEM;
+    }
+    ret = bdrv_open(m->bs[0], raw, flags, NULL);
+    free(raw);
+    if (ret < 0) {
+        return ret;
+    }
+    filename += i + 1;
+
+    m->bs[1] = bdrv_new("");
+    if (m->bs[1] == NULL) {
+        return -ENOMEM;
+    }
+    ret = bdrv_open(m->bs[1], filename, flags, NULL);
+    if (ret < 0) {
+        bdrv_delete(m->bs[0]);
+        return ret;
+    }
+
+    return 0;
+}
+
+static void blkmirror_close(BlockDriverState *bs)
+{
+    BdrvMirrorState *m = bs->opaque;
+    int i;
+
+    for (i = 0; i < 2; i++) {
+        bdrv_delete(m->bs[i]);
+        m->bs[i] = NULL;
+    }
+}
+
+static int blkmirror_flush(BlockDriverState *bs)
+{
+    BdrvMirrorState *m = bs->opaque;
+
+    bdrv_flush(m->bs[0]);
+    bdrv_flush(m->bs[1]);
+
+    return 0;
+}
+
+static int64_t blkmirror_getlength(BlockDriverState *bs)
+{
+    BdrvMirrorState *m = bs->opaque;
+
+    return bdrv_getlength(m->bs[0]);
+}
+
+static BlockDriverAIOCB *blkmirror_aio_readv(BlockDriverState *bs,
+                                             int64_t sector_num,
+                                             QEMUIOVector *qiov,
+                                             int nb_sectors,
+                                             BlockDriverCompletionFunc *cb,
+                                             void *opaque)
+{
+    BdrvMirrorState *m = bs->opaque;
+    return bdrv_aio_readv(m->bs[0], sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+static void dup_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    DupAIOCB *acb = container_of(blockacb, DupAIOCB, common);
+    int i;
+
+    for (i = 0 ; i < 2; i++) {
+        if (!acb->aios[i].finished) {
+            bdrv_aio_cancel(acb->aios[i].aiocb);
+        }
+    }
+    qemu_aio_release(acb);
+}
+
+static AIOPool dup_aio_pool = {
+    .aiocb_size         = sizeof(DupAIOCB),
+    .cancel             = dup_aio_cancel,
+};
+
+static void blkmirror_aio_cb(void *opaque, int ret)
+{
+    SingleAIOCB *scb = opaque;
+    DupAIOCB *dcb = scb->parent;
+
+    scb->finished = 1;
+    dcb->count--;
+    assert(dcb->count >= 0);
+    if (ret < 0) {
+        dcb->ret = ret;
+    }
+    if (dcb->count == 0) {
+        dcb->common.cb(dcb->common.opaque, dcb->ret);
+        qemu_aio_release(dcb);
+    }
+}
+
+static DupAIOCB *dup_aio_get(BlockDriverState *bs,
+                             BlockDriverCompletionFunc *cb,
+                             void *opaque)
+{
+    DupAIOCB *dcb;
+    int i;
+
+    dcb = qemu_aio_get(&dup_aio_pool, bs, cb, opaque);
+    if (!dcb) {
+        return NULL;
+    }
+    dcb->count = 2;
+    for (i = 0; i < 2; i++) {
+        dcb->aios[i].parent = dcb;
+        dcb->aios[i].finished = 0;
+    }
+    dcb->ret = 0;
+
+    return dcb;
+}
+
+static BlockDriverAIOCB *blkmirror_aio_writev(BlockDriverState *bs,
+                                              int64_t sector_num,
+                                              QEMUIOVector *qiov,
+                                              int nb_sectors,
+                                              BlockDriverCompletionFunc *cb,
+                                              void *opaque)
+{
+    BdrvMirrorState *m = bs->opaque;
+    DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+    int i;
+
+    for (i = 0; i < 2; i++) {
+        dcb->aios[i].aiocb = bdrv_aio_writev(m->bs[i], sector_num, qiov,
+                                             nb_sectors, &blkmirror_aio_cb,
+                                             &dcb->aios[i]);
+        if (!dcb->aios[i].aiocb) {
+            int a;
+
+            for (a = 0; a < i; a++) {
+                bdrv_aio_cancel(dcb->aios[i].aiocb);
+            }
+            qemu_aio_release(dcb);
+            return NULL;
+        }
+    }
+
+    return &dcb->common;
+}
+
+static BlockDriverAIOCB *blkmirror_aio_flush(BlockDriverState *bs,
+                                             BlockDriverCompletionFunc *cb,
+                                             void *opaque)
+{
+    BdrvMirrorState *m = bs->opaque;
+    DupAIOCB *dcb = dup_aio_get(bs, cb, opaque);
+    int i;
+
+    for (i = 0; i < 2; i++) {
+        dcb->aios[i].aiocb = bdrv_aio_flush(m->bs[i], &blkmirror_aio_cb,
+                                            &dcb->aios[i]);
+        if (!dcb->aios[i].aiocb) {
+            int a;
+
+            for (a = 0; a < i; a++) {
+                bdrv_aio_cancel(dcb->aios[i].aiocb);
+            }
+            qemu_aio_release(dcb);
+            return NULL;
+        }
+    }
+
+    return &dcb->common;
+}
+
+static int blkmirror_discard(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors)
+{
+    BdrvMirrorState *m = bs->opaque;
+    int ret;
+
+    ret = bdrv_discard(m->bs[0], sector_num, nb_sectors);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return bdrv_discard(m->bs[1], sector_num, nb_sectors);
+}
+
+
+static BlockDriver bdrv_blkmirror = {
+    .format_name        = "blkmirror",
+    .protocol_name      = "blkmirror",
+    .instance_size      = sizeof(BdrvMirrorState),
+
+    .bdrv_getlength     = blkmirror_getlength,
+
+    .bdrv_file_open     = blkmirror_open,
+    .bdrv_close         = blkmirror_close,
+    .bdrv_flush         = blkmirror_flush,
+    .bdrv_discard       = blkmirror_discard,
+
+    .bdrv_aio_readv     = blkmirror_aio_readv,
+    .bdrv_aio_writev    = blkmirror_aio_writev,
+    .bdrv_aio_flush     = blkmirror_aio_flush,
+};
+
+static void bdrv_blkmirror_init(void)
+{
+    bdrv_register(&bdrv_blkmirror);
+}
+
+block_init(bdrv_blkmirror_init);
Index: qemu-block-copy/Makefile.objs
===================================================================
--- qemu-block-copy.orig/Makefile.objs
+++ qemu-block-copy/Makefile.objs
@@ -22,7 +22,7 @@  block-nested-y += raw.o cow.o qcow.o vdi
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blkmirror.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
Index: qemu-block-copy/docs/blkmirror.txt
===================================================================
--- /dev/null
+++ qemu-block-copy/docs/blkmirror.txt
@@ -0,0 +1,15 @@ 
+Block mirror driver
+-------------------
+
+This driver will mirror writes to two distinct images.
+It's used internally by live block copy.
+
+Format
+------
+
+blkmirror:/image1.img:/image2.img
+
+A '\' (backslash) escapes the following ':' so that it is not treated
+as the separator; escaping is only processed in the first filename.
+
+