Patchwork [2/5] add basic backup support to block driver

login
register
mail settings
Submitter Dietmar Maurer
Date Nov. 21, 2012, 9:01 a.m.
Message ID <1353488464-82756-2-git-send-email-dietmar@proxmox.com>
Download mbox | patch
Permalink /patch/200614/
State New
Headers show

Comments

Dietmar Maurer - Nov. 21, 2012, 9:01 a.m.
Function bdrv_backup_init() creates a block job to backup a block device.

We call brdv_co_backup_cow() for each write during backup. That function
reads the original data and pass it to backup_dump_cb().

The tracked_request infrastructure is used to serialize access.

Currently backup cluster size is hardcoded to 65536 bytes.

Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
---
 Makefile.objs |    1 +
 backup.c      |  296 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 block.c       |   60 +++++++++++-
 block.h       |    6 +
 block_int.h   |   28 ++++++
 5 files changed, 388 insertions(+), 3 deletions(-)
 create mode 100644 backup.c
Stefan Hajnoczi - Nov. 22, 2012, 11:25 a.m.
On Wed, Nov 21, 2012 at 10:01:01AM +0100, Dietmar Maurer wrote:
Two comments from skimming the code, not a full review.

> +/* #define DEBUG_BACKUP */
> +
> +#ifdef DEBUG_BACKUP
> +#define DPRINTF(fmt, ...) \
> +    do { printf("backup: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> +    do { } while (0)
> +#endif

Please don't #define debug macros like this.  It hides build breakage
because the debug code is #ifdef'd out.

Either use docs/tracing.txt or do the following so the code always gets
parsed by the compiler:

#define DEBUG_BACKUP 0
#define DPRINTF(fmt, ...) \
    do { \
        if (DEBUG_BACKUP) { \
            printf("backup: " fmt, ## __VA_ARGS__); \
	} \
    } while (0)

> +BackupInfo *
> +bdrv_backup_init(BlockDriverState *bs, BackupDumpFunc *backup_dump_cb,
> +                 BlockDriverCompletionFunc *backup_complete_cb,
> +                 void *opaque)
> +{
> +    assert(bs);
> +    assert(backup_dump_cb);
> +    assert(backup_complete_cb);
> +
> +    if (bs->backup_info) {
> +        DPRINTF("bdrv_backup_init already initialized %s\n",
> +                bdrv_get_device_name(bs));
> +        return NULL;
> +    }
> +
> +    BackupInfo *info = g_malloc0(sizeof(BackupInfo));
> +    int64_t bitmap_size;
> +    const char *devname = bdrv_get_device_name(bs);
> +
> +    if (!devname || !devname[0]) {
> +        return NULL;
> +    }
> +
> +    DPRINTF("bdrv_backup_init %s\n", bdrv_get_device_name(bs));
> +
> +    bitmap_size = bs->total_sectors +
> +        BACKUP_BLOCKS_PER_CLUSTER * BITS_PER_LONG - 1;
> +    bitmap_size /= BACKUP_BLOCKS_PER_CLUSTER * BITS_PER_LONG;
> +
> +    info->backup_dump_cb = backup_dump_cb;
> +    info->backup_complete_cb = backup_complete_cb;
> +    info->opaque = opaque;
> +    info->bitmap_size = bitmap_size;
> +    info->bitmap = g_new0(unsigned long, bitmap_size);
> +
> +    Error *errp;
> +    BackupBlockJob *job = block_job_create(&backup_job_type, bs, 0,
> +                                           backup_job_cleanup_cb, bs, &errp);

Is there a 1:1 relationship between BackupInfo and BackupBlockJob?  Then
it would be nicer to move BlockupInfo fields into BackupBlockJob (which
is empty right now).  Then you also don't need to add fields to
BlockDriverState because you know that if your blockjob is running you
can access it via bs->job, if necessary.  You can then drop BackupInfo
and any memory management code for it.
Dietmar Maurer - Nov. 22, 2012, 11:29 a.m.
> Is there a 1:1 relationship between BackupInfo and BackupBlockJob?  Then it
> would be nicer to move BlockupInfo fields into BackupBlockJob (which is
> empty right now

No, a backup create several block jobs - one for each blockdev you want to backup. Those
jobs run in parallel.
Dietmar Maurer - Nov. 23, 2012, 8:56 a.m.
> Is there a 1:1 relationship between BackupInfo and BackupBlockJob?  Then it
> would be nicer to move BlockupInfo fields into BackupBlockJob (which is
> empty right now).  Then you also don't need to add fields to BlockDriverState
> because you know that if your blockjob is running you can access it via bs-
> >job, if necessary.  You can then drop BackupInfo and any memory
> management code for it.

Oh, finally got your point.

There is an ongoing discussion about generic block filters, So I guess this totally 
depends on how we implement them.


- Dietmar

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 3c7abca..cb46be5 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -48,6 +48,7 @@  coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o
 block-obj-y = iov.o cache-utils.o qemu-option.o module.o async.o
 block-obj-y += nbd.o block.o blockjob.o aes.o qemu-config.o
 block-obj-y += thread-pool.o qemu-progress.o qemu-sockets.o uri.o notify.o
+block-obj-y += backup.o
 block-obj-y += $(coroutine-obj-y) $(qobject-obj-y) $(version-obj-y)
 block-obj-$(CONFIG_POSIX) += event_notifier-posix.o aio-posix.o
 block-obj-$(CONFIG_WIN32) += event_notifier-win32.o aio-win32.o
diff --git a/backup.c b/backup.c
new file mode 100644
index 0000000..f8c21f8
--- /dev/null
+++ b/backup.c
@@ -0,0 +1,296 @@ 
+/*
+ * QEMU backup
+ *
+ * Copyright (C) Proxmox Server Solutions
+ *
+ * Authors:
+ *  Dietmar Maurer (dietmar@proxmox.com)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "block.h"
+#include "block_int.h"
+#include "blockjob.h"
+
+/* #define DEBUG_BACKUP */
+
+#ifdef DEBUG_BACKUP
+#define DPRINTF(fmt, ...) \
+    do { printf("backup: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+
+#define BITS_PER_LONG  (sizeof(unsigned long) * 8)
+
+static int backup_get_bitmap(BlockDriverState *bs, int64_t cluster_num)
+{
+    assert(bs);
+    assert(bs->backup_info);
+    assert(bs->backup_info->bitmap);
+
+    unsigned long val, idx, bit;
+
+    idx = cluster_num / BITS_PER_LONG;
+
+    assert(bs->backup_info->bitmap_size > idx);
+
+    bit = cluster_num % BITS_PER_LONG;
+    val = bs->backup_info->bitmap[idx];
+
+    return !!(val & (1UL << bit));
+}
+
+static void backup_set_bitmap(BlockDriverState *bs, int64_t cluster_num,
+                              int dirty)
+{
+    assert(bs);
+    assert(bs->backup_info);
+    assert(bs->backup_info->bitmap);
+
+    unsigned long val, idx, bit;
+
+    idx = cluster_num / BITS_PER_LONG;
+
+    assert(bs->backup_info->bitmap_size > idx);
+
+    bit = cluster_num % BITS_PER_LONG;
+    val = bs->backup_info->bitmap[idx];
+    if (dirty) {
+        if (!(val & (1UL << bit))) {
+            val |= 1UL << bit;
+        }
+    } else {
+        if (val & (1UL << bit)) {
+            val &= ~(1UL << bit);
+        }
+    }
+    bs->backup_info->bitmap[idx] = val;
+}
+
+static int backup_in_progress_count;
+
+int coroutine_fn brdv_co_backup_cow(BlockDriverState *bs,
+                                     int64_t sector_num, int nb_sectors)
+{
+    assert(bs);
+
+    BackupInfo *info = bs->backup_info;
+    BlockDriver *drv = bs->drv;
+    struct iovec iov;
+    QEMUIOVector bounce_qiov;
+    void *bounce_buffer = NULL;
+    int ret = 0;
+
+    assert(info);
+
+    backup_in_progress_count++;
+
+    int64_t start, end;
+
+    start = sector_num / BACKUP_BLOCKS_PER_CLUSTER;
+    end = (sector_num + nb_sectors + BACKUP_BLOCKS_PER_CLUSTER - 1) /
+        BACKUP_BLOCKS_PER_CLUSTER;
+
+    DPRINTF("brdv_co_backup_cow enter %s C%zd %zd %d\n",
+            bdrv_get_device_name(bs), start, sector_num, nb_sectors);
+
+    for (; start < end; start++) {
+        if (backup_get_bitmap(bs, start)) {
+            DPRINTF("brdv_co_backup_cow skip C%zd\n", start);
+            continue; /* already copied */
+        }
+
+        /* immediately set bitmap (avoid coroutine race) */
+        backup_set_bitmap(bs, start, 1);
+
+        DPRINTF("brdv_co_backup_cow C%zd\n", start);
+
+        if (!bounce_buffer) {
+            iov.iov_len = BACKUP_CLUSTER_SIZE;
+            iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+            qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+        }
+
+        ret = drv->bdrv_co_readv(bs, start * BACKUP_BLOCKS_PER_CLUSTER,
+                                 BACKUP_BLOCKS_PER_CLUSTER,
+                                 &bounce_qiov);
+        if (ret < 0) {
+            DPRINTF("brdv_co_backup_cow bdrv_read C%zd failed\n", start);
+            goto out;
+        }
+
+        ret = info->backup_dump_cb(info->opaque, bs, start, bounce_buffer);
+        if (ret < 0) {
+            DPRINTF("brdv_co_backup_cow dump_cluster_cb C%zd failed\n", start);
+            goto out;
+        }
+
+        DPRINTF("brdv_co_backup_cow done C%zd\n", start);
+    }
+
+out:
+    if (bounce_buffer) {
+        qemu_vfree(bounce_buffer);
+    }
+
+    backup_in_progress_count--;
+
+    return ret;
+}
+
+void bdrv_backup_deinit(BlockDriverState *bs)
+{
+    BackupInfo *info = bs->backup_info;
+
+    if (!info) {
+        return;
+    }
+
+    DPRINTF("backup_deinit %s\n", bdrv_get_device_name(bs));
+
+    g_free(info->bitmap);
+
+    bs->backup_info = NULL;
+}
+
+typedef struct BackupBlockJob {
+    BlockJob common;
+} BackupBlockJob;
+
+static BlockJobType backup_job_type = {
+    .instance_size = sizeof(BackupBlockJob),
+    .job_type      = "backup",
+};
+
+static void coroutine_fn backup_run(void *opaque)
+{
+    BackupBlockJob *job = opaque;
+    BlockDriverState *bs = job->common.bs;
+    assert(bs);
+    assert(bs->backup_info);
+
+    int64_t start, end;
+
+    start = 0;
+    end = (bs->total_sectors + BACKUP_BLOCKS_PER_CLUSTER - 1) /
+        BACKUP_BLOCKS_PER_CLUSTER;
+
+    DPRINTF("backup_run start %s %zd %zd\n", bdrv_get_device_name(bs),
+            start, end);
+
+    int ret = 0;
+
+    for (; start < end; start++) {
+        if (block_job_is_cancelled(&job->common)) {
+            ret = -1;
+            break;
+        }
+
+        if (backup_get_bitmap(bs, start)) {
+            continue; /* already copied */
+        }
+
+        /* we need to yield so that qemu_aio_flush() returns.
+         * (without, VM does not reboot)
+         * todo: can we avoid that?
+         */
+        co_sleep_ns(rt_clock, 0);
+        if (block_job_is_cancelled(&job->common)) {
+            ret = -1;
+            break;
+        }
+        DPRINTF("backup_run loop C%zd\n", start);
+
+        /**
+         * This triggers a cluster copy
+         * Note: avoid direct call to brdv_co_backup_cow, because
+         * this does not call tracked_request_begin()
+         */
+        ret = bdrv_co_backup(bs, start*BACKUP_BLOCKS_PER_CLUSTER, 1);
+        if (ret < 0) {
+            break;
+        }
+        /* Publish progress */
+        job->common.offset += BACKUP_CLUSTER_SIZE;
+    }
+
+    while (backup_in_progress_count > 0) {
+        DPRINTF("backup_run backup_in_progress_count != 0 (%d)",
+                backup_in_progress_count);
+        co_sleep_ns(rt_clock, 10000);
+    }
+
+    DPRINTF("backup_run complete %d\n", ret);
+    block_job_completed(&job->common, ret);
+}
+
+static void backup_job_cleanup_cb(void *opaque, int ret)
+{
+    BlockDriverState *bs = opaque;
+
+    DPRINTF("backup_job_cleanup_cb start %d\n", ret);
+
+    bs->backup_info->backup_complete_cb(bs->backup_info->opaque, ret);
+
+    DPRINTF("backup_job_cleanup_cb end\n");
+
+    bdrv_backup_deinit(bs);
+}
+
+BackupInfo *
+bdrv_backup_init(BlockDriverState *bs, BackupDumpFunc *backup_dump_cb,
+                 BlockDriverCompletionFunc *backup_complete_cb,
+                 void *opaque)
+{
+    assert(bs);
+    assert(backup_dump_cb);
+    assert(backup_complete_cb);
+
+    if (bs->backup_info) {
+        DPRINTF("bdrv_backup_init already initialized %s\n",
+                bdrv_get_device_name(bs));
+        return NULL;
+    }
+
+    BackupInfo *info = g_malloc0(sizeof(BackupInfo));
+    int64_t bitmap_size;
+    const char *devname = bdrv_get_device_name(bs);
+
+    if (!devname || !devname[0]) {
+        return NULL;
+    }
+
+    DPRINTF("bdrv_backup_init %s\n", bdrv_get_device_name(bs));
+
+    bitmap_size = bs->total_sectors +
+        BACKUP_BLOCKS_PER_CLUSTER * BITS_PER_LONG - 1;
+    bitmap_size /= BACKUP_BLOCKS_PER_CLUSTER * BITS_PER_LONG;
+
+    info->backup_dump_cb = backup_dump_cb;
+    info->backup_complete_cb = backup_complete_cb;
+    info->opaque = opaque;
+    info->bitmap_size = bitmap_size;
+    info->bitmap = g_new0(unsigned long, bitmap_size);
+
+    Error *errp;
+    BackupBlockJob *job = block_job_create(&backup_job_type, bs, 0,
+                                           backup_job_cleanup_cb, bs, &errp);
+
+    bs->backup_info = info;
+
+    job->common.len = bs->total_sectors*BDRV_SECTOR_SIZE;
+    job->common.co = qemu_coroutine_create(backup_run);
+    qemu_coroutine_enter(job->common.co, job);
+
+    return info;
+}
diff --git a/block.c b/block.c
index 854ebd6..48e9b68 100644
--- a/block.c
+++ b/block.c
@@ -54,6 +54,7 @@ 
 typedef enum {
     BDRV_REQ_COPY_ON_READ = 0x1,
     BDRV_REQ_ZERO_WRITE   = 0x2,
+    BDRV_REQ_BACKUP_ONLY  = 0x4,
 } BdrvRequestFlags;
 
 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
@@ -1132,6 +1133,11 @@  void bdrv_close(BlockDriverState *bs)
         block_job_cancel_sync(bs->job);
     }
     bdrv_drain_all();
+
+    if (bs->backup_info) {
+        bdrv_backup_deinit(bs);
+    }
+
     notifier_list_notify(&bs->close_notifiers, bs);
 
     if (bs->drv) {
@@ -1541,7 +1547,7 @@  int bdrv_commit(BlockDriverState *bs)
 
     if (!drv)
         return -ENOMEDIUM;
-    
+
     if (!bs->backing_hd) {
         return -ENOTSUP;
     }
@@ -1678,6 +1684,20 @@  static void round_to_clusters(BlockDriverState *bs,
     }
 }
 
+/**
+ * Round a region to backup cluster boundaries
+ */
+static void round_to_backup_clusters(BlockDriverState *bs,
+                                     int64_t sector_num, int nb_sectors,
+                                     int64_t *cluster_sector_num,
+                                     int *cluster_nb_sectors)
+{
+    int64_t c = BACKUP_BLOCKS_PER_CLUSTER;
+    *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
+    *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
+                                        nb_sectors, c);
+}
+
 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                      int64_t sector_num, int nb_sectors) {
     /*        aaaa   bbbb */
@@ -1708,6 +1728,11 @@  static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
     round_to_clusters(bs, sector_num, nb_sectors,
                       &cluster_sector_num, &cluster_nb_sectors);
 
+    if (bs->backup_info) {
+        round_to_backup_clusters(bs, sector_num, nb_sectors,
+                                 &cluster_sector_num, &cluster_nb_sectors);
+    }
+
     do {
         retry = false;
         QLIST_FOREACH(req, &bs->tracked_requests, list) {
@@ -2277,12 +2302,22 @@  static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         bs->copy_on_read_in_flight++;
     }
 
-    if (bs->copy_on_read_in_flight) {
+    if (bs->copy_on_read_in_flight || bs->backup_info) {
         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
     }
 
     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
 
+    if (flags & BDRV_REQ_BACKUP_ONLY) {
+        /* Note: We do not return any data to the caller */
+        if (bs->backup_info) {
+            ret = brdv_co_backup_cow(bs, sector_num, nb_sectors);
+        } else {
+            ret = -1;
+        }
+        goto out;
+    }
+
     if (flags & BDRV_REQ_COPY_ON_READ) {
         int pnum;
 
@@ -2326,6 +2361,17 @@  int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
                             BDRV_REQ_COPY_ON_READ);
 }
 
+int coroutine_fn bdrv_co_backup(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors)
+{
+    if (!bs->backup_info) {
+        return -ENOTSUP;
+    }
+
+    return bdrv_co_do_readv(bs, sector_num, nb_sectors, NULL,
+                            BDRV_REQ_BACKUP_ONLY);
+}
+
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors)
 {
@@ -2383,12 +2429,19 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         bdrv_io_limits_intercept(bs, true, nb_sectors);
     }
 
-    if (bs->copy_on_read_in_flight) {
+    if (bs->copy_on_read_in_flight || bs->backup_info) {
         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
     }
 
     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
 
+    if (bs->backup_info) {
+        ret = brdv_co_backup_cow(bs, sector_num, nb_sectors);
+        if (ret < 0) {
+            goto out;
+        }
+    }
+
     if (flags & BDRV_REQ_ZERO_WRITE) {
         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
     } else {
@@ -2407,6 +2460,7 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         bs->wr_highest_sector = sector_num + nb_sectors - 1;
     }
 
+out:
     tracked_request_end(&req);
 
     return ret;
diff --git a/block.h b/block.h
index 722c620..630fabd 100644
--- a/block.h
+++ b/block.h
@@ -90,6 +90,10 @@  typedef struct BlockDevOps {
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
 #define BDRV_SECTOR_MASK   ~(BDRV_SECTOR_SIZE - 1)
 
+#define BACKUP_CLUSTER_BITS 16
+#define BACKUP_CLUSTER_SIZE (1<<BACKUP_CLUSTER_BITS)
+#define BACKUP_BLOCKS_PER_CLUSTER (BACKUP_CLUSTER_SIZE/BDRV_SECTOR_SIZE)
+
 typedef enum {
     BDRV_ACTION_REPORT, BDRV_ACTION_IGNORE, BDRV_ACTION_STOP
 } BlockErrorAction;
@@ -172,6 +176,8 @@  int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+int coroutine_fn bdrv_co_backup(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors);
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 /*
diff --git a/block_int.h b/block_int.h
index 9deedb8..08122ba 100644
--- a/block_int.h
+++ b/block_int.h
@@ -68,6 +68,32 @@  typedef struct BlockIOBaseValue {
     uint64_t ios[2];
 } BlockIOBaseValue;
 
+/**
+ * Backup related definitions
+ */
+
+typedef int BackupDumpFunc(void *opaque, BlockDriverState *bs,
+                           int64_t cluster_num, unsigned char *buf);
+
+typedef struct BackupInfo {
+    unsigned long *bitmap;
+    int bitmap_size;
+    BackupDumpFunc *backup_dump_cb;
+    BlockDriverCompletionFunc *backup_complete_cb;
+    void *opaque;
+} BackupInfo;
+
+BackupInfo *bdrv_backup_init(BlockDriverState *bs,
+                             BackupDumpFunc *backup_dump_cb,
+                             BlockDriverCompletionFunc *backup_complete_cb,
+                             void *opaque);
+
+void bdrv_backup_deinit(BlockDriverState *bs);
+
+int coroutine_fn brdv_co_backup_cow(BlockDriverState *bs,
+                                    int64_t sector_num, int nb_sectors);
+
+
 struct BlockDriver {
     const char *format_name;
     int instance_size;
@@ -276,6 +302,8 @@  struct BlockDriverState {
 
     QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
 
+    BackupInfo *backup_info;
+
     /* long-running background operation */
     BlockJob *job;