Patchwork [06/26] FVD: skeleton of Fast Virtual Disk

login
register
mail settings
Submitter Chunqiang Tang
Date Feb. 25, 2011, 10:37 p.m.
Message ID <1298673486-3573-6-git-send-email-ctang@us.ibm.com>
Download mbox | patch
Permalink /patch/84614/
State New
Headers show

Comments

Chunqiang Tang - Feb. 25, 2011, 10:37 p.m.
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.

This patch adds the skeleton of the block device driver for
Fast Virtual Disk (FVD).

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 Makefile.objs      |    2 +-
 block/fvd-create.c |   21 +++++++
 block/fvd-flush.c  |   24 +++++++
 block/fvd-misc.c   |   37 +++++++++++
 block/fvd-open.c   |   17 +++++
 block/fvd-read.c   |   21 +++++++
 block/fvd-update.c |   21 +++++++
 block/fvd-write.c  |   21 +++++++
 block/fvd.c        |   60 ++++++++++++++++++
 block/fvd.h        |  171 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 394 insertions(+), 1 deletions(-)
 create mode 100644 block/fvd-create.c
 create mode 100644 block/fvd-flush.c
 create mode 100644 block/fvd-misc.c
 create mode 100644 block/fvd-open.c
 create mode 100644 block/fvd-read.c
 create mode 100644 block/fvd-update.c
 create mode 100644 block/fvd-write.c
 create mode 100644 block/fvd.c
 create mode 100644 block/fvd.h

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 264aab3..9185d3e 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -23,7 +23,7 @@  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
-block-nested-y += blksim.o
+block-nested-y += blksim.o fvd.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/fvd-create.c b/block/fvd-create.c
new file mode 100644
index 0000000..5593cea
--- /dev/null
+++ b/block/fvd-create.c
@@ -0,0 +1,21 @@ 
+/*
+ * QEMU Fast Virtual Disk Format bdrv_create()
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static int fvd_create(const char *filename, QEMUOptionParameter * options)
+{
+    return -ENOTSUP;
+}
+
+static QEMUOptionParameter fvd_create_options[] = {
+    {NULL}
+};
diff --git a/block/fvd-flush.c b/block/fvd-flush.c
new file mode 100644
index 0000000..34bd5cb
--- /dev/null
+++ b/block/fvd-flush.c
@@ -0,0 +1,24 @@ 
+/*
+ * QEMU Fast Virtual Disk Format bdrv_flush() and bdrv_aio_flush()
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
+                                       BlockDriverCompletionFunc * cb,
+                                       void *opaque)
+{
+    return NULL;
+}
+
+static int fvd_flush(BlockDriverState * bs)
+{
+    return -ENOTSUP;
+}
diff --git a/block/fvd-misc.c b/block/fvd-misc.c
new file mode 100644
index 0000000..f4e1038
--- /dev/null
+++ b/block/fvd-misc.c
@@ -0,0 +1,37 @@ 
+/*
+ * QEMU Fast Virtual Disk Format Misc Functions of BlockDriver Interface
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static void fvd_close(BlockDriverState * bs)
+{
+}
+
+static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename)
+{
+    return 0;
+}
+
+static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
+                            int nb_sectors, int *pnum)
+{
+    return 0;
+}
+
+static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi)
+{
+    return -ENOTSUP;
+}
+
+static int fvd_has_zero_init(BlockDriverState * bs)
+{
+    return 0;
+}
diff --git a/block/fvd-open.c b/block/fvd-open.c
new file mode 100644
index 0000000..056b994
--- /dev/null
+++ b/block/fvd-open.c
@@ -0,0 +1,17 @@ 
+/*
+ * QEMU Fast Virtual Disk Format bdrv_file_open()
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
+{
+    return -ENOTSUP;
+}
diff --git a/block/fvd-read.c b/block/fvd-read.c
new file mode 100644
index 0000000..b9f3ac9
--- /dev/null
+++ b/block/fvd-read.c
@@ -0,0 +1,21 @@ 
+/*
+ * QEMU Fast Virtual Disk Format bdrv_aio_readv()
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
+                                       int64_t sector_num, QEMUIOVector * qiov,
+                                       int nb_sectors,
+                                       BlockDriverCompletionFunc * cb,
+                                       void *opaque)
+{
+    return NULL;
+}
diff --git a/block/fvd-update.c b/block/fvd-update.c
new file mode 100644
index 0000000..2498618
--- /dev/null
+++ b/block/fvd-update.c
@@ -0,0 +1,21 @@ 
+/*
+ * QEMU Fast Virtual Disk Format bdrv_update
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static int fvd_update(BlockDriverState * bs, QEMUOptionParameter * options)
+{
+    return -ENOTSUP;
+}
+
+static QEMUOptionParameter fvd_update_options[] = {
+    {NULL}
+};
diff --git a/block/fvd-write.c b/block/fvd-write.c
new file mode 100644
index 0000000..a736a37
--- /dev/null
+++ b/block/fvd-write.c
@@ -0,0 +1,21 @@ 
+/*
+ * QEMU Fast Virtual Disk Format bdrv_aio_writev()
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
+                                        int64_t sector_num,
+                                        QEMUIOVector * qiov, int nb_sectors,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque)
+{
+    return NULL;
+}
diff --git a/block/fvd.c b/block/fvd.c
new file mode 100644
index 0000000..bc2645c
--- /dev/null
+++ b/block/fvd.c
@@ -0,0 +1,60 @@ 
+/*
+ * QEMU Fast Virtual Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*=============================================================================
+ *  See the following companion papers for a detailed description of FVD:
+ *  1. The so-called "FVD-cow paper":
+ *          "FVD: a High-Performance Virtual Machine Image Format for Cloud",
+ *      by Chunqiang Tang, 2010.
+ *  2. The so-called "FVD-compact paper":
+ *          "FVD: a High-Performance Virtual Machine Image Format for Cloud
+ *           with Sparse Image Capability", by Chunqiang Tang, 2010.
+ *============================================================================*/
+
+#include "block/fvd.h"
+
+/* Include the implementation files directly so that all FVD functions can
+ * remain static (not exported) and the compiler can inline across them. */
+#include "block/fvd-flush.c"
+#include "block/fvd-update.c"
+#include "block/fvd-misc.c"
+#include "block/fvd-create.c"
+#include "block/fvd-open.c"
+#include "block/fvd-read.c"
+#include "block/fvd-write.c"
+
+static BlockDriver bdrv_fvd = {
+    .format_name = "fvd",
+    .instance_size = sizeof(BDRVFvdState),
+    .bdrv_create = fvd_create,
+    .bdrv_probe = fvd_probe,
+    .bdrv_file_open = fvd_open,
+    .bdrv_close = fvd_close,
+    .bdrv_is_allocated = fvd_is_allocated,
+    .bdrv_flush = fvd_flush,
+    .bdrv_aio_readv = fvd_aio_readv,
+    .bdrv_aio_writev = fvd_aio_writev,
+    .bdrv_aio_flush = fvd_aio_flush,
+    .create_options = fvd_create_options,
+    .update_options = fvd_update_options,
+    .bdrv_get_info = fvd_get_info,
+    .bdrv_update = fvd_update,
+    .bdrv_has_zero_init = fvd_has_zero_init
+};
+
+static void bdrv_fvd_init(void)
+{
+    bdrv_register(&bdrv_fvd);
+}
+
+block_init(bdrv_fvd_init);
diff --git a/block/fvd.h b/block/fvd.h
new file mode 100644
index 0000000..f2da330
--- /dev/null
+++ b/block/fvd.h
@@ -0,0 +1,171 @@ 
+/*
+ * QEMU Fast Virtual Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "block_int.h"
+#include "osdep.h"
+#include "qemu-option.h"
+#include "qemu-timer.h"
+#include "block.h"
+#include "qemu-queue.h"
+#include "qemu-common.h"
+
+enum {
+    FVD_MAGIC = ('F' | 'V' << 8 | 'D' << 16 | '\0' << 24),
+    FVD_VERSION = 1,
+    INCOMPATIBLE_FEATURES_SPACE = 4096, /* in bytes. */
+    DEF_PAGE_SIZE = 4096                /* in bytes. */
+};
+
+/*
+ * The FVD format consists of the following fields in little endian:
+ *   + Header fields of FvdHeader.
+ *   + Bitmap, starting on a 4KB page boundary at a location specified by
+ *     FvdHeader.bitmap_offset.
+ *   + Journal, starting on a 4KB page boundary at a location specified by
+ *     FvdHeader.journal_offset.
+ *   + Table, starting on a 4KB page boundary at a location specified by
+ *     FvdHeader.table_offset. When expanding the size of an existing FVD
+ *     image, the table can be expanded to borrow space from the next,
+ *     "virtual disk data" section, by relocating some data chunks.
+ *   + Virtual disk data,  starting on a 4KB page boundary. Optionally, disk
+ *     data can be stored in a separate data file specified by
+ *     FvdHeader.data_file.
+ */
+typedef struct __attribute__ ((__packed__)) FvdHeader {
+    uint32_t magic;             /* FVD_MAGIC */
+
+    /* Size of FvdHeader in bytes, rounded up to DEF_PAGE_SIZE. A new FVD
+     * version may add fields to FvdHeader and hence need to increase
+     * header_size. When an old FVD version reads an image created by a new
+     * FVD version, the old version only reads the beginning part of FvdHeader
+     * that it can understand and ignores the new fields at the end of
+     * FvdHeader. */
+    uint32_t header_size;
+
+    /* Version of the FVD software that created the image. */
+    uint32_t create_version;
+
+    /* Version of the FVD software that opened the image most recently. This
+     * field is for forward compatibility. Consider one example. Suppose FVD
+     * version N+1 introduces a compatible feature, e.g., adding a
+     * 'last_modified' timestamp into the FVD image header. Even if FVD
+     * version N is unaware of this new feature, it can still open an image
+     * created by FVD version N+1 without problem, but won't update the
+     * last_modified field. FVD version N sets the image's
+     * 'last_open_version=N' when it opens the image.  When FVD version N+1
+     * opens this image, it knows that the 'last_modified' field cannot be
+     * trusted and may take some actions accordingly, e.g., being conservative
+     * in some optimization heuristics that depend on the value of
+     * 'last_modified' to avoid making the optimization counterproductive. */
+    uint32_t last_open_version;
+
+    uint64_t virtual_disk_size;  /* in bytes. Disk size perceived by the VM. */
+    uint64_t data_offset;        /* in bytes. Aligned on DEF_PAGE_SIZE. */
+
+    /* Data can be optionally stored in a different file. */
+    char data_file[1024];
+    char data_file_fmt[16];
+
+    /* Base image. */
+    char base_img[1024];
+    char base_img_fmt[16];
+    uint64_t base_img_size;      /* in bytes. */
+
+    /* Bitmap for base image. */
+    uint64_t bitmap_offset;      /* in bytes. Aligned on DEF_PAGE_SIZE. */
+    uint64_t bitmap_size;        /* in bytes. Rounded up to DEF_PAGE_SIZE. */
+    uint64_t block_size;         /* in bytes. */
+
+    /* Journal */
+    uint64_t journal_offset;      /* in bytes. */
+    uint64_t journal_size;        /* in bytes. On-disk journal size. */
+    uint32_t clean_shutdown;      /* true if VM's last shutdown was graceful. */
+    uint64_t stable_journal_epoch; /* Needed only if a chunk can be relocated.*/
+    uint64_t journal_buf_size;     /* in bytes. In-memory buffer size. */
+    uint64_t journal_clean_buf_period; /* in milliseconds. */
+
+    /* Table for compact image. */
+    uint64_t table_offset;        /* in bytes. Aligned on DEF_PAGE_SIZE. */
+    uint64_t table_size;          /* in bytes. Rounded up to DEF_PAGE_SIZE. */
+    uint64_t chunk_size;          /* in bytes. */
+    uint64_t storage_grow_unit;   /* in bytes. */
+    char add_storage_cmd[1024];
+    uint32_t chunks_relocated;    /* Affect bdrv_has_zero_init(). */
+
+    /* Copy-on-read */
+    uint32_t copy_on_read;       /* true or false */
+    uint64_t max_outstanding_copy_on_read_data;        /* in bytes. */
+
+    /* Prefetching. */
+    int64_t prefetch_start_delay; /* in seconds. -1 means disabled. */
+    uint32_t base_img_fully_prefetched; /* true or false. */
+    uint32_t num_prefetch_slots; /* Max outstanding prefetch writes. */
+    uint64_t bytes_per_prefetch; /* For whole image prefetching. */
+    uint64_t prefetch_read_throughput_measure_time;  /* in milliseconds. */
+    uint64_t prefetch_write_throughput_measure_time; /* in milliseconds. */
+    uint64_t prefetch_min_read_throughput;  /* in KB/second. */
+    uint64_t prefetch_min_write_throughput; /* in KB/second. */
+    uint64_t prefetch_max_read_throughput;  /* in KB/second. */
+    uint64_t prefetch_max_write_throughput; /* in KB/second. */
+    uint64_t prefetch_throttle_time;  /* in milliseconds. */
+
+    /* need_zero_init is true if the image mandates that the storage layer
+     * (BDRVFvdState.fvd_data) must return true for bdrv_has_zero_init().
+     * This is the case if the optimization described in Section 3.3.3 of the
+     * FVD-cow paper is enabled (see function search_holes()). If 'qemu-img
+     * create' sets need_zero_init to true, 'qemu-img update' can be used to
+     * manually reset it to false, if the user always manually pre-fills the
+     * storage (e.g., a raw partition) with zeros. If the image is stored on a
+     * file system, it already supports zero_init, and hence there is no need
+     * to manually manipulate this field. */
+    uint32_t need_zero_init;
+
+    /* This field enables adding incompatible features. For example, suppose
+     * FVD version N+1 adds image compression. A compressed image cannot be
+     * opened by FVD version N. Suppose in FVD version N, the value of
+     * INCOMPATIBLE_FEATURES_SPACE is 4096. Introducing image compression
+     * in FVD version N+1 causes the following changes to the header.
+     *   In FVD version N:
+     *          uint8_t incompatible_features[4096];
+     *   In FVD version N+1:
+     *          uint8_t image_compressed;
+     *          uint8_t incompatible_features[4095];
+     *
+     * When any FVD version X opens an image, it always scans through the
+     * entire array of 'incompatible_features', although the size of
+     * INCOMPATIBLE_FEATURES_SPACE may be different for different FVD
+     * versions. If any bit of 'incompatible_features' is non-zero, FVD
+     * version X refuses to open the image. In the example above, if FVD
+     * version N+1 creates a non-compressed image, it sets
+     * 'image_compressed=0', which then still allows FVD version N to open the
+     * image. Instead of using one byte to represent a new feature, it can
+     * also use one bit to represent a new feature, which then allows a total
+     * of 32768 incompatible features to be added in the future.
+     */
+    uint8_t incompatible_features[INCOMPATIBLE_FEATURES_SPACE];
+
+    /* When a new FVD version introduces a new feature (which may or may not
+     * be backward compatible), an arbitrary number of new fields can be added
+     * to the image header, but those new fields must be added at the end of
+     * 'FvdHeader'. Old FVD versions simply won't read or write those new
+     * fields. Old FVD versions can still correctly access the bitmap, the
+     * journal, and the table, because no FVD version assumes a fixed header
+     * size, but instead accesses the bitmap, the journal, and the table
+     * through bitmap_offset, journal_offset, and table_offset, respectively.
+     * Similarly, if a new data structure of a variable size is added to the
+     * image header in the future, it must also be indexed by an offset field
+     * and a size field. */
+} FvdHeader;
+
+typedef struct BDRVFvdState {
+} BDRVFvdState;