Patchwork [10/26] FVD: add impl of interface bdrv_file_open()

login
register
mail settings
Submitter Chunqiang Tang
Date Feb. 25, 2011, 10:37 p.m.
Message ID <1298673486-3573-10-git-send-email-ctang@us.ibm.com>
Download mbox | patch
Permalink /patch/84625/
State New
Headers show

Comments

Chunqiang Tang - Feb. 25, 2011, 10:37 p.m.
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.

This patch adds FVD's implementation of the bdrv_file_open() interface.
It supports opening an FVD image.

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/fvd-journal.c  |    6 +
 block/fvd-open.c     |  445 +++++++++++++++++++++++++++++++++++++++++++++++++-
 block/fvd-prefetch.c |   17 ++
 block/fvd.c          |    1 +
 4 files changed, 468 insertions(+), 1 deletions(-)
 create mode 100644 block/fvd-prefetch.c

Patch

diff --git a/block/fvd-journal.c b/block/fvd-journal.c
index 246f425..5ba34bd 100644
--- a/block/fvd-journal.c
+++ b/block/fvd-journal.c
@@ -22,6 +22,12 @@  static inline int64_t calc_min_journal_size(int64_t table_entries)
     return 512;
 }
 
+/*
+ * Placeholder for journal initialization. It performs no work and always
+ * returns -ENOTSUP. fvd_open() treats any non-zero result from this function
+ * as a failure, so opening an image cannot succeed while this stub is in
+ * place.
+ */
+static int init_journal(int read_only, BlockDriverState * bs,
+                        FvdHeader * header)
+{
+    return -ENOTSUP;
+}
+
 void fvd_emulate_host_crash(bool cond)
 {
     emulate_host_crash = cond;
diff --git a/block/fvd-open.c b/block/fvd-open.c
index 056b994..8caf8d3 100644
--- a/block/fvd-open.c
+++ b/block/fvd-open.c
@@ -11,7 +11,450 @@ 
  *
  */
 
+/* Forward declarations of the helpers called by fvd_open() below. */
+static void init_prefetch_timer(BlockDriverState * bs, BDRVFvdState * s);
+static int init_data_file(BDRVFvdState * s, FvdHeader * header, int flags);
+static int init_bitmap(BlockDriverState * bs, BDRVFvdState * s,
+                       FvdHeader * header, const char *const filename);
+static int load_table(BDRVFvdState * s, FvdHeader * header,
+                      const char *const filename);
+static int init_journal(int read_only, BlockDriverState * bs,
+                        FvdHeader * header);
+static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
+                              const char *const filename);
+
 static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
 {
-    return -ENOTSUP;
+    BDRVFvdState *s = bs->opaque;
+    int ret;
+    FvdHeader header;
+    BlockDriver *drv;
+    int i;
+
+    const char *protocol = strchr(filename, ':');
+    if (protocol) {
+        drv = bdrv_find_protocol(filename);
+        filename = protocol + 1;
+    } else {
+        /* Use "raw" instead of "file" to allow storing the image on device. */
+        drv = bdrv_find_format("raw");
+        if (!drv) {
+            fprintf(stderr, "Failed to find the block device driver\n");
+            return -EINVAL;
+        }
+    }
+
+    s->fvd_metadata = bdrv_new("");
+    ret = bdrv_open(s->fvd_metadata, filename, flags, drv);
+    if (ret < 0) {
+        fprintf(stderr, "Failed to open %s\n", filename);
+        return ret;
+    }
+
+    /* Initialize so that jumping to 'fail' would do cleanup properly. */
+    s->stale_bitmap = NULL;
+    s->fresh_bitmap = NULL;
+    s->table = NULL;
+    s->outstanding_copy_on_read_data = 0;
+    QLIST_INIT(&s->write_locks);
+    QLIST_INIT(&s->copy_locks);
+    s->prefetch_acb = NULL;
+    s->add_storage_cmd = NULL;
+#ifdef FVD_DEBUG
+    s->total_copy_on_read_data = s->total_prefetch_data = 0;
+#endif
+
+    if (bdrv_pread(s->fvd_metadata, 0, &header, sizeof(header)) !=
+        sizeof(header)) {
+        fprintf(stderr, "Failed to read the header of %s\n", filename);
+        ret = -EIO;
+        goto fail;
+    }
+
+    fvd_header_le_to_cpu(&header);
+
+    if (header.magic != FVD_MAGIC) {
+        fprintf(stderr, "Incorrect magic number in header: %0X\n",
+                header.magic);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* Check incompatible features. */
+    for (i = 0; i < INCOMPATIBLE_FEATURES_SPACE; i++) {
+        if (header.incompatible_features[i] != 0) {
+            fprintf(stderr, "The image was created by FVD version %d "
+                    " and uses features not supported by this FVD version %d\n",
+                    header.create_version, FVD_VERSION);
+            ret = -ENOTSUP;
+        }
+    }
+
+    if (header.virtual_disk_size % 512 != 0) {
+        fprintf(stderr, "Disk size %" PRId64 " in the header of %s is not "
+                "a multple of 512.\n", header.virtual_disk_size, filename);
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* Initialize the fields of BDRVFvdState. */
+    s->chunks_relocated = header.chunks_relocated;
+    s->dirty_image = false;
+    s->metadata_err_prohibit_write = false;
+    s->block_size = header.block_size / 512;
+    s->bitmap_size = header.bitmap_size;
+    s->prefetch_timer = NULL;
+    s->sectors_per_prefetch = (header.bytes_per_prefetch + 511) / 512;
+    s->prefetch_throttle_time = header.prefetch_throttle_time;
+    s->prefetch_read_throughput_measure_time =
+        header.prefetch_read_throughput_measure_time;
+    s->prefetch_write_throughput_measure_time =
+        header.prefetch_write_throughput_measure_time;
+
+    /* Convert KB/s to bytes/millisec. */
+    s->prefetch_min_read_throughput =
+        ((double)header.prefetch_min_read_throughput) * 1024.0 / 1000.0;
+    s->prefetch_min_write_throughput =
+        ((double)header.prefetch_min_write_throughput) * 1024.0 / 1000.0;
+
+    if (header.base_img[0] != 0 && s->sectors_per_prefetch % s->block_size!=0) {
+        fprintf(stderr, "sectors_per_prefetch (%" PRIu64 ") is not a multiple "
+                "of block_size (%" PRIu64 ")\n",
+                s->sectors_per_prefetch * 512, s->block_size * 512);
+    }
+    s->max_outstanding_copy_on_read_data =
+        header.max_outstanding_copy_on_read_data;
+    if (s->max_outstanding_copy_on_read_data < header.block_size * 2) {
+        s->max_outstanding_copy_on_read_data = header.block_size;
+    }
+
+    if (header.num_prefetch_slots < 1) {
+        s->num_prefetch_slots = 1;
+    } else {
+        s->num_prefetch_slots = header.num_prefetch_slots;
+    }
+
+    const int read_only = !(flags & BDRV_O_RDWR);
+
+    if (read_only || IN_QEMU_TOOL) {
+        /* Disable prefetching and copy_on_read. */
+        s->prefetch_start_delay = -1;
+        s->copy_on_read = false;
+    } else {
+        s->prefetch_start_delay = header.prefetch_start_delay;
+        s->copy_on_read = header.copy_on_read;
+    }
+    s->virtual_disk_size = header.virtual_disk_size;
+    s->bitmap_offset = header.bitmap_offset / 512;
+    s->base_img_sectors = header.base_img_size / 512;
+    bs->total_sectors = s->virtual_disk_size / 512;
+
+    if ((ret = init_data_file(s, &header, flags))) {
+        goto fail;
+    }
+
+    if ((ret = init_bitmap(bs, s, &header, filename))) {
+        goto fail;
+    }
+
+    if ((ret = load_table(s, &header, filename))) {
+        goto fail;
+    }
+
+    if ((ret = init_journal(read_only, bs, &header))) {
+        goto fail;
+    }
+
+    /* This must be done after init_journal() because it may use metadata
+     * recovered from the journal. */
+    if ((ret = init_compact_image(s, &header, filename))) {
+        goto fail;
+    }
+
+    if (!read_only) {
+        /* This flag will be cleaned when the image is shut down gracefully. */
+        update_clean_shutdown_flag(s, false);
+        init_prefetch_timer(bs, s);
+    }
+
+    QDEBUG("copy_on_read=%s compact_image=%s block_size=%" PRIu64
+           " chunk_size=%"PRId64
+           " journal_size=%" PRId64 " prefetching_delay=%" PRId64
+           " prefetch_slots=%d "
+           "prefetch_read_threshold_KB=%.0lf "
+           "prefetch_write_threshold_KB=%.0lf "
+           "prefetch_throttle_time=%" PRIu64 " bytes_per_prefetch=%" PRIu64
+           " max_outstanding_copy_on_read_data=%" PRId64 "\n",
+           BOOL(s->copy_on_read), BOOL(s->table_offset > 0),
+           s->block_size * 512, s->chunk_size * 512,
+           s->journal_size * 512, s->prefetch_start_delay,
+           s->num_prefetch_slots,
+           s->prefetch_min_read_throughput * 1000.0 / 1024.0,
+           s->prefetch_min_write_throughput * 1000.0 / 1024.0,
+           s->prefetch_throttle_time, s->sectors_per_prefetch * 512,
+           s->max_outstanding_copy_on_read_data);
+
+    return 0;
+
+fail:
+    fprintf(stderr, "Failed to open %s using the FVD format.\n", filename);
+    fvd_close(bs);
+    return ret;
+}
+
+/*
+ * Read the table of a compact image into s->table.
+ *
+ * Does nothing and returns 0 when the header carries no table (table_offset
+ * is not positive). Otherwise records the table geometry in the state,
+ * allocates an aligned buffer of table_size bytes and fills it from the
+ * metadata file. Returns -EIO when the read comes back short.
+ */
+static int load_table(BDRVFvdState * s, FvdHeader * header,
+                      const char *const filename)
+{
+    /* Not a compact image and no table. */
+    if (header->table_offset <= 0) {
+        return 0;
+    }
+
+    /* The offset and the chunk size are kept in 512-byte sectors; the table
+     * size is stored as given in the header. */
+    s->table_offset = header->table_offset / 512;
+    s->table_size = header->table_size;
+    s->chunk_size = header->chunk_size / 512;
+    s->table = my_qemu_blockalign(s->fvd_metadata, s->table_size);
+
+    /* The read itself uses the byte offset from the header. */
+    if (bdrv_pread(s->fvd_metadata, header->table_offset, s->table,
+                   (int)s->table_size) == (int)s->table_size) {
+        return 0;
+    }
+
+    fprintf(stderr, "Failed to read the table of %s\n", filename);
+    return -EIO;
+}
+
+/*
+ * Set up the compact-image state from the table read by load_table().
+ *
+ * For an image without a table this only clears the leaked-chunk bookkeeping
+ * and marks the data region as not prepared. Otherwise it:
+ *   - scans the table to find the highest physical chunk in use and the
+ *     physical chunks below it that no table entry references ("leaked"),
+ *   - records used_storage and storage_grow_unit in sectors,
+ *   - when 'filename' is a block or character device, checks that the device
+ *     can hold the data already allocated and builds the command used to
+ *     grow the storage,
+ *   - records avail_storage and marks the data file as growable.
+ *
+ * The header and the table come from the image file and are validated before
+ * they are used as sizes or indexes. Returns 0 on success, -EINVAL for an
+ * unusable header value and -EIO for a corrupted table or a failed
+ * stat()/bdrv_getlength().
+ */
+static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
+                              const char *const filename)
+{
+    s->leaked_chunks = NULL;
+    s->num_leaked_chunks = 0;
+    s->next_avail_leaked_chunk = 0;
+
+    if (header->table_offset <= 0) {
+        /* Not a compact image. */
+        s->data_region_prepared = false;
+        return 0;
+    }
+
+    /* A zero chunk size would make the division below fault. */
+    if (header->chunk_size <= 0) {
+        fprintf(stderr, "Invalid chunk size in the header of %s\n", filename);
+        return -EINVAL;
+    }
+
+    /* Scan the table to find the max used chunk and leaked chunks. */
+    uint32_t i;
+    uint32_t max_chunk = 0;
+    uint32_t table_entries = ROUND_UP(header->virtual_disk_size,
+                                      header->chunk_size) / header->chunk_size;
+
+    /* The scan reads table_entries entries, so the table loaded from disk
+     * must be at least that large. */
+    if ((uint64_t)table_entries * sizeof(s->table[0]) >
+        (uint64_t)s->table_size) {
+        fprintf(stderr, "ERROR: corrupted image whose table is too small "
+                "for %u chunks\n", table_entries);
+        return -EIO;
+    }
+
+    uint8_t *used_chunks = my_qemu_mallocz(table_entries);
+    for (i = 0; i < table_entries; i++) {
+        if (!IS_EMPTY(s->table[i])) {
+            uint32_t id = READ_TABLE(s->table[i]);
+            /* used_chunks has table_entries slots; an id beyond that comes
+             * from a corrupted table and must not be used as an index. */
+            if (id >= table_entries) {
+                fprintf(stderr, "ERROR: corrupted image with out-of-range "
+                        "physical chunk %u\n", id);
+                my_qemu_free(used_chunks);
+                return -EIO;
+            }
+            if (id >= max_chunk) {
+                max_chunk = id + 1;
+            }
+            if (used_chunks[id]) {
+                fprintf(stderr, "ERROR: corrupted image with multiple "
+                        "virtual chunks mapped to physical chunk %u\n", id);
+                my_qemu_free(used_chunks);
+                return -EIO;
+            }
+            used_chunks[id] = true;
+        }
+    }
+
+    /* Count the number of leaked chunks. */
+    uint32_t num_leaked_chunks = 0;
+    for (i = 0; i < max_chunk; i++) {
+        if (!used_chunks[i]) {
+            num_leaked_chunks++;
+        }
+    }
+    QDEBUG("leaked_chunks=%u max_chunk=%u\n", num_leaked_chunks, max_chunk);
+
+    /* Record leaked chunks, which will be used later. */
+    if (num_leaked_chunks > 0) {
+        s->num_leaked_chunks = num_leaked_chunks;
+        s->leaked_chunks = my_qemu_malloc(sizeof(uint32_t) * num_leaked_chunks);
+        num_leaked_chunks = 0;
+        for (i = 0; i < max_chunk; i++) {
+            if (!used_chunks[i]) {
+                s->leaked_chunks[num_leaked_chunks++] = i;
+                QDEBUG("Recover leaked physical chunk %u\n", i);
+            }
+        }
+    }
+    s->used_storage = max_chunk * s->chunk_size;
+    s->storage_grow_unit = header->storage_grow_unit / 512;
+    my_qemu_free(used_chunks);
+
+    /* Check if the image is directly stored on a raw device, including
+     * logical volume. If so, figure out the size of the device.
+     * NOTE(review): this stats 'filename' even when the header names a
+     * separate data file, while the size below is taken from s->fvd_data --
+     * confirm which file is meant to be checked. */
+    struct stat stat_buf;
+    if (stat(filename, &stat_buf) != 0) {
+        fprintf(stderr, "Failed to stat() %s\n", filename);
+        return -EIO;
+    }
+
+    /* Check how much storage space is already allocated. */
+    int64_t size = bdrv_getlength(s->fvd_data);
+    if (size < 0) {
+        fprintf(stderr, "Failed in bdrv_getlength(%s)\n", filename);
+        return -EIO;
+    }
+
+    if (S_ISBLK(stat_buf.st_mode) || S_ISCHR(stat_buf.st_mode)) {
+        const int64_t min_size = (s->data_offset + s->used_storage) * 512;
+        if (size < min_size) {
+            fprintf(stderr, "The size of device %s is not even big enough to "
+                    "store already allocated data.\n", filename);
+            return -EIO;
+        }
+
+        /* Initialize the command to grow storage space. */
+        if (header->add_storage_cmd[0] == 0) {
+            s->add_storage_cmd = NULL;
+        } else {
+            /* NOTE(review): the command text is assembled from strings stored
+             * in the image header; presumably it is later handed to a shell,
+             * so an untrusted image could inject arguments -- verify. */
+            char cmd[2048];
+            int len;
+            if (strcmp(header->add_storage_cmd, "builtin:lvextend") == 0) {
+                /* Note the following:
+                 *     1. lvextend may generate warning messages like "File
+                 *     descriptor...leaked...", * which is fine.  See the
+                 *     following from LVM manual: "On invocation, lvm requires
+                 *     that only  the  standard  file  descriptors stdin,
+                 *     stdout * and stderr are available.  If others are
+                 *     found, they get closed and messages are issued warning
+                 *     about the leak."
+                 *     2. Instead of using the lvextend command line, one
+                 *     option is to use liblvm directly, which avoids creating
+                 *     a process to resize a LV.
+                 *     3. On Ubuntu, /bin/sh is linked to /bin/dash, which
+                 *     does not support ">&" for stdout and stderr
+                 *     redirection. */
+                len = snprintf(cmd, sizeof(cmd) - 1, "/sbin/lvextend -L+%" PRIu64
+                               "B %s >/dev/null 2>/dev/null",
+                               header->storage_grow_unit,
+                               header->data_file[0] ? header->data_file
+                                                    : filename);
+            } else {
+                len = snprintf(cmd, sizeof(cmd) - 1, "%s %" PRIu64
+                               " %s >/dev/null 2>/dev/null",
+                               header->add_storage_cmd,
+                               header->storage_grow_unit,
+                               header->data_file[0] ? header->data_file
+                                                    : filename);
+            }
+            /* snprintf() reports the untruncated length; refuse to keep a
+             * command that was cut short. */
+            if (len < 0 || (size_t)len >= sizeof(cmd) - 1) {
+                fprintf(stderr, "The command to add storage for %s is too "
+                        "long.\n", filename);
+                return -EINVAL;
+            }
+            s->add_storage_cmd = my_qemu_malloc(len + 1);
+            memcpy(s->add_storage_cmd, cmd, len + 1);
+        }
+    }
+
+    s->avail_storage = size / 512 - s->data_offset;
+    s->fvd_data->growable = true;
+    s->data_region_prepared = true;
+
+    return 0;
+}
+
+/*
+ * Select where the image data lives and store the handle in s->fvd_data.
+ *
+ * When the header names a data file, that file is opened as a separate block
+ * device, with the driver named by data_file_fmt or with a NULL driver when
+ * no format is given, and the data offset is 0. Otherwise the data shares the
+ * metadata file and starts at the header's data_offset (kept in sectors).
+ *
+ * Also checks the header's need_zero_init requirement against
+ * bdrv_has_zero_init(); a mismatch is an error, except under IN_QEMU_TOOL
+ * where it is only a warning.
+ *
+ * Returns 0 on success, -EINVAL or -EIO on failure.
+ */
+static int init_data_file(BDRVFvdState * s, FvdHeader * header, int flags)
+{
+    if (!header->data_file[0]) {
+        /* Data and metadata share one file. */
+        s->data_offset = header->data_offset / 512;     /* In sectors. */
+        s->fvd_data = s->fvd_metadata;
+    } else {
+        /* Open a separate data file. A NULL driver is passed when the header
+         * gives no format. */
+        BlockDriver *data_drv = NULL;
+
+        s->data_offset = 0;
+        s->fvd_data = bdrv_new("");
+        if (!s->fvd_data) {
+            fprintf(stderr, "Failed to create a new block device driver.\n");
+            return -EIO;
+        }
+
+        if (header->data_file_fmt[0] != 0) {
+            data_drv = bdrv_find_format(header->data_file_fmt);
+            if (!data_drv) {
+                fprintf(stderr, "Failed to find driver for image format "
+                        "'%s' of data file %s\n",
+                        header->data_file_fmt, header->data_file);
+                return -EINVAL;
+            }
+        }
+
+        if (bdrv_open(s->fvd_data, header->data_file, flags, data_drv) != 0) {
+            fprintf(stderr, "Failed to open data file %s\n", header->data_file);
+            return -EIO;
+        }
+    }
+
+    /* Nothing more to check when zero-init is not needed or is provided. */
+    if (!header->need_zero_init || bdrv_has_zero_init(s->fvd_data)) {
+        return 0;
+    }
+
+    if (!IN_QEMU_TOOL) {
+        fprintf(stderr, "Error: image needs zero_init but it is not "
+                "supported by the storage media.\n");
+        return -EINVAL;
+    }
+
+    /* Only give a warning to allow 'qemu-img update' to modify
+     * need_zero_init if the user manually zero-init the device. */
+    fprintf(stderr, "Warning: image needs zero_init but it is not "
+            "supported by the storage media.\n");
+    return 0;
+}
+
+/*
+ * Set up the prefetch state and the bitmaps.
+ *
+ * If the header says the base image is fully prefetched, prefetching and
+ * copy-on-read are switched off, the backing file name is cleared and no
+ * bitmap is loaded. Otherwise the base image name and format are copied into
+ * 'bs', the bitmap is read from the metadata file into s->stale_bitmap, and
+ * s->fresh_bitmap becomes either a private copy of it or an alias for it.
+ *
+ * Returns 0 on success and -EIO when the bitmap cannot be read in full.
+ */
+static int init_bitmap(BlockDriverState * bs, BDRVFvdState * s,
+                       FvdHeader * header, const char *const filename)
+{
+    if (header->base_img_fully_prefetched) {
+        /* This also covers the case of no base image. */
+        s->prefetch_state = PREFETCH_STATE_FINISHED;
+        s->copy_on_read = false;
+        s->prefetch_start_delay = -1;
+
+        /* No need to use the base image. It may operate without problem
+         * even if the base image is no longer accessible. */
+        bs->backing_file[0] = 0;
+        return 0;
+    }
+
+    ASSERT(header->base_img[0] != 0);
+    pstrcpy(bs->backing_file, 1024, header->base_img);
+    pstrcpy(bs->backing_format, 16, header->base_img_fmt);
+
+    /* Prefetching starts out disabled; per the original note it is enabled
+     * later, after a timer expires. */
+    s->prefetch_state = PREFETCH_STATE_DISABLED;
+    s->stale_bitmap = my_qemu_blockalign(s->fvd_metadata, s->bitmap_size);
+    if (bdrv_pread(s->fvd_metadata, header->bitmap_offset,
+                   s->stale_bitmap, s->bitmap_size) != s->bitmap_size) {
+        fprintf(stderr, "Failed to read the bitmap of %s.\n", filename);
+        return -EIO;
+    }
+
+    /* Use two bitmaps only if copy_on_read or prefetching is enabled.
+     * See Section 3.3.4 of the FVD-cow paper. */
+    const bool two_bitmaps =
+        s->copy_on_read || (s->prefetch_state != PREFETCH_STATE_FINISHED &&
+                            s->prefetch_start_delay > 0);
+    if (!two_bitmaps) {
+        s->fresh_bitmap = s->stale_bitmap;
+        return 0;
+    }
+
+    s->fresh_bitmap = my_qemu_blockalign(s->fvd_metadata, s->bitmap_size);
+    memcpy(s->fresh_bitmap, s->stale_bitmap, s->bitmap_size);
+    return 0;
+}
+
+/*
+ * Arm a one-shot timer that calls fvd_init_prefetch(bs) once
+ * prefetch_start_delay seconds have passed on rt_clock.
+ *
+ * No timer is created under IN_QEMU_TOOL, when prefetching has already
+ * finished, or when the start delay is not positive.
+ */
+static void init_prefetch_timer(BlockDriverState * bs, BDRVFvdState * s)
+{
+    int64_t fire_at;
+
+    if (IN_QEMU_TOOL || s->prefetch_state == PREFETCH_STATE_FINISHED ||
+        s->prefetch_start_delay <= 0) {
+        return;
+    }
+
+    /* The delay is in seconds; times 1000 to get milliseconds. */
+    fire_at = qemu_get_clock(rt_clock) + s->prefetch_start_delay * 1000;
+    s->prefetch_timer = qemu_new_timer(rt_clock, fvd_init_prefetch, bs);
+    qemu_mod_timer(s->prefetch_timer, fire_at);
 }
diff --git a/block/fvd-prefetch.c b/block/fvd-prefetch.c
new file mode 100644
index 0000000..5844aa7
--- /dev/null
+++ b/block/fvd-prefetch.c
@@ -0,0 +1,17 @@ 
+/*
+ * QEMU Fast Virtual Disk Format Adaptive Prefetching
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*
+ * Timer callback registered by init_prefetch_timer() in fvd-open.c, which
+ * passes the BlockDriverState pointer as 'opaque'. Currently an empty stub.
+ */
+void fvd_init_prefetch(void *opaque)
+{
+    /* To be implemented. */
+}
diff --git a/block/fvd.c b/block/fvd.c
index d6263e7..e41f419 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -33,6 +33,7 @@ 
 #include "block/fvd-read.c"
 #include "block/fvd-write.c"
 #include "block/fvd-journal.c"
+#include "block/fvd-prefetch.c"
 #include "block/fvd-update.c"
 
 static BlockDriver bdrv_fvd = {