Patchwork [13/26] FVD: add impl of storing data in compact image

login
register
mail settings
Submitter Chunqiang Tang
Date Feb. 25, 2011, 10:37 p.m.
Message ID <1298673486-3573-13-git-send-email-ctang@us.ibm.com>
Download mbox | patch
Permalink /patch/84592/
State New
Headers show

Comments

Chunqiang Tang - Feb. 25, 2011, 10:37 p.m.
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.

This patch adds the implementation of storing data in a compact image. This
capability is needed for both copy-on-write (see fvd_aio_writev()) and
copy-on-read (see fvd_aio_readv()).

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/fvd-store.c |  459 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/fvd-utils.c |   65 ++++++++
 2 files changed, 524 insertions(+), 0 deletions(-)

Patch

diff --git a/block/fvd-store.c b/block/fvd-store.c
index 85e45d4..fe670eb 100644
--- a/block/fvd-store.c
+++ b/block/fvd-store.c
@@ -11,10 +11,469 @@ 
  *
  */
 
+/* Forward declarations for the compact-image store path. All four functions
+ * are defined later in this file; store_data() below needs
+ * store_data_in_compact_image() before its definition appears. */
+static uint32_t allocate_chunk(BlockDriverState * bs);
+static inline FvdAIOCB *init_store_acb(int soft_write,
+                QEMUIOVector * orig_qiov, BlockDriverState * bs,
+                int64_t sector_num, int nb_sectors, FvdAIOCB * parent_acb,
+                BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *store_data_in_compact_image(int soft_write,
+            struct FvdAIOCB *parent_acb, BlockDriverState * bs,
+            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+            BlockDriverCompletionFunc * cb, void *opaque);
+static void store_data_in_compact_image_cb(void *opaque, int ret);
+
 static inline BlockDriverAIOCB *store_data(int soft_write,
                 FvdAIOCB * parent_acb, BlockDriverState * bs,
                 int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
                 BlockDriverCompletionFunc * cb, void *opaque)
 {
+    /* Entry point for writing 'nb_sectors' sectors at 'sector_num' into the
+     * FVD data file. Dispatches on whether the image has a chunk table:
+     * with a table the request goes through the compact-image path,
+     * without one it is written at data_offset + sector_num directly.
+     * Returns whatever the chosen path returns. */
+    BDRVFvdState *state = bs->opaque;
+
+    TRACE_STORE_IN_FVD("store_data", sector_num, nb_sectors);
+
+    if (state->table) {
+        /* Compact image: sectors must be mapped through the chunk table. */
+        return store_data_in_compact_image(soft_write, parent_acb, bs,
+                                           sector_num, orig_qiov, nb_sectors,
+                                           cb, opaque);
+    }
+
+    /* Not a compact image, so the write can go out directly. */
+    return bdrv_aio_writev(state->fvd_data, state->data_offset + sector_num,
+                           orig_qiov, nb_sectors, cb, opaque);
+}
+
+/* Store data in the compact image. The argument 'soft_write' means
+ * the store was caused by copy-on-read or prefetching, which need not
+ * update metadata immediately.
+ *
+ * Steps, as implemented below:
+ *  1. Every chunk touched by [sector_num, sector_num + nb_sectors) whose
+ *     table entry is empty gets a physical chunk from allocate_chunk(); the
+ *     new entry is stored with DIRTY_TABLE set.
+ *  2. The touched chunks are grouped into regions whose physical chunk ids
+ *     are consecutive. A single region is written with 'orig_qiov' as is;
+ *     otherwise one child write per region is submitted, each with its own
+ *     QEMUIOVector carved out of 'orig_qiov'.
+ *  3. store_data_in_compact_image_cb() is the completion callback of every
+ *     child write.
+ *
+ * Returns &acb->common on success, or NULL if a chunk cannot be allocated,
+ * the acb cannot be obtained, or a child write cannot be submitted. */
+static BlockDriverAIOCB *store_data_in_compact_image(int soft_write,
+                                                     FvdAIOCB * parent_acb,
+                                                     BlockDriverState * bs,
+                                                     int64_t sector_num,
+                                                     QEMUIOVector * orig_qiov,
+                                                     const int nb_sectors,
+                                                     BlockDriverCompletionFunc
+                                                     * cb, void *opaque)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb;
+    const uint32_t first_chunk = sector_num / s->chunk_size;
+    const uint32_t last_chunk = (sector_num + nb_sectors - 1) / s->chunk_size;
+    int table_dirty = false;
+    uint32_t chunk;
+    int64_t start_sec;
+
+    /* Check if storage space is allocated. */
+    for (chunk = first_chunk; chunk <= last_chunk; chunk++) {
+        if (IS_EMPTY(s->table[chunk])) {
+            uint32_t id = allocate_chunk(bs);
+            if (IS_EMPTY(id)) {
+                return NULL;
+            }
+            QDEBUG ("STORE: map chunk %u to %u\n", chunk, id);
+            id |= DIRTY_TABLE;
+            WRITE_TABLE(s->table[chunk], id);
+            table_dirty = true;
+        } else if (IS_DIRTY(s->table[chunk])) {
+            /* This is possible in several cases. 1) If a previous soft-write
+             * allocated the storage space but did not flush the table entry
+             * change to the journal and hence did not clean the dirty bit. 2)
+             * This is possible if a previous hard-write was canceled before
+             * it could write the table entry to disk. 3) Finally, this is
+             * also possible with two concurrent hard-writes. The first
+             * hard-write allocated the storage space but has not flushed the
+             * table entry change to the journal yet and hence the table entry
+             * remains dirty. In this case, the second hard-write will also
+             * try to flush this dirty table entry to the journal. The outcome
+             * is correct since they store the same metadata change in the
+             * journal (although twice). For this race condition, we prefer to
+             * have two writes to the journal rather than introducing a
+             * locking mechanism, because this happens rarely and those two
+             * writes to the journal are likely to be merged by the kernel
+             * into a single write since they are likely to update
+             * back-to-back sectors in the journal.  A locking mechanism would
+             * be less efficient, because the large size of chunks would cause
+             * unnecessary locking due to ``false sharing'' of a chunk by two
+             * writes. */
+            table_dirty = true;
+        }
+    }
+
+    if (!(acb = init_store_acb(soft_write, orig_qiov, bs, sector_num,
+                               nb_sectors, parent_acb, cb, opaque))) {
+        return NULL;
+    }
+
+    /* Only a hard write that touched at least one dirty table entry asks
+     * the completion callback to consider a table update. */
+    const bool update_table = (!soft_write && table_dirty);
+    size_t iov_left;
+    uint8_t *iov_buf;
+    int nb, iov_index, nqiov, niov;
+    uint32_t prev;
+
+    /* A request confined to one chunk is always a single region; the
+     * counters declared above are not used on that path. */
+    if (first_chunk == last_chunk) {
+        goto handle_one_continuous_region;
+    }
+
+    /* First pass: count the number of qiov and iov needed to cover the
+     * continuous regions of the compact image. */
+    iov_left = orig_qiov->iov[0].iov_len;
+    iov_buf = orig_qiov->iov[0].iov_base;
+    iov_index = 0;
+    nqiov = 0;
+    niov = 0;
+    prev = READ_TABLE(s->table[first_chunk]);
+
+    /* Data in the first chunk. */
+    nb = s->chunk_size - (sector_num % s->chunk_size);
+
+    for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
+        uint32_t current = READ_TABLE(s->table[chunk]);
+        int64_t data_size;
+        if (chunk < last_chunk) {
+            data_size = s->chunk_size;
+        } else {
+            /* The last chunk may be only partially covered. */
+            data_size = (sector_num + nb_sectors) % s->chunk_size;
+            if (data_size == 0) {
+                data_size = s->chunk_size;
+            }
+        }
+
+        if (current == prev + 1) {
+            nb += data_size;    /* Continue the previous region. */
+        } else {
+            /* Terminate the previous region. */
+            niov += count_iov(orig_qiov->iov, &iov_index, &iov_buf,
+                              &iov_left, nb * 512);
+            nqiov++;
+            nb = data_size;     /* Data in the new region. */
+        }
+        prev = current;
+    }
+
+    if (nqiov == 0) {
+handle_one_continuous_region:
+        /* A simple case. All data falls in one continuous region of the
+         * compact image, so the caller's qiov is submitted unchanged as the
+         * only child request. */
+        start_sec = READ_TABLE(s->table[first_chunk]) * s->chunk_size +
+            (sector_num % s->chunk_size);
+
+        acb->store.update_table = update_table;
+        acb->store.num_children = 1;
+        acb->store.one_child.hd_acb =
+            bdrv_aio_writev(s->fvd_data, s->data_offset + start_sec, orig_qiov,
+                            nb_sectors, store_data_in_compact_image_cb,
+                            &acb->store.one_child);
+        if (acb->store.one_child.hd_acb) {
+            acb->store.one_child.acb = acb;
+            return &acb->common;
+        } else {
+            my_qemu_aio_release(acb);
+            return NULL;
+        }
+    }
+
+    /* qiov for the last continuous region. */
+    niov += count_iov(orig_qiov->iov, &iov_index, &iov_buf,
+                      &iov_left, nb * 512);
+    nqiov++;
+    ASSERT(iov_index == orig_qiov->niov - 1 && iov_left == 0);
+
+    /* Need to submit multiple requests to the lower layer. */
+    acb->store.update_table = update_table;
+    acb->store.num_children = nqiov;
+
+    if (!parent_acb) {
+        QDEBUG("STORE: acb%llu-%p  start  sector_num=%" PRId64
+               " nb_sectors=%d\n", acb->uuid, acb, acb->sector_num,
+               acb->nb_sectors);
+    }
+
+    /* Allocate memory and create multiple requests. One allocation holds,
+     * in this order: nqiov CompactChildCB entries, nqiov QEMUIOVector
+     * entries, and niov struct iovec entries. */
+    const size_t metadata_size = nqiov * (sizeof(CompactChildCB) +
+                                          sizeof(QEMUIOVector)) +
+        niov * sizeof(struct iovec);
+    acb->store.children = (CompactChildCB *) my_qemu_malloc(metadata_size);
+    QEMUIOVector *q = (QEMUIOVector *) (acb->store.children + nqiov);
+    struct iovec *v = (struct iovec *)(q + nqiov);
+
+    /* Second pass: walk the chunks again with the same region rule, this
+     * time building each region's iovec and submitting its write. From here
+     * on nqiov is the index of the next child to create. */
+    start_sec = READ_TABLE(s->table[first_chunk]) * s->chunk_size +
+        (sector_num % s->chunk_size);
+    nqiov = 0;
+    iov_index = 0;
+    iov_left = orig_qiov->iov[0].iov_len;
+    iov_buf = orig_qiov->iov[0].iov_base;
+    prev = READ_TABLE(s->table[first_chunk]);
+
+    /* Data in the first chunk. NOTE(review): first_chunk == last_chunk
+     * cannot hold here, because that case jumped to
+     * handle_one_continuous_region above and returned. */
+    if (first_chunk == last_chunk) {
+        nb = nb_sectors;
+    } else {
+        nb = s->chunk_size - (sector_num % s->chunk_size);
+    }
+
+    for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
+        uint32_t current = READ_TABLE(s->table[chunk]);
+        int64_t data_size;
+        if (chunk < last_chunk) {
+            data_size = s->chunk_size;
+        } else {
+            data_size = (sector_num + nb_sectors) % s->chunk_size;
+            if (data_size == 0) {
+                data_size = s->chunk_size;
+            }
+        }
+
+        if (current == prev + 1) {
+            nb += data_size;    /* Continue the previous region. */
+        } else {
+            /* Terminate the previous continuous region. */
+            niov = setup_iov(orig_qiov->iov, v, &iov_index,
+                             &iov_buf, &iov_left, nb * 512);
+            qemu_iovec_init_external(q, v, niov);
+            QDEBUG("STORE: acb%llu-%p  create_child %d sector_num=%" PRId64
+                   " nb_sectors=%zu niov=%d\n", acb->uuid, acb, nqiov,
+                   start_sec, q->size / 512, q->niov);
+            acb->store.children[nqiov].hd_acb =
+                bdrv_aio_writev(s->fvd_data, s->data_offset + start_sec, q,
+                                q->size / 512, store_data_in_compact_image_cb,
+                                &acb->store.children[nqiov]);
+            if (!acb->store.children[nqiov].hd_acb) {
+                goto fail;
+            }
+            acb->store.children[nqiov].acb = acb;
+            /* Advance to the unused tail of the iovec and qiov arrays. */
+            v += niov;
+            q++;
+            nqiov++;
+            start_sec = current * s->chunk_size; /* Begin of the new region. */
+            nb = data_size;     /* Data in the new region. */
+        }
+        prev = current;
+    }
+
+    /* Request for the last continuous region. */
+    niov = setup_iov(orig_qiov->iov, v, &iov_index, &iov_buf,
+                     &iov_left, nb * 512);
+    ASSERT(iov_index == orig_qiov->niov - 1 && iov_left == 0);
+    qemu_iovec_init_external(q, v, niov);
+
+    QDEBUG("STORE: acb%llu-%p  create_child_last %d sector_num=%" PRId64
+           " nb_sectors=%zu niov=%d\n", acb->uuid, acb, nqiov, start_sec,
+           q->size / 512, q->niov);
+    acb->store.children[nqiov].hd_acb =
+        bdrv_aio_writev(s->fvd_data, s->data_offset + start_sec, q,
+                        q->size / 512, store_data_in_compact_image_cb,
+                        &acb->store.children[nqiov]);
+    if (acb->store.children[nqiov].hd_acb) {
+        acb->store.children[nqiov].acb = acb;
+        return &acb->common;
+    }
+
+    int i;
+fail:
+    /* Submission of child number nqiov failed. Cancel the children with a
+     * lower index, which were submitted successfully, then release the
+     * shared allocation and the acb. */
+    QDEBUG("STORE: acb%llu-%p  failed\n", acb->uuid, acb);
+    for (i = 0; i < nqiov; i++) {
+        bdrv_aio_cancel(acb->store.children[i].hd_acb);
+    }
+    my_qemu_free(acb->store.children);
+    my_qemu_aio_release(acb);
     return NULL;
 }
+
+/* Pick a physical chunk for a newly mapped virtual chunk.
+ *
+ * A chunk from the leaked-chunk list is handed out first if one is left.
+ * Otherwise the chunk at the current end of used storage is taken, after
+ * running 'add_storage_cmd' when that command is configured and the new
+ * chunk would not fit into the available storage.
+ *
+ * Returns the physical chunk id, or EMPTY_TABLE if the storage size cannot
+ * be queried or is still too small after the command ran. */
+static uint32_t allocate_chunk(BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+    uint32_t chunk_id;
+
+    /* Preferred source: a previously leaked chunk. */
+    if (s->next_avail_leaked_chunk < s->num_leaked_chunks) {
+        chunk_id = s->leaked_chunks[s->next_avail_leaked_chunk++];
+        QDEBUG("Reuse leaked physical chunk %u\n", chunk_id);
+
+        if (s->next_avail_leaked_chunk == s->num_leaked_chunks) {
+            /* The list is exhausted: drop it and reset both counters. */
+            my_qemu_free(s->leaked_chunks);
+            s->leaked_chunks = NULL;
+            s->next_avail_leaked_chunk = 0;
+            s->num_leaked_chunks = 0;
+            QDEBUG("All leaked physical chunks reused\n");
+        }
+
+        if (s->chunks_relocated) {
+            return chunk_id;
+        }
+
+        /* First reuse since the flag was clear: set it in memory, then
+         * rewrite the header with the flag set and flush the metadata file.
+         * A failure in any step sets metadata_err_prohibit_write; the chunk
+         * is still handed out. */
+        s->chunks_relocated = true;
+        FvdHeader header;
+        if (read_fvd_header(s, &header)) {
+            s->metadata_err_prohibit_write = true;
+            return chunk_id;
+        }
+        header.chunks_relocated = true;
+        if (update_fvd_header(s, &header) || bdrv_flush(s->fvd_metadata)) {
+            s->metadata_err_prohibit_write = true;
+        }
+        return chunk_id;
+    }
+
+    /* Growing is attempted only when a command is configured and one more
+     * chunk would exceed the available storage. */
+    const bool must_grow = s->add_storage_cmd &&
+        s->used_storage + s->chunk_size > s->avail_storage;
+
+    if (must_grow) {
+        /* A failing command is only reported; the size check below decides
+         * whether the allocation can proceed. */
+        if (system(s->add_storage_cmd)) {
+            fprintf(stderr, "Error in executing %s\n", s->add_storage_cmd);
+        }
+
+        /* Re-read the data file length to learn the new capacity. */
+        int64_t size = bdrv_getlength(s->fvd_data);
+        if (size < 0) {
+            fprintf(stderr, "Error in bdrv_getlength(%s)\n", bs->filename);
+            return EMPTY_TABLE;
+        }
+
+        s->avail_storage = size / 512 - s->data_offset;
+        if (s->avail_storage < s->used_storage + s->chunk_size) {
+            fprintf(stderr, "Could not allocate more storage space.\n");
+            return EMPTY_TABLE;
+        }
+
+        QDEBUG("Increased storage to %" PRId64 " bytes.\n", size);
+    }
+
+    /* Take the chunk that starts at the current end of used storage. */
+    chunk_id = s->used_storage / s->chunk_size;
+    s->used_storage += s->chunk_size;
+    return chunk_id;
+}
+
+/* Completion callback shared by all child writes of a compact-image store.
+ *
+ * 'opaque' is the CompactChildCB of the child that finished and 'ret' is its
+ * result. The first non-zero result is kept in acb->store.ret. Nothing more
+ * happens until the last child has finished; then the request is completed
+ * (user callback plus release of the acb) or handed to
+ * write_metadata_to_journal(), depending on whether a table update is
+ * wanted and still needed. */
+static void store_data_in_compact_image_cb(void *opaque, int ret)
+{
+    CompactChildCB *child = opaque;
+    FvdAIOCB *acb = child->acb;
+    bool entries_dirty = false;
+
+    if (acb->cancel_in_progress) {
+        return;
+    }
+
+    /* Now fvd_aio_cancel_store_compact() won't cancel this child request. */
+    child->hd_acb = NULL;
+
+    if (acb->store.ret != 0) {
+        /* An earlier child already recorded a result; keep that one. */
+        QDEBUG("STORE: acb%llu-%p  store_child=%d total_children=%d error "
+               "ret=%d\n", acb->uuid, acb, acb->store.finished_children,
+               acb->store.num_children, ret);
+    } else {
+        acb->store.ret = ret;
+    }
+
+    if (++acb->store.finished_children < acb->store.num_children) {
+        QDEBUG("STORE: acb%llu-%p  store_finished_children=%d "
+               "total_children=%d\n", acb->uuid, acb,
+               acb->store.finished_children, acb->store.num_children);
+        return;
+    }
+
+    /* All child requests finished. Free buffers. */
+    if (acb->store.children) {
+        my_qemu_free(acb->store.children);
+        acb->store.children = NULL;
+    }
+
+    if (acb->store.ret) {       /* error */
+        QDEBUG("STORE: acb%llu-%p  store_last_child_finished_with_error "
+               "ret=%d\n", acb->uuid, acb, acb->store.ret);
+        goto complete;
+    }
+
+    /* Update the frontier of sectors already written (i.e.,avail_storage).
+     * This affects load_data_from_compact_image(). A load from unwritten
+     * sectors in allocated chunks should return an array of zeros.  Also
+     * check whether the table entries are still dirty. Note that while saving
+     * this write to disk, other writes might have already flushed the dirty
+     * table entries to the journal. If those table entries are no longer
+     * dirty, depending on the behavior of parent_acb, it might be able to
+     * skip a journal update. */
+    {
+        BlockDriverState *bs = acb->common.bs;
+        BDRVFvdState *s = bs->opaque;
+        const uint32_t first_chunk = acb->sector_num / s->chunk_size;
+        const uint32_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+                                    / s->chunk_size;
+        uint32_t chunk;
+
+        for (chunk = first_chunk; chunk <= last_chunk; chunk++) {
+            int64_t end;
+
+            if (chunk != last_chunk) {
+                /* A chunk before the last one is covered up to its end. */
+                end = (READ_TABLE(s->table[chunk]) + 1) * s->chunk_size;
+            } else {
+                /* The last chunk is covered only up to the request's end. */
+                int64_t tail = (acb->sector_num + acb->nb_sectors)
+                               % s->chunk_size;
+                if (tail == 0) {
+                    tail = s->chunk_size;
+                }
+                end = READ_TABLE(s->table[chunk]) * s->chunk_size + tail;
+            }
+
+            if (s->avail_storage < end) {
+                s->avail_storage = end;
+            }
+
+            if (IS_DIRTY(s->table[chunk])) {
+                entries_dirty = true;
+            }
+        }
+    }
+
+    if (!acb->store.update_table) {
+        QDEBUG("STORE: acb%llu-%p  "
+               "store_last_child_finished_without_table_update\n",
+               acb->uuid, acb);
+        goto complete;
+    }
+
+    if (acb->store.parent_acb) {
+        /* Metadata update will be handled by the parent write. */
+        ASSERT(acb->store.parent_acb->type == OP_WRITE);
+        QDEBUG("STORE: acb%llu-%p  "
+               "store_last_child_finished_with_parent_do_table_update\n",
+               acb->uuid, acb);
+        acb->store.parent_acb->write.update_table = entries_dirty;
+        goto complete;
+    }
+
+    if (entries_dirty) {
+        /* No parent: this request writes the journal itself, and no
+         * completion is signalled from this function on this path. */
+        QDEBUG("STORE: acb%llu-%p  "
+               "store_last_child_finished_and_start_table_update\n",
+               acb->uuid, acb);
+        write_metadata_to_journal(acb, false);
+        return;
+    }
+
+    QDEBUG("STORE: acb%llu-%p  "
+           "store_last_child_finished_without_table_update\n",
+           acb->uuid, acb);
+
+complete:
+    /* Report the recorded result to the caller and release the request. */
+    acb->common.cb(acb->common.opaque, acb->store.ret);
+    my_qemu_aio_release(acb);
+}
+
+/* Obtain an acb from fvd_aio_pool and initialize it as an OP_STORE_COMPACT
+ * request with no children yet. Returns NULL if no acb is available. */
+static inline FvdAIOCB *init_store_acb(int soft_write,
+                                       QEMUIOVector * orig_qiov,
+                                       BlockDriverState * bs,
+                                       int64_t sector_num, int nb_sectors,
+                                       FvdAIOCB * parent_acb,
+                                       BlockDriverCompletionFunc * cb,
+                                       void *opaque)
+{
+    FvdAIOCB *store_acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+
+    if (store_acb == NULL) {
+        return NULL;
+    }
+
+    /* Identity and extent of the request. */
+    store_acb->type = OP_STORE_COMPACT;
+    store_acb->cancel_in_progress = false;
+    store_acb->sector_num = sector_num;
+    store_acb->nb_sectors = nb_sectors;
+
+    /* Store-specific state: no child has been created or finished yet. */
+    store_acb->store.soft_write = soft_write;
+    store_acb->store.orig_qiov = orig_qiov;
+    store_acb->store.parent_acb = parent_acb;
+    store_acb->store.finished_children = 0;
+    store_acb->store.num_children = 0;
+    store_acb->store.one_child.hd_acb = NULL;
+    store_acb->store.children = NULL;
+    store_acb->store.ret = 0;
+
+    /* Journal-related fields start out cleared. */
+    store_acb->jcb.iov.iov_base = NULL;
+    store_acb->jcb.hd_acb = NULL;
+    store_acb->jcb.ujnl_next_wait4_recycle.le_prev = NULL;
+
+    COPY_UUID(store_acb, parent_acb);
+    return store_acb;
+}
diff --git a/block/fvd-utils.c b/block/fvd-utils.c
index ff2bb8f..9feaa35 100644
--- a/block/fvd-utils.c
+++ b/block/fvd-utils.c
@@ -42,3 +42,68 @@  static inline void copy_iov(struct iovec *iov, int *p_index,
         left = iov[index].iov_len;
     }
 }
+
+/* Count how many iovec elements are needed to describe the next 'total'
+ * bytes of 'orig_iov', and advance the cursor past those bytes.
+ *
+ * The cursor is (*p_index, *p_buf, *p_left): the current element, the next
+ * unread byte in it, and the bytes still unread in it. If *p_left is zero
+ * on entry, the cursor first moves to the next element.
+ *
+ * Returns the number of elements touched (at least 1). The caller must
+ * ensure 'orig_iov' holds at least 'total' more bytes; the array length is
+ * not known here and is not checked.
+ *
+ * The remaining length is kept as size_t. Narrowing it to int would corrupt
+ * the cursor for elements of 2 GiB or more. */
+static int count_iov(struct iovec *orig_iov, int *p_index, uint8_t ** p_buf,
+                     size_t * p_left, size_t total)
+{
+    int index = *p_index;
+    uint8_t *buf = *p_buf;
+    size_t left = *p_left;
+    int count = 0;
+
+    if (left == 0) {
+        /* Current element is used up; continue with the next one. */
+        index++;
+        buf = orig_iov[index].iov_base;
+        left = orig_iov[index].iov_len;
+    }
+
+    /* Consume whole elements while the request extends beyond them. */
+    while (left < total) {
+        total -= left;
+        index++;
+        buf = orig_iov[index].iov_base;
+        left = orig_iov[index].iov_len;
+        count++;
+    }
+
+    /* The request ends inside (or exactly at the end of) this element. */
+    *p_buf = buf + total;
+    *p_left = left - total;
+    *p_index = index;
+    return count + 1;
+}
+
+/* Fill 'new_iov' with elements describing the next 'total' bytes of
+ * 'orig_iov', and advance the cursor past those bytes.
+ *
+ * The cursor is (*p_index, *p_buf, *p_left): the current element, the next
+ * unread byte in it, and the bytes still unread in it. If *p_left is zero
+ * on entry, the cursor first moves to the next element. The output
+ * elements point into the buffers of 'orig_iov'; no data is copied.
+ *
+ * Returns the number of elements written to 'new_iov' (at least 1). The
+ * caller must ensure that 'orig_iov' holds at least 'total' more bytes and
+ * that 'new_iov' has room for the result; neither is checked here.
+ *
+ * The remaining length is kept as size_t. Narrowing it to int would corrupt
+ * the cursor for elements of 2 GiB or more. */
+static int setup_iov(struct iovec *orig_iov, struct iovec *new_iov,
+                     int *p_index, uint8_t ** p_buf, size_t * p_left,
+                     size_t total)
+{
+    int index = *p_index;
+    uint8_t *buf = *p_buf;
+    size_t left = *p_left;
+    int count = 0;
+
+    if (left == 0) {
+        /* Current element is used up; continue with the next one. */
+        index++;
+        buf = orig_iov[index].iov_base;
+        left = orig_iov[index].iov_len;
+    }
+
+    /* Emit the rest of each element the request extends beyond. */
+    while (left < total) {
+        new_iov[count].iov_base = buf;
+        new_iov[count].iov_len = left;
+        total -= left;
+        index++;
+        buf = orig_iov[index].iov_base;
+        left = orig_iov[index].iov_len;
+        count++;
+    }
+
+    /* Final piece: the request ends inside (or at the end of) this one. */
+    new_iov[count].iov_base = buf;
+    new_iov[count].iov_len = total;
+    *p_buf = buf + total;
+    *p_left = left - total;
+    *p_index = index;
+    return count + 1;
+}