From patchwork Fri Feb 25 22:37:51 2011
X-Patchwork-Submitter: Chunqiang Tang
X-Patchwork-Id: 84620
From: Chunqiang Tang
To: qemu-devel@nongnu.org
Cc: Chunqiang Tang
Date: Fri, 25 Feb 2011 17:37:51 -0500
Message-Id: <1298673486-3573-11-git-send-email-ctang@us.ibm.com>
X-Mailer: git-send-email 1.7.0.4
In-Reply-To: <1298673486-3573-1-git-send-email-ctang@us.ibm.com>
References: <1298673486-3573-1-git-send-email-ctang@us.ibm.com>
Subject: [Qemu-devel] [PATCH 11/26] FVD: add impl of interface bdrv_aio_writev()

This patch is part of the Fast Virtual Disk (FVD) proposal. See
http://wiki.qemu.org/Features/FVD.

This patch adds FVD's implementation of the bdrv_aio_writev() interface. It
supports copy-on-write in FVD.
Signed-off-by: Chunqiang Tang --- block/fvd-bitmap.c | 150 ++++++++++++++++ block/fvd-journal.c | 4 + block/fvd-store.c | 20 +++ block/fvd-write.c | 468 ++++++++++++++++++++++++++++++++++++++++++++++++++- block/fvd.c | 4 +- block/fvd.h | 1 + 6 files changed, 645 insertions(+), 2 deletions(-) create mode 100644 block/fvd-bitmap.c create mode 100644 block/fvd-store.c diff --git a/block/fvd-bitmap.c b/block/fvd-bitmap.c new file mode 100644 index 0000000..7e96201 --- /dev/null +++ b/block/fvd-bitmap.c @@ -0,0 +1,150 @@ +/* + * QEMU Fast Virtual Disk Format Utility Functions for Bitmap + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Chunqiang Tang + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +static inline bool stale_bitmap_show_sector_in_base_img(int64_t sector_num, + const BDRVFvdState * s) +{ + if (sector_num >= s->base_img_sectors) { + return false; + } + + int64_t block_num = sector_num / s->block_size; + int64_t bitmap_byte_offset = block_num / 8; + uint8_t bitmap_bit_offset = block_num % 8; + uint8_t b = s->stale_bitmap[bitmap_byte_offset]; + return 0 == (int)((b >> bitmap_bit_offset) & 0x01); +} + +static inline bool fresh_bitmap_show_sector_in_base_img(int64_t sector_num, + const BDRVFvdState * s) +{ + if (sector_num >= s->base_img_sectors) { + return false; + } + + int64_t block_num = sector_num / s->block_size; + int64_t bitmap_byte_offset = block_num / 8; + uint8_t bitmap_bit_offset = block_num % 8; + uint8_t b = s->fresh_bitmap[bitmap_byte_offset]; + return 0 == (int)((b >> bitmap_bit_offset) & 0x01); +} + +static inline void update_fresh_bitmap(int64_t sector_num, int nb_sectors, + const BDRVFvdState * s) +{ + if (sector_num >= s->base_img_sectors) { + return; + } + + int64_t end = sector_num + nb_sectors; + if (end > s->base_img_sectors) { + end = s->base_img_sectors; + } + + int64_t block_num = sector_num / s->block_size; + int64_t block_end = (end - 1) / s->block_size; + + for (; block_num <= block_end; block_num++) { + int64_t bitmap_byte_offset = block_num / 8; + uint8_t bitmap_bit_offset = block_num % 8; + uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset); + uint8_t b = s->fresh_bitmap[bitmap_byte_offset]; + if (!(b & mask)) { + b |= mask; + s->fresh_bitmap[bitmap_byte_offset] = b; + } + } +} + +static inline bool bitmap_show_sector_in_base_img(int64_t sector_num, + const BDRVFvdState * s, + int bitmap_offset, + uint8_t * bitmap) +{ + if (sector_num >= s->base_img_sectors) { + return false; + } + + int64_t block_num = sector_num / s->block_size; + int64_t bitmap_byte_offset = block_num / 8 - bitmap_offset; + uint8_t bitmap_bit_offset = block_num % 8; + uint8_t b = bitmap[bitmap_byte_offset]; + return 0 == (int)((b >> bitmap_bit_offset) & 0x01); +} + +static inline bool stale_bitmap_need_update(FvdAIOCB * acb) +{ + BlockDriverState *bs = acb->common.bs; + BDRVFvdState *s = bs->opaque; + int64_t end = acb->sector_num + acb->nb_sectors; + + if (end > s->base_img_sectors) { + end = s->base_img_sectors; + } + int64_t block_end = (end - 1) / s->block_size; + int64_t block_num = acb->sector_num / s->block_size; + + for (; block_num <= block_end; block_num++) { + int64_t bitmap_byte_offset = block_num / 8; + uint8_t bitmap_bit_offset = block_num % 8; + uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset); + uint8_t b = s->stale_bitmap[bitmap_byte_offset]; + if (!(b & mask)) { + return true; + } + } + + return false; +} + +/* Return true if stable_bitmap needs update. 
*/ +static bool update_fresh_bitmap_and_check_stale_bitmap(FvdAIOCB * acb) +{ + BlockDriverState *bs = acb->common.bs; + BDRVFvdState *s = bs->opaque; + + if (acb->sector_num >= s->base_img_sectors) { + return false; + } + + bool need_update = false; + int64_t end = acb->sector_num + acb->nb_sectors; + + if (end > s->base_img_sectors) { + end = s->base_img_sectors; + } + + int64_t block_end = (end - 1) / s->block_size; + int64_t block_num = acb->sector_num / s->block_size; + + for (; block_num <= block_end; block_num++) { + int64_t bitmap_byte_offset = block_num / 8; + uint8_t bitmap_bit_offset = block_num % 8; + uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset); + uint8_t b = s->stale_bitmap[bitmap_byte_offset]; + if (b & mask) { + /* If the bit in stale_bitmap is set, the corresponding bit in + * fresh_bitmap must be set already. */ + continue; + } + + need_update = true; + b = s->fresh_bitmap[bitmap_byte_offset]; + if (!(b & mask)) { + b |= mask; + s->fresh_bitmap[bitmap_byte_offset] = b; + } + } + + return need_update; +} diff --git a/block/fvd-journal.c b/block/fvd-journal.c index 5ba34bd..2edfc70 100644 --- a/block/fvd-journal.c +++ b/block/fvd-journal.c @@ -28,6 +28,10 @@ static int init_journal(int read_only, BlockDriverState * bs, return -ENOTSUP; } +static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap) +{ +} + void fvd_emulate_host_crash(bool cond) { emulate_host_crash = cond; diff --git a/block/fvd-store.c b/block/fvd-store.c new file mode 100644 index 0000000..85e45d4 --- /dev/null +++ b/block/fvd-store.c @@ -0,0 +1,20 @@ +/* + * QEMU Fast Virtual Disk Format Store Data in Compact Image + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Chunqiang Tang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +static inline BlockDriverAIOCB *store_data(int soft_write, + FvdAIOCB * parent_acb, BlockDriverState * bs, + int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors, + BlockDriverCompletionFunc * cb, void *opaque) +{ + return NULL; +} diff --git a/block/fvd-write.c b/block/fvd-write.c index a736a37..f0580d4 100644 --- a/block/fvd-write.c +++ b/block/fvd-write.c @@ -11,11 +11,477 @@ * */ +static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap); +static int do_aio_write(struct FvdAIOCB *acb); +static void restart_dependent_writes(struct FvdAIOCB *acb); +static void free_write_resource(struct FvdAIOCB *acb); +static inline BlockDriverAIOCB *store_data(int soft_write, + FvdAIOCB * parent_acb, BlockDriverState * bs, + int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors, + BlockDriverCompletionFunc * cb, void *opaque); + +static inline void init_data_region(BDRVFvdState * s) +{ + bdrv_truncate(s->fvd_data, s->data_offset * 512 + s->virtual_disk_size); + s->data_region_prepared = true; +} + static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs, int64_t sector_num, QEMUIOVector * qiov, int nb_sectors, BlockDriverCompletionFunc * cb, void *opaque) { - return NULL; + BDRVFvdState *s = bs->opaque; + FvdAIOCB *acb; + + TRACE_REQUEST(true, sector_num, nb_sectors); + + if (s->metadata_err_prohibit_write) { + return NULL; + } + + if (!s->data_region_prepared) { + init_data_region(s); + } + + if (s->prefetch_state == PREFETCH_STATE_FINISHED + || sector_num >= s->base_img_sectors) { + /* This is an efficient case. See Section 3.3.5 of the FVD-cow paper. + * This also covers the case of no base image. 
*/ + return store_data(false, NULL, bs, sector_num, qiov, + nb_sectors, cb, opaque); + } + + /* Check if all requested sectors are in the FVD data file. */ + int64_t sec = ROUND_DOWN(sector_num, s->block_size); + int64_t sec_in_last_block = ROUND_DOWN(sector_num + nb_sectors - 1, + s->block_size); + do { + if (stale_bitmap_show_sector_in_base_img(sec, s)) { + goto slow_path; + } + sec += s->block_size; + } while (sec <= sec_in_last_block); + + /* This is the fast path, as all requested data are in the FVD data file + * and no need to update the bitmap. */ + return store_data(false, NULL, bs, sector_num, qiov, + nb_sectors, cb, opaque); + +slow_path: + acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque); + if (!acb) { + return NULL; + } + + acb->type = OP_WRITE; + acb->cancel_in_progress = false; + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + acb->write.ret = 0; + acb->write.update_table = false; + acb->write.qiov = qiov; + acb->write.hd_acb = NULL; + acb->write.cow_buf = NULL; + acb->copy_lock.next.le_prev = NULL; + acb->write.next_write_lock.le_prev = NULL; + acb->write.next_dependent_write.le_prev = NULL; + acb->jcb.iov.iov_base = NULL; + acb->jcb.hd_acb = NULL; + acb->jcb.ujnl_next_wait4_recycle.le_prev = NULL; + QLIST_INIT(&acb->copy_lock.dependent_writes); + + QDEBUG("WRITE: acb%llu-%p start sector_num=%" PRId64 " nb_sectors=%d\n", + acb->uuid, acb, acb->sector_num, acb->nb_sectors); + + if (do_aio_write(acb) < 0) { + my_qemu_aio_release(acb); + return NULL; + } +#ifdef FVD_DEBUG + pending_local_writes++; +#endif + return &acb->common; +} + +static void free_write_resource(FvdAIOCB * acb) +{ + if (acb->write.next_write_lock.le_prev) { + QLIST_REMOVE(acb, write.next_write_lock); + } + if (acb->copy_lock.next.le_prev) { + QLIST_REMOVE(acb, copy_lock.next); + restart_dependent_writes(acb); + } + if (acb->write.cow_buf) { + my_qemu_vfree(acb->write.cow_buf); + } + if (acb->jcb.iov.iov_base != NULL) { + my_qemu_vfree(acb->jcb.iov.iov_base); + } + + my_qemu_aio_release(acb); + +#ifdef FVD_DEBUG + pending_local_writes--; +#endif +} + +static inline void finish_write(FvdAIOCB * acb, int ret) +{ + QDEBUG("WRITE: acb%llu-%p completely_finished ret=%d\n", acb->uuid, acb, + ret); + acb->common.cb(acb->common.opaque, ret); + free_write_resource(acb); +} + +static void write_data_cb(void *opaque, int ret) +{ + FvdAIOCB *acb = opaque; + BlockDriverState *bs = acb->common.bs; + BDRVFvdState *s = bs->opaque; + + if (acb->cancel_in_progress) { + return; + } + + acb->write.ret = ret; + acb->write.hd_acb = NULL; + + if (ret != 0) { + QDEBUG("WRITE: acb%llu-%p write_data_cb error ret=%d\n", + acb->uuid, acb, ret); + finish_write(acb, ret); + return; + } + + QDEBUG("WRITE: acb%llu-%p write_data_cb\n", acb->uuid, acb); + + /* Figure out whether to update metadata or not. */ + if (s->fresh_bitmap == s->stale_bitmap) { + /* Neither copy_on_read nor prefetching is enabled. Cannot update + * fresh_bitmap until the on-disk metadata is updated. */ + if (stale_bitmap_need_update(acb)) { + write_metadata_to_journal(acb, true); + } else if (acb->write.update_table) { + write_metadata_to_journal(acb, false); + } else { + finish_write(acb, ret); /* No need to update metadata. */ + } + + return; + } + + /* stale_bitmap and fresh_bitmap are different. Update fresh_bitmap now + * and stale_bitmap will be updated after on-disk metadata are updated. */ + bool bitmap_need_update = update_fresh_bitmap_and_check_stale_bitmap(acb); + + /* Release lock on data now since fresh_bitmap has been updated. 
*/ + QLIST_REMOVE(acb, write.next_write_lock); + acb->write.next_write_lock.le_prev = NULL; + if (acb->copy_lock.next.le_prev) { + QLIST_REMOVE(acb, copy_lock.next); + restart_dependent_writes(acb); + } + + if (bitmap_need_update) { + write_metadata_to_journal(acb, true); + } else if (acb->write.update_table) { + write_metadata_to_journal(acb, false); + } else { + finish_write(acb, ret); + } +} + +static void read_backing_for_copy_on_write_cb(void *opaque, int ret) +{ + FvdAIOCB *acb = (FvdAIOCB *) opaque; + BlockDriverState *bs = acb->common.bs; + + if (acb->cancel_in_progress) { + return; + } + + if (ret != 0) { + QDEBUG("WRITE: acb%llu-%p read_backing with error " + "ret=%d\n", acb->uuid, acb, ret); + finish_write(acb, ret); + } else { + QDEBUG("WRITE: acb%llu-%p " + "finish_read_from_backing_and_start_write_data\n", + acb->uuid, acb); + acb->write.hd_acb = store_data(false, acb, bs, + acb->write.cow_start_sector, + acb->write.cow_qiov, + acb->write.cow_qiov->size / 512, + write_data_cb, acb); + if (!acb->write.hd_acb) { + finish_write(acb, -EIO); + } + } +} + +static int do_aio_write(FvdAIOCB * acb) +{ + BlockDriverState *bs = acb->common.bs; + BDRVFvdState *s = bs->opaque; + + /* Calculate the data region need be locked. */ + const int64_t sector_end = acb->sector_num + acb->nb_sectors; + const int64_t block_begin = ROUND_DOWN(acb->sector_num, s->block_size); + int64_t block_end = ROUND_UP(sector_end, s->block_size); + + /* Check for conflicting copy-on-reads. */ + FvdAIOCB *old; + QLIST_FOREACH(old, &s->copy_locks, copy_lock.next) { + if (old->copy_lock.end > acb->sector_num && + sector_end > old->copy_lock.begin) { + QLIST_INSERT_HEAD(&old->copy_lock.dependent_writes, acb, + write.next_dependent_write); + QDEBUG("WRITE: acb%llu-%p put_on_hold_due_to_data_conflict " + "with %s acb%llu-%p\n", acb->uuid, acb, + old->type == OP_WRITE ? "write" : "copy_on_read", + old->uuid, old); + return 0; + } + } + + /* No conflict. check if this write updates partial blocks and need to + * read those blocks from the base image and merge with this write. */ + int read_first_block, read_last_block; + if (acb->sector_num % s->block_size == 0) { + read_first_block = false; + } else if (fresh_bitmap_show_sector_in_base_img(acb->sector_num, s)) { + read_first_block = true; + } else { + read_first_block = false; + } + + if (sector_end % s->block_size == 0) { + read_last_block = false; + } else if (fresh_bitmap_show_sector_in_base_img(sector_end, s)) { + read_last_block = true; + } else { + read_last_block = false; + } + + if (read_first_block) { + if (read_last_block) { + /* Case 1: Read all the blocks involved from the base image. */ + const QEMUIOVector *old_qiov = acb->write.qiov; + if (block_end > s->base_img_sectors) { + block_end = s->base_img_sectors; + } + + int buf_size = (block_end - block_begin) * 512 + + 2 * sizeof(QEMUIOVector) + + sizeof(struct iovec) * (old_qiov->niov + 3); + buf_size = ROUND_UP(buf_size, 512); + acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size); + + /* For reading from the base image. */ + QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf + + (block_end - block_begin) * 512); + read_qiov->iov = (struct iovec *)(read_qiov + 1); + read_qiov->nalloc = -1; + read_qiov->niov = 1; + read_qiov->iov[0].iov_base = acb->write.cow_buf; + read_qiov->iov[0].iov_len = read_qiov->size = + (block_end - block_begin) * 512; + + /* For writing to the FVD data file. 
*/ + QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1); + write_qiov->iov = (struct iovec *)(write_qiov + 1); + write_qiov->nalloc = -1; + write_qiov->niov = old_qiov->niov + 2; + write_qiov->size = read_qiov->size; + + /* The first entry is for data read from the base image. */ + write_qiov->iov[0].iov_base = acb->write.cow_buf; + write_qiov->iov[0].iov_len = (acb->sector_num - block_begin) * 512; + memcpy(&write_qiov->iov[1], old_qiov->iov, + sizeof(struct iovec) * old_qiov->niov); + + /* The last entry is for data read from the base image. */ + const int last = old_qiov->niov + 1; + write_qiov->iov[last].iov_base = acb->write.cow_buf + + (sector_end - block_begin) * 512; + write_qiov->iov[last].iov_len = (block_end - sector_end) * 512; + acb->write.cow_qiov = write_qiov; + acb->write.cow_start_sector = block_begin; + + acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd, block_begin, + read_qiov, block_end - block_begin, + read_backing_for_copy_on_write_cb, acb); + if (!acb->write.hd_acb) { + goto fail; + } + + acb->copy_lock.begin = block_begin; + acb->copy_lock.end = block_end; + QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next); + QDEBUG("WRITE: acb%llu-%p " + "read_first_last_partial_blocks_from_backing sector_num=%" + PRId64 " nb_sectors=%d\n", acb->uuid, acb, block_begin, + (int)(block_end - block_begin)); + } else { + /* Case 2: Read the first block from the base image. */ + int nb = acb->sector_num - block_begin; + const QEMUIOVector *old_qiov = acb->write.qiov; + + /* Space for data and metadata. */ + int buf_size = nb * 512 + 2 * sizeof(QEMUIOVector) + + sizeof(struct iovec) * (old_qiov->niov + 2); + buf_size = ROUND_UP(buf_size, 512); + acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size); + + /* For reading from the base image. */ + QEMUIOVector *read_qiov = + (QEMUIOVector *) (acb->write.cow_buf + nb * 512); + read_qiov->iov = (struct iovec *)(read_qiov + 1); + read_qiov->nalloc = -1; + read_qiov->niov = 1; + read_qiov->iov[0].iov_base = acb->write.cow_buf; + read_qiov->iov[0].iov_len = read_qiov->size = nb * 512; + + /* For writing to the FVD data file. */ + QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1); + write_qiov->iov = (struct iovec *)(write_qiov + 1); + write_qiov->nalloc = -1; + write_qiov->niov = old_qiov->niov + 1; + write_qiov->size = old_qiov->size + read_qiov->size; + + /* The first entry is added for data read from the base image. */ + write_qiov->iov[0].iov_base = acb->write.cow_buf; + write_qiov->iov[0].iov_len = read_qiov->size; + memcpy(&write_qiov->iov[1], old_qiov->iov, + sizeof(struct iovec) * old_qiov->niov); + acb->write.cow_qiov = write_qiov; + acb->write.cow_start_sector = block_begin; + + acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd, + block_begin, read_qiov, nb, + read_backing_for_copy_on_write_cb, acb); + if (!acb->write.hd_acb) { + goto fail; + } + + acb->copy_lock.begin = block_begin; + acb->copy_lock.end = block_begin + s->block_size; + QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next); + QDEBUG("WRITE: acb%llu-%p read_first_partial_block_from_backing " + "sector_num=%" PRId64 " nb_sectors=%d\n", + acb->uuid, acb, block_begin, nb); + } + } else { + if (read_last_block) { + /* Case 3: Read the last block from the base image. */ + int nb; + if (block_end < s->base_img_sectors) { + nb = block_end - sector_end; + } else { + nb = s->base_img_sectors - sector_end; + } + const QEMUIOVector *old_qiov = acb->write.qiov; + + /* Space for data and metadata. 
*/ + int buf_size = nb * 512 + 2 * sizeof(QEMUIOVector) + + sizeof(struct iovec) * (old_qiov->niov + 2); + buf_size = ROUND_UP(buf_size, 512); + acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size); + + /* For reading from the base image. */ + QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf + + nb * 512); + read_qiov->iov = (struct iovec *)(read_qiov + 1); + read_qiov->nalloc = -1; + read_qiov->niov = 1; + read_qiov->iov[0].iov_base = acb->write.cow_buf; + read_qiov->iov[0].iov_len = read_qiov->size = nb * 512; + + /* For writing to the FVD data file. */ + QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1); + write_qiov->iov = (struct iovec *)(write_qiov + 1); + write_qiov->nalloc = -1; + write_qiov->niov = old_qiov->niov + 1; + write_qiov->size = old_qiov->size + read_qiov->size; + memcpy(write_qiov->iov, old_qiov->iov, + sizeof(struct iovec) * old_qiov->niov); + + /* The last appended entry is for data read from the base image. */ + write_qiov->iov[old_qiov->niov].iov_base = acb->write.cow_buf; + write_qiov->iov[old_qiov->niov].iov_len = read_qiov->size; + acb->write.cow_qiov = write_qiov; + acb->write.cow_start_sector = acb->sector_num; + + acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd, + sector_end, read_qiov, nb, + read_backing_for_copy_on_write_cb, acb); + if (!acb->write.hd_acb) { + goto fail; + } + + acb->copy_lock.end = block_end; + acb->copy_lock.begin = block_end - s->block_size; + QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next); + QDEBUG("WRITE: acb%llu-%p read_last_partial_block_from_backing " + "sector_num=%" PRId64 " nb_sectors=%d\n", + acb->uuid, acb, sector_end, nb); + } else { + /* Case 4: Can write directly and no need to merge with data from + * the base image. */ + QDEBUG("WRITE: acb%llu-%p " + "write_fvd_without_read_partial_block_from_backing\n", + acb->uuid, acb); + acb->write.hd_acb = store_data(false, acb, bs, acb->sector_num, + acb->write.qiov, acb->nb_sectors, + write_data_cb, acb); + if (!acb->write.hd_acb) { + goto fail; + } + } + } + + QLIST_INSERT_HEAD(&s->write_locks, acb, write.next_write_lock); + return 0; + +fail: + if (acb->write.cow_buf) { + my_qemu_vfree(acb->write.cow_buf); + } + return -EIO; +} + +static void restart_dependent_writes(FvdAIOCB * acb) +{ + acb->copy_lock.next.le_prev = NULL; + FvdAIOCB *req = acb->copy_lock.dependent_writes.lh_first; + + while (req) { + /* Keep a copy of 'next' as it may be changed in do_aiO_write(). */ + FvdAIOCB *next = req->write.next_dependent_write.le_next; + + /* Indicate that this write is no longer on any depedent list. This + * helps fvd_aio_cancel_read() work properly. */ + req->write.next_dependent_write.le_prev = NULL; + + if (acb->type == OP_WRITE) { + QDEBUG("WRITE: acb%llu-%p finished_and_restart_conflict_write " + "acb%llu-%p\n", acb->uuid, acb, req->uuid, req); + } else { + QDEBUG("READ: copy_on_read acb%llu-%p " + "finished_and_restart_conflict_write acb%llu-%p\n", + acb->uuid, acb, req->uuid, req); + } + + if (do_aio_write(req) < 0) { + QDEBUG("WRITE: acb%llu-%p finished with error ret=%d\n", + req->uuid, req, -1); + req->common.cb(req->common.opaque, -1); + my_qemu_aio_release(req); + } + + req = next; + } } diff --git a/block/fvd.c b/block/fvd.c index e41f419..5b3dcac 100644 --- a/block/fvd.c +++ b/block/fvd.c @@ -27,11 +27,13 @@ * function optimization. 
*/ #include "block/fvd-debug.c" #include "block/fvd-flush.c" +#include "block/fvd-bitmap.c" #include "block/fvd-misc.c" #include "block/fvd-create.c" #include "block/fvd-open.c" -#include "block/fvd-read.c" #include "block/fvd-write.c" +#include "block/fvd-read.c" +#include "block/fvd-store.c" #include "block/fvd-journal.c" #include "block/fvd-prefetch.c" #include "block/fvd-update.c" diff --git a/block/fvd.h b/block/fvd.h index 9847e7f..34ea2b4 100644 --- a/block/fvd.h +++ b/block/fvd.h @@ -432,6 +432,7 @@ typedef struct FvdAIOCB { #endif } FvdAIOCB; +static AIOPool fvd_aio_pool; static BlockDriver bdrv_fvd; static QEMUOptionParameter fvd_create_options[]; static QEMUOptionParameter fvd_update_options[];
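
A note for readers new to FVD's metadata layout: the bit arithmetic in
fvd-bitmap.c all follows one pattern. Each block of block_size sectors owns
one bit; a clear bit means the block's data still lives in the base image, a
set bit means it has been written into the FVD data file. The standalone
sketch below shows the same sector -> block -> byte/bit mapping and the
mark-on-write loop. It is illustrative only: struct demo_state,
sector_in_base_img() and mark_blocks_written() are invented names for this
note, not the driver's types, and there is no journaling or second bitmap.

/* gcc -std=c99 -o bitmap_demo bitmap_demo.c */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for the driver state: sizes are in 512-byte sectors,
 * one bitmap bit per block of block_size sectors. */
struct demo_state {
    int64_t base_img_sectors;   /* size of the base image, in sectors */
    int64_t block_size;         /* sectors per block (bitmap granularity) */
    uint8_t *bitmap;            /* 0 = block still in base image, 1 = written */
};

/* Same lookup shape as *_bitmap_show_sector_in_base_img():
 * sector -> block number -> byte offset and bit offset in the bitmap. */
static bool sector_in_base_img(int64_t sector_num, const struct demo_state *s)
{
    if (sector_num >= s->base_img_sectors) {
        return false;               /* beyond the base image: never COW */
    }
    int64_t block_num = sector_num / s->block_size;
    int64_t byte_offset = block_num / 8;
    uint8_t bit_offset = block_num % 8;
    return ((s->bitmap[byte_offset] >> bit_offset) & 0x01) == 0;
}

/* Same loop shape as update_fresh_bitmap(): set the bit of every block that
 * a write of nb_sectors starting at sector_num touches. */
static void mark_blocks_written(int64_t sector_num, int nb_sectors,
                                struct demo_state *s)
{
    if (sector_num >= s->base_img_sectors) {
        return;
    }
    int64_t end = sector_num + nb_sectors;
    if (end > s->base_img_sectors) {
        end = s->base_img_sectors;
    }
    for (int64_t blk = sector_num / s->block_size;
         blk <= (end - 1) / s->block_size; blk++) {
        s->bitmap[blk / 8] |= (uint8_t)(1u << (blk % 8));
    }
}

int main(void)
{
    uint8_t bits[16];
    memset(bits, 0, sizeof(bits));
    struct demo_state s = {
        .base_img_sectors = 128 * 128,   /* 128 blocks of 128 sectors each */
        .block_size = 128,
        .bitmap = bits,
    };

    printf("sector 300 in base image? %d\n", sector_in_base_img(300, &s));
    mark_blocks_written(256, 64, &s);    /* write lands in block 2 */
    printf("sector 300 in base image? %d\n", sector_in_base_img(300, &s));
    return 0;
}

The fast path in fvd_aio_writev() is just this check applied to every block
covered by the request: if no block is still in the base image, the write
goes straight to store_data() with no bitmap or journal update.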
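
The partial-block cases in do_aio_write() (cases 1-3) reduce to one idea:
when a write does not start or end on a block boundary and the surrounding
block still lives in the base image, the missing head and/or tail of the
block is read from the backing file and stitched around the guest's own
iovecs, so that a whole aligned block can be stored in a single request.
Below is a minimal sketch of that stitching, using plain struct iovec and a
memcpy in place of QEMUIOVector and the asynchronous backing read; the
buffer names and the build_merged_iov() helper are assumptions made for this
example, not code from the patch.

/* gcc -std=c99 -o cow_merge_demo cow_merge_demo.c */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

#define SECTOR 512

/* Build a scatter-gather list that covers the whole aligned region
 * [block_begin, block_end) by surrounding the guest's iovecs with head and
 * tail padding taken from cow_buf (which, in the driver, would hold data
 * read from the base image). Returns the number of entries written to
 * 'out', at most niov + 2. */
static int build_merged_iov(uint8_t *cow_buf,
                            int64_t block_begin, int64_t block_end,
                            int64_t sector_num, int nb_sectors,
                            const struct iovec *guest_iov, int niov,
                            struct iovec *out)
{
    int n = 0;
    size_t head = (size_t)(sector_num - block_begin) * SECTOR;
    size_t tail = (size_t)(block_end - (sector_num + nb_sectors)) * SECTOR;

    if (head) {     /* unaligned head: base-image bytes before the write */
        out[n].iov_base = cow_buf;
        out[n].iov_len = head;
        n++;
    }
    memcpy(&out[n], guest_iov, sizeof(*guest_iov) * niov);   /* guest data */
    n += niov;
    if (tail) {     /* unaligned tail: base-image bytes after the write */
        out[n].iov_base =
            cow_buf + (size_t)(sector_num + nb_sectors - block_begin) * SECTOR;
        out[n].iov_len = tail;
        n++;
    }
    return n;
}

int main(void)
{
    /* One 8-sector block; the guest writes sectors 2..5 of it. */
    int64_t block_begin = 0, block_end = 8, sector_num = 2;
    int nb_sectors = 4;

    uint8_t *cow_buf = calloc(8, SECTOR);     /* stands in for base-image data */
    uint8_t *guest = malloc(4 * SECTOR);
    memset(guest, 0xAB, 4 * SECTOR);

    struct iovec guest_iov = { .iov_base = guest, .iov_len = 4 * SECTOR };
    struct iovec merged[3];
    int n = build_merged_iov(cow_buf, block_begin, block_end,
                             sector_num, nb_sectors, &guest_iov, 1, merged);

    size_t total = 0;
    for (int i = 0; i < n; i++) {
        printf("iov[%d]: len=%zu\n", i, merged[i].iov_len);
        total += merged[i].iov_len;
    }
    printf("total=%zu (expect %d)\n", total, 8 * SECTOR);
    free(cow_buf);
    free(guest);
    return 0;
}

In the patch, the head and tail entries of write_qiov point directly into
cow_buf, so the backing read and the guest data are written out together
without an extra copy.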
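
Finally, the copy_locks list is what serializes writes against in-flight
copy-on-write regions: a write whose sector range overlaps
[copy_lock.begin, copy_lock.end) of an older request is parked on that
request's dependent_writes list and re-driven by restart_dependent_writes()
once the copy finishes. A toy illustration of the overlap test and the
parking step follows, with a simple singly linked field in place of the
QLIST macros; struct demo_req and its members are invented for this note
and do not match FvdAIOCB.

/* gcc -std=c99 -o copy_lock_demo copy_lock_demo.c */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Trimmed-down request: the sector range it writes and the range it holds
 * a copy lock on (zero width when it holds no lock). */
struct demo_req {
    int64_t sector_num;
    int nb_sectors;
    int64_t lock_begin, lock_end;
    struct demo_req *next_dependent;   /* singly linked wait queue */
};

/* Same interval test as do_aio_write(): a new write conflicts with an
 * in-flight copy-on-write iff their half-open sector ranges overlap. */
static bool conflicts(const struct demo_req *locker, const struct demo_req *req)
{
    int64_t end = req->sector_num + req->nb_sectors;
    return locker->lock_end > req->sector_num && end > locker->lock_begin;
}

int main(void)
{
    /* Request A is copying block [0, 128) from the base image. */
    struct demo_req a = { .sector_num = 10, .nb_sectors = 4,
                          .lock_begin = 0, .lock_end = 128 };
    /* Request B writes sectors [100, 140): overlaps A's locked block. */
    struct demo_req b = { .sector_num = 100, .nb_sectors = 40 };
    /* Request C writes sectors [128, 160): no overlap, proceeds at once. */
    struct demo_req c = { .sector_num = 128, .nb_sectors = 32 };

    if (conflicts(&a, &b)) {
        a.next_dependent = &b;          /* park B until A's copy finishes */
        printf("B queued behind A\n");
    }
    printf("C conflicts with A? %s\n", conflicts(&a, &c) ? "yes" : "no");

    /* When A completes, it would walk next_dependent and re-issue each
     * parked write, which is what restart_dependent_writes() does. */
    return 0;
}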