From patchwork Fri Feb 25 22:37:57 2011
X-Patchwork-Submitter: Chunqiang Tang
X-Patchwork-Id: 84587
From: Chunqiang Tang
To: qemu-devel@nongnu.org
Cc: Chunqiang Tang
Date: Fri, 25 Feb 2011 17:37:57 -0500
Message-Id: <1298673486-3573-17-git-send-email-ctang@us.ibm.com>
In-Reply-To: <1298673486-3573-1-git-send-email-ctang@us.ibm.com>
References: <1298673486-3573-1-git-send-email-ctang@us.ibm.com>
Subject: [Qemu-devel] [PATCH 17/26] FVD: add impl of bdrv_flush() and bdrv_aio_flush()

This patch is part of the Fast Virtual Disk (FVD) proposal. See
http://wiki.qemu.org/Features/FVD. This patch adds FVD's implementation of the
bdrv_flush() and bdrv_aio_flush() interfaces.
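
For reference, the contract implemented below is the usual one for QEMU block
drivers: bdrv_aio_flush() returns a BlockDriverAIOCB and later invokes the
caller's completion callback with 0 on success or a negative errno. A minimal
caller-side sketch of that contract (flush_done() and do_flush() are
illustrative names only, not part of this patch):

    #include <stdio.h>
    /* Assumes the usual QEMU block-layer declarations, e.g. "block.h". */

    static void flush_done(void *opaque, int ret)
    {
        /* ret is 0 on success or a negative errno reported by the flush. */
        if (ret < 0) {
            fprintf(stderr, "flush failed: %d\n", ret);
        }
    }

    static void do_flush(BlockDriverState *bs)
    {
        BlockDriverAIOCB *acb = bdrv_aio_flush(bs, flush_done, NULL);
        if (!acb) {
            /* Submission failed; fall back to a synchronous flush. */
            bdrv_flush(bs);
        }
    }
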
Signed-off-by: Chunqiang Tang
---
 block/fvd-flush.c       |  176 +++++++++++++++++++++++++++++++++++++-
 block/fvd-journal-buf.c |  218 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 390 insertions(+), 4 deletions(-)

diff --git a/block/fvd-flush.c b/block/fvd-flush.c
index 34bd5cb..6658d27 100644
--- a/block/fvd-flush.c
+++ b/block/fvd-flush.c
@@ -1,5 +1,5 @@
 /*
- * QEMU Fast Virtual Disk Format bdrv_flush() and bdrv_aio_flush()
+ * QEMU Fast Virtual Disk Format Misc Functions of BlockDriver Interface
  *
  * Copyright IBM, Corp. 2010
  *
@@ -11,14 +11,182 @@
  *
  */
 
+static void aio_wrapper_bh(void *opaque);
+static int bjnl_sync_flush(BlockDriverState * bs);
+static bool bjnl_clean_buf_on_aio_flush(BlockDriverState *bs,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque, BlockDriverAIOCB **p_acb);
+static BlockDriverAIOCB *fvd_aio_flush_start(BlockDriverState * bs,
+                                             BlockDriverCompletionFunc * cb,
+                                             void *opaque, FvdAIOCB *parent_acb);
+
+static int fvd_flush(BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+    int ret;
+
+    QDEBUG("fvd_flush() invoked\n");
+
+    if (s->metadata_err_prohibit_write) {
+        return -EIO;
+    }
+
+    if (!s->fvd_metadata->enable_write_cache) {
+        /* No need to flush since it uses O_DSYNC. */
+        return 0;
+    }
+
+    if (s->use_bjnl) {
+        return bjnl_sync_flush(bs);
+    }
+
+    /* Simply flush for unbuffered journal update. */
+    if ((ret = bdrv_flush(s->fvd_data))) {
+        return ret;
+    }
+    if (s->fvd_metadata == s->fvd_data) {
+        return 0;
+    }
+    return bdrv_flush(s->fvd_metadata);
+}
+
 static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
                                        BlockDriverCompletionFunc * cb,
                                        void *opaque)
 {
-    return NULL;
+    BDRVFvdState *s = bs->opaque;
+    BlockDriverAIOCB * pacb;
+    FvdAIOCB *acb;
+
+    QDEBUG("fvd_aio_flush() invoked\n");
+
+    if (s->metadata_err_prohibit_write) {
+        return NULL;
+    }
+
+    if (!s->fvd_data->enable_write_cache) {
+        /* No need to flush since it uses O_DSYNC. Use a QEMUBH to invoke the
+         * callback. */
+
+        if (!(acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque))) {
+            return NULL;
+        }
+
+        acb->type = OP_WRAPPER;
+        acb->cancel_in_progress = false;
+        acb->wrapper.bh = qemu_bh_new(aio_wrapper_bh, acb);
+        qemu_bh_schedule(acb->wrapper.bh);
+        return &acb->common;
+    }
+
+    if (!s->use_bjnl) {
+        QDEBUG("FLUSH: start now for unbuffered journal update");
+        return fvd_aio_flush_start(bs, cb, opaque, NULL);
+    }
+
+    if (bjnl_clean_buf_on_aio_flush(bs, cb, opaque, &pacb)) {
+        /* Waiting for the journal buffer to be cleaned first. */
+        return pacb;
+    }
+
+    /* No buffered journal data. Start flush now. */
+    QDEBUG("FLUSH: start now as no buffered journal data");
+    return fvd_aio_flush_start(bs, cb, opaque, NULL);
+}
+
+static inline void finish_flush(FvdAIOCB * acb)
+{
+    QDEBUG("FLUSH: acb%llu-%p finish_flush ret=%d\n",
+           acb->uuid, acb, acb->flush.ret);
+    acb->common.cb(acb->common.opaque, acb->flush.ret);
+    my_qemu_aio_release(acb);
 }
 
-static int fvd_flush(BlockDriverState * bs)
+static void flush_data_cb(void *opaque, int ret)
 {
-    return -ENOTSUP;
+    FvdAIOCB *acb = opaque;
+
+    if (acb->cancel_in_progress) {
+        return;
+    }
+
+    QDEBUG("FLUSH: acb%llu-%p flush_data_cb ret=%d\n", acb->uuid, acb, ret);
+
+    if (acb->flush.ret == 0) {
+        acb->flush.ret = ret;
+    }
+
+    acb->flush.data_acb = NULL;
+    acb->flush.num_finished++;
+    if (acb->flush.num_finished == 2) {
+        finish_flush(acb);
+    }
+}
+
+static void flush_metadata_cb(void *opaque, int ret)
+{
+    FvdAIOCB *acb = opaque;
+
+    if (acb->cancel_in_progress) {
+        return;
+    }
+
+    QDEBUG("FLUSH: acb%llu-%p flush_metadata_cb ret=%d\n",
+           acb->uuid, acb, ret);
+
+    if (acb->flush.ret == 0) {
+        acb->flush.ret = ret;
+    }
+
+    acb->flush.metadata_acb = NULL;
+    acb->flush.num_finished++;
+    if (acb->flush.num_finished == 2) {
+        finish_flush(acb);
+    }
+}
+
+static BlockDriverAIOCB *fvd_aio_flush_start(BlockDriverState * bs,
+                                             BlockDriverCompletionFunc * cb,
+                                             void *opaque, FvdAIOCB *parent_acb)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb;
+
+    if (s->fvd_data == s->fvd_metadata) {
+        if (parent_acb) {
+            QDEBUG("FLUSH: acb%llu-%p started.\n",parent_acb->uuid,parent_acb);
+        }
+        return bdrv_aio_flush(s->fvd_metadata, cb, opaque);
+    }
+
+    acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+    COPY_UUID(acb, parent_acb); /* UUID helps debugging. */
+
+    /* fvd_data and fvd_metadata are different. Need to flush both. The order
+     * is not important. If (cache != writethrough && bitmap_updated), a flush
+     * on fvd_data must have already been performed by write_journal_buf(). */
+
+    acb->type = OP_FLUSH;
+    acb->cancel_in_progress = false;
+    acb->flush.num_finished = 0;
+    acb->flush.ret = 0;
+    acb->flush.data_acb = bdrv_aio_flush(s->fvd_data, flush_data_cb, acb);
+    if (!acb->flush.data_acb) {
+        my_qemu_aio_release(acb);
+        return NULL;
+    }
+
+    acb->flush.metadata_acb = bdrv_aio_flush(s->fvd_metadata,
+                                             flush_metadata_cb, acb);
+    if (!acb->flush.metadata_acb) {
+        bdrv_aio_cancel(acb->flush.data_acb);
+        my_qemu_aio_release(acb);
+        return NULL;
+    }
+
+    QDEBUG("FLUSH: acb%llu-%p started.\n", acb->uuid, acb);
+    return &acb->common;
 }
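
The data/metadata pair in fvd_aio_flush_start() above is joined with a simple
counter: each bdrv_aio_flush() callback records the first error and the user's
callback fires only when num_finished reaches 2. The same idea in isolation,
as a sketch with invented names (join_state, part_done; this is not code from
the patch):

    #include <stdio.h>

    /* Hypothetical join of two asynchronous completions (data + metadata). */
    typedef struct {
        int num_finished;              /* how many of the two parts are done */
        int ret;                       /* first error wins; 0 if both succeed */
        void (*cb)(void *opaque, int ret);
        void *opaque;
    } join_state;

    static void part_done(join_state *js, int ret)
    {
        if (js->ret == 0) {
            js->ret = ret;                  /* remember the first failure */
        }
        if (++js->num_finished == 2) {
            js->cb(js->opaque, js->ret);    /* both flushes have completed */
        }
    }

    static void report(void *opaque, int ret)
    {
        printf("both flushes finished, ret=%d\n", ret);
    }

    int main(void)
    {
        join_state js = { 0, 0, report, NULL };
        part_done(&js, 0);    /* e.g. the data flush completed */
        part_done(&js, 0);    /* e.g. the metadata flush completed */
        return 0;
    }
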
diff --git a/block/fvd-journal-buf.c b/block/fvd-journal-buf.c
index b4077ce..e99a585 100644
--- a/block/fvd-journal-buf.c
+++ b/block/fvd-journal-buf.c
@@ -23,6 +23,48 @@
 static inline int bjnl_write_buf(FvdAIOCB *acb);
 static void bjnl_send_current_buf_to_write_queue(BlockDriverState *bs);
 
+/* Return false if no buffered journal data. Invoked by fvd_aio_flush(). */
+static bool bjnl_clean_buf_on_aio_flush(BlockDriverState *bs,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque, BlockDriverAIOCB **p_acb)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb;
+
+    if (!s->bjnl.buf || s->bjnl.buf_used == 0) {
+        /* The current journal buffer is empty. */
+
+        if (QTAILQ_EMPTY(&s->bjnl.queued_bufs)) {
+            return false; /* Indicate no previously buffered journal data. */
+        }
+    } else {
+        QDEBUG("JOURNAL: bjnl_clean_buf_on_aio_flush invoke "
+               "bjnl_send_current_buf_to_write_queue\n");
+        bjnl_send_current_buf_to_write_queue(bs);
+    }
+
+    /* Append an acb at the tail of bjnl.queued_bufs to invoke the aio_flush
+     * callback after all previous pending journal writes finish. See
+     * bjnl_write_next_buf() -> bjnl_write_buf(). */
+
+    acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        *p_acb = NULL; /* Indicate failure. */
+        return true;
+    }
+
+    acb->type = OP_BJNL_FLUSH;
+    acb->cancel_in_progress = false;
+    acb->jcb.iov.iov_base = NULL; /* Indicate no data. */
+    acb->jcb.hd_acb = NULL;
+    acb->jcb.bitmap_updated = false;
+    QTAILQ_INSERT_TAIL(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+    *p_acb = &acb->common;
+
+    QDEBUG("JOURNAL: inserted OP_BJNL_FLUSH acb%llu-%p\n", acb->uuid, acb);
+    return true;
+}
+
 static inline void bjnl_finish_write_buf(FvdAIOCB *acb, int ret)
 {
     ASSERT (acb->type == OP_BJNL_BUF_WRITE);
@@ -65,6 +107,30 @@ static inline void bjnl_aio_flush_cb(void *opaque, int ret)
     my_qemu_aio_release(acb);
 }
 
+/* This acb is inserted by bjnl_clean_buf_on_aio_flush() on behalf of a
+ * pending bdrv_aio_flush(). */
+static inline void bjnl_handle_aio_flush(FvdAIOCB *acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    QTAILQ_REMOVE(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+
+    if (!s->metadata_err_prohibit_write) {
+        /* Buffered data have been written to journal. Now start flush. */
+        QDEBUG("JOURNAL: bjnl_start_flush for acb%llu-%p\n", acb->uuid, acb);
+        acb->jcb.hd_acb = fvd_aio_flush_start(bs, bjnl_aio_flush_cb, acb, acb);
+        if (acb->jcb.hd_acb) {
+            return;
+        }
+    }
+
+    QDEBUG("JOURNAL: bjnl_handle_aio_flush err acb%llu-%p\n", acb->uuid, acb);
+    /* Failed. Invoke aio_flush callback. */
+    acb->common.cb(acb->common.opaque, -EIO);
+    my_qemu_aio_release(acb);
+}
+
 static inline void bjnl_write_buf_cb(void *opaque, int ret)
 {
     FvdAIOCB *acb = (FvdAIOCB *) opaque;
@@ -153,6 +219,14 @@
     QDEBUG("JOURNAL: bjnl_write_buf acb%llu-%p\n", acb->uuid, acb);
 
+    if (acb->type == OP_BJNL_FLUSH) {
+        bjnl_handle_aio_flush(acb);
+
+        /* Return -1 to tell bjnl_write_next_buf() to move on to the next
+         * buffer write as no buffered journal data are being written. */
+        return -1;
+    }
+
     if (!acb->jcb.bitmap_updated) {
         return bjnl_write_buf_start(acb);
     }
 
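
The OP_BJNL_FLUSH handling above relies on ordering within bjnl.queued_bufs:
the flush marker is appended behind any pending buffer writes, so by the time
bjnl_write_buf() reaches it, everything queued before it has already been
written and the real flush can start. A stand-alone sketch of that queue
discipline using the standard <sys/queue.h> TAILQ macros (which QEMU's QTAILQ
mirrors); the names work, drain, and so on are invented for illustration:

    #include <stdio.h>
    #include <sys/queue.h>

    /* Hypothetical work queue mixing buffer writes and flush markers. */
    enum op_type { OP_BUF_WRITE, OP_FLUSH_MARKER };

    struct work {
        enum op_type type;
        TAILQ_ENTRY(work) next;
    };

    TAILQ_HEAD(work_list, work);

    /* Drain the queue in order: writes run first, and a flush marker only
     * triggers its flush once everything queued before it has been handled. */
    static void drain(struct work_list *q)
    {
        struct work *w;
        while ((w = TAILQ_FIRST(q)) != NULL) {
            TAILQ_REMOVE(q, w, next);
            if (w->type == OP_BUF_WRITE) {
                printf("write buffered journal data\n");
            } else {
                printf("start flush (all earlier writes done)\n");
            }
        }
    }

    int main(void)
    {
        struct work_list q = TAILQ_HEAD_INITIALIZER(q);
        struct work w1 = { OP_BUF_WRITE }, w2 = { OP_FLUSH_MARKER };
        TAILQ_INSERT_TAIL(&q, &w1, next);
        TAILQ_INSERT_TAIL(&q, &w2, next);  /* marker goes behind pending writes */
        drain(&q);
        return 0;
    }
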
@@ -313,6 +387,150 @@ static void bjnl_clean_buf_timer_cb(BlockDriverState * bs)
     s->bjnl.timer_scheduled = false;
 }
 
+/* Perform a synchronous flush. Invoked by fvd_close() and fvd_flush(). */
+static int bjnl_sync_flush(BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb, *a;
+    int ret = 0;
+    size_t buf_size;
+    uint8_t *p, *buf = NULL;
+    bool bitmap_updated = false;
+    int nb_sectors;
+    int64_t journal_sec;
+
+    /* Calculate the total buffered metadata updates. Check the current buffer
+     * first. */
+    if (!s->bjnl.buf) {
+        buf_size = 0;
+    } else if (s->bjnl.buf_used == 0) {
+        buf_size = 0;
+    } else {
+        if (s->bjnl.buf_used < s->bjnl.buf_size) {
+            /* Mark the end of the buffer as EMPTY_JRECORD. */
+            *((uint32_t*)(s->bjnl.buf + s->bjnl.buf_used)) = EMPTY_JRECORD;
+        }
+        buf_size = s->bjnl.buf_used = ROUND_UP(s->bjnl.buf_used, 512);
+        bitmap_updated = s->bjnl.buf_contains_bitmap_update;
+    }
+
+    /* Go through the queued buffers. */
+    acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+    if (acb) {
+        if (acb->jcb.hd_acb) {
+            /* The first acb is the ongoing operation. Cancel and re-do it
+             * synchronously below. */
+            QDEBUG("JOURNAL: bjnl_sync_flush cancel ongoing buf_write "
+                   "acb%llu-%p\n", acb->uuid, acb);
+            bdrv_aio_cancel(acb->jcb.hd_acb);
+        }
+
+        /* Calculate buf_size. */
+        while (acb) {
+            if (acb->type == OP_BJNL_BUF_WRITE) {
+                buf_size += acb->jcb.iov.iov_len;
+                if (acb->jcb.bitmap_updated) {
+                    bitmap_updated = true;
+                }
+            }
+            acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+        }
+    }
+
+    if (buf_size == 0) {
+        QDEBUG("JOURNAL: bjnl_sync_flush no_data\n");
+        goto done; /* No buffered metadata updates. */
+    }
+
+    if (bitmap_updated) {
+        /* Need a flush to ensure the correct semantics of copy-on-write in
+         * the event of a host crash. */
+        QDEBUG("JOURNAL: bjnl_sync_flush bitmap_updated flush_fvd_data\n");
+        if ((ret = bdrv_flush(s->fvd_data))) {
+            goto cleanup;
+        }
+    }
+
+    /* Allocate journal sectors. */
+    ASSERT(buf_size % 512 == 0);
+    nb_sectors = buf_size / 512;
+    if (s->next_journal_sector + nb_sectors > s->journal_size) {
+        QDEBUG("JOURNAL: bjnl_sync_flush recycle_journal\n");
+        ret = recycle_journal(bs);
+        /* Journal recycle writes out the entire bitmap and table. Therefore,
+         * there is no need to write buffered metadata updates to journal. */
+        goto done;
+    }
+    journal_sec = s->next_journal_sector;
+    s->next_journal_sector += nb_sectors;
+
+    /* Copy all metadata updates into one buffer. */
+    p = buf = my_qemu_blockalign(s->fvd_metadata, buf_size);
+    acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+    while (acb) {
+        if (acb->type == OP_BJNL_BUF_WRITE) {
+            QDEBUG("JOURNAL: bjnl_sync_flush takes care buf_write acb%llu-%p\n",
+                   acb->uuid, acb);
+            ASSERT(acb->jcb.iov.iov_len > 0);
+            memcpy(p, acb->jcb.iov.iov_base, acb->jcb.iov.iov_len);
+            PRINT_JRECORDS(p, acb->jcb.iov.iov_len);
+            p += acb->jcb.iov.iov_len;
+        }
+        acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+    }
+
+    if (s->bjnl.buf && s->bjnl.buf_used > 0) {
+        /* Copy the current buffer. */
+        memcpy(p, s->bjnl.buf, s->bjnl.buf_used);
+        PRINT_JRECORDS(p, s->bjnl.buf_used);
+    }
+
+    /* Write all metadata updates synchronously. */
+    QDEBUG("JOURNAL: bjnl_sync_flush write_buffer\n");
+    if ((ret=bdrv_write(s->fvd_metadata, s->journal_offset + journal_sec,
+                        buf, nb_sectors)) < 0) {
+        goto cleanup;
+    }
+
+done:
+    /* Flush finally. */
+    QDEBUG("JOURNAL: bjnl_sync_flush do final flush\n");
+    if (s->fvd_data != s->fvd_metadata) {
+        if ((ret = bdrv_flush(s->fvd_data)) != 0) {
+            goto cleanup;
+        }
+    }
+    ret = bdrv_flush(s->fvd_metadata);
+
+cleanup:
+    if (buf) {
+        my_qemu_vfree(buf);
+    }
+    if (s->bjnl.buf) {
+        my_qemu_vfree (s->bjnl.buf);
+        s->bjnl.buf = NULL;
+    }
+
+    acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+    QTAILQ_INIT(&s->bjnl.queued_bufs);
+    while (acb) {
+        if (acb->type == OP_BJNL_BUF_WRITE) {
+            my_qemu_vfree(acb->jcb.iov.iov_base);
+        } else {
+            ASSERT(acb->type == OP_BJNL_FLUSH);
+            /* Invoke the callback for bdrv_aio_flush(). */
+            QDEBUG("JOURNAL: aio_flush acb%llu-%p finished by sync_flush\n",
+                   acb->uuid, acb);
+            acb->common.cb(acb->common.opaque, ret);
+        }
+        a = acb;
+        acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+        my_qemu_aio_release(a);
+    }
+
+    return ret;
+}
+
 #ifdef ENABLE_QDEBUG
 static void print_jrecords(const uint8_t *sector, size_t len)
 {
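
A note on the journal sizing arithmetic in bjnl_sync_flush(): the used portion
of the current buffer is rounded up to whole 512-byte sectors before it is
written, and any tail space in the last sector is stamped with EMPTY_JRECORD
so the recovery scan stops there. The rounding step in isolation, assuming the
usual round-to-a-multiple macro (the snippet is illustrative, not FVD code):

    #include <stddef.h>
    #include <stdio.h>

    #define SECTOR_SIZE 512
    /* Usual round-up-to-a-multiple macro; n must be a power of two here. */
    #define ROUND_UP(x, n) (((x) + (n) - 1) & ~((size_t)(n) - 1))

    int main(void)
    {
        size_t buf_used = 1300;    /* bytes of buffered journal records */
        size_t buf_size = ROUND_UP(buf_used, SECTOR_SIZE);
        int nb_sectors = buf_size / SECTOR_SIZE;

        /* 1300 bytes -> 1536 bytes -> 3 sectors; the last 236 bytes would be
         * marked with an EMPTY_JRECORD terminator before the write. */
        printf("%zu bytes used -> %zu bytes, %d sectors\n",
               buf_used, buf_size, nb_sectors);
        return 0;
    }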