From patchwork Tue Sep 1 13:51:52 2009
X-Patchwork-Submitter: Kevin Wolf
X-Patchwork-Id: 32749
From: Kevin Wolf
To: qemu-devel@nongnu.org
Cc: Kevin Wolf
Date: Tue, 1 Sep 2009 15:51:52 +0200
Message-Id: <1251813112-17408-4-git-send-email-kwolf@redhat.com>
In-Reply-To: <1251813112-17408-1-git-send-email-kwolf@redhat.com>
References: <1251813112-17408-1-git-send-email-kwolf@redhat.com>
Subject: [Qemu-devel] [PATCH 3/3] qcow2: Add bdrv_aio_multiwrite implementation

One performance problem of qcow2 during initial image growth is sequential
writes that are not cluster aligned. In this case, when a first request
allocates a new cluster but writes only to the first few sectors in that
cluster, the rest of the cluster is zeroed, just to be overwritten by the
following second request that fills up the cluster. Let's try to merge
sequential write requests to the same cluster, so we can avoid writing the
zero padding to the disk in the first place.
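
Not part of the patch, just to illustrate the arithmetic used below: sectors
are 512 bytes, so shifting a sector number right by (cluster_bits - 9) yields
the cluster index. A minimal standalone sketch, assuming the qcow2 default of
64k clusters (cluster_bits = 16) and made-up sector numbers:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        int cluster_bits = 16;                  /* 64k clusters */
        uint64_t prev_sector = 0, prev_nb = 8;  /* previous request: sectors 0..7 */
        uint64_t next_sector = 8;               /* next request starts right after */

        /* Last cluster of the previous request, first cluster of the next one */
        uint64_t end_cluster_prev =
            (prev_sector + prev_nb - 1) >> (cluster_bits - 9);
        uint64_t start_cluster = next_sector >> (cluster_bits - 9);

        /* Same cluster and exactly sequential: the requests can be merged */
        if (start_cluster == end_cluster_prev &&
            next_sector == prev_sector + prev_nb) {
            printf("mergeable in cluster %" PRIu64 "\n", start_cluster);
        }
        return 0;
    }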
Signed-off-by: Kevin Wolf
---
 block/qcow2.c |  134 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 cutils.c      |   17 +++++++
 qemu-common.h |    1 +
 3 files changed, 152 insertions(+), 0 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 8579e01..9fc5cec 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -646,6 +646,139 @@ static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
     return &acb->common;
 }
 
+typedef struct QcowMultiwriteCB {
+    int error;
+    int num_requests;
+    int num_callbacks;
+    struct {
+        BlockDriverCompletionFunc *cb;
+        void *opaque;
+        QEMUIOVector *free_qiov;
+    } callbacks[];
+} QcowMultiwriteCB;
+
+static void qcow_multiwrite_user_cb(QcowMultiwriteCB *mcb)
+{
+    int i;
+
+    for (i = 0; i < mcb->num_callbacks; i++) {
+        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
+        qemu_free(mcb->callbacks[i].free_qiov);
+    }
+}
+
+static void qcow_multiwrite_cb(void *opaque, int ret)
+{
+    QcowMultiwriteCB *mcb = opaque;
+
+    // Report the error only once, even if several requests fail
+    if (ret < 0 && !mcb->error) {
+        mcb->error = ret;
+        qcow_multiwrite_user_cb(mcb);
+    }
+
+    mcb->num_requests--;
+    if (mcb->num_requests == 0) {
+        if (mcb->error == 0) {
+            qcow_multiwrite_user_cb(mcb);
+        }
+        qemu_free(mcb);
+    }
+}
+
+static int qcow_multiwrite_req_compare(const void *a, const void *b)
+{
+    return (((BlockRequest*) a)->sector - ((BlockRequest*) b)->sector);
+}
+
+static int qcow_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs,
+    int num_reqs)
+{
+    BDRVQcowState *s = bs->opaque;
+    QcowMultiwriteCB *mcb;
+    BlockDriverAIOCB *acb;
+    int i, outidx;
+
+    // Sort requests by start sector
+    qsort(reqs, num_reqs, sizeof(*reqs), &qcow_multiwrite_req_compare);
+
+    // Create QcowMultiwriteCB structure
+    mcb = qemu_mallocz(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
+    mcb->num_requests = 0;
+    mcb->num_callbacks = num_reqs;
+
+    for (i = 0; i < num_reqs; i++) {
+        mcb->callbacks[i].cb = reqs[i].cb;
+        mcb->callbacks[i].opaque = reqs[i].opaque;
+    }
+
+    // Check if adjacent requests touch the same clusters. If so, combine them,
+    // filling up gaps with zero sectors.
+    outidx = 0;
+    for (i = 1; i < num_reqs; i++) {
+        int merge = 0;
+        uint64_t start_cluster = reqs[i].sector >> (s->cluster_bits - 9);
+        uint64_t end_cluster_prev =
+            (reqs[outidx].sector + reqs[outidx].nb_sectors - 1)
+                >> (s->cluster_bits - 9);
+
+        if (start_cluster == end_cluster_prev) {
+#ifdef DEBUG_MERGE
+            fprintf(stderr, "Possible merge: %lx -- %lx\n",
+                (reqs[outidx].sector + reqs[outidx].nb_sectors - 1),
+                reqs[i].sector);
+#endif
+            // TODO This is only handling exactly sequential writes. When we
+            // know that the cluster is unallocated, we could even fill in some
+            // zero padding and merge more requests.
+            if (reqs[i].sector == reqs[outidx].sector + reqs[outidx].nb_sectors) {
+#ifdef DEBUG_MERGE
+                fprintf(stderr, "  Merging\n");
+#endif
+                merge = 1;
+            }
+        }
+
+        if (merge) {
+            reqs[outidx].nb_sectors += reqs[i].nb_sectors;
+            reqs[outidx].qiov =
+                qemu_iovec_concat(reqs[outidx].qiov, reqs[i].qiov);
+            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
+        } else {
+            outidx++;
+            reqs[outidx].sector = reqs[i].sector;
+            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
+            reqs[outidx].qiov = reqs[i].qiov;
+        }
+    }
+
+    // Run the aio requests; after merging only entries 0..outidx are valid
+    for (i = 0; i <= outidx; i++) {
+        acb = qcow_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
+            reqs[i].nb_sectors, qcow_multiwrite_cb, mcb);
+
+        if (acb == NULL) {
+            // We can only fail the whole thing if no request has been
+            // submitted yet. Otherwise we'll wait for the submitted AIOs to
+            // complete and report the error in the callback.
+            if (mcb->num_requests == 0) {
+                reqs[i].error = EIO;
+                goto fail;
+            } else {
+                mcb->error = EIO;
+                break;
+            }
+        } else {
+            mcb->num_requests++;
+        }
+    }
+
+    return 0;
+
+fail:
+    qemu_free(mcb);
+    return -1;
+}
+
 static void qcow_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
@@ -1124,6 +1257,7 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_aio_readv     = qcow_aio_readv,
     .bdrv_aio_writev    = qcow_aio_writev,
     .bdrv_write_compressed = qcow_write_compressed,
+    .bdrv_aio_multiwrite = qcow_aio_multiwrite,
 
     .bdrv_snapshot_create = qcow2_snapshot_create,
     .bdrv_snapshot_goto = qcow2_snapshot_goto,
diff --git a/cutils.c b/cutils.c
index bd9a019..e12013a 100644
--- a/cutils.c
+++ b/cutils.c
@@ -151,6 +151,23 @@ void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len)
     ++qiov->niov;
 }
 
+QEMUIOVector *qemu_iovec_concat(QEMUIOVector *a, QEMUIOVector *b)
+{
+    int i;
+    QEMUIOVector *qiov = qemu_malloc(sizeof(*qiov));
+
+    qemu_iovec_init(qiov, a->niov + b->niov);
+
+    for (i = 0; i < a->niov; i++) {
+        qemu_iovec_add(qiov, a->iov[i].iov_base, a->iov[i].iov_len);
+    }
+    for (i = 0; i < b->niov; i++) {
+        qemu_iovec_add(qiov, b->iov[i].iov_base, b->iov[i].iov_len);
+    }
+
+    return qiov;
+}
+
 void qemu_iovec_destroy(QEMUIOVector *qiov)
 {
     assert(qiov->nalloc != -1);
diff --git a/qemu-common.h b/qemu-common.h
index 74ac88f..2871820 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -223,6 +223,7 @@ typedef struct QEMUIOVector {
 void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint);
 void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov);
 void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
+QEMUIOVector *qemu_iovec_concat(QEMUIOVector *a, QEMUIOVector *b);
 void qemu_iovec_destroy(QEMUIOVector *qiov);
 void qemu_iovec_reset(QEMUIOVector *qiov);
 void qemu_iovec_to_buffer(QEMUIOVector *qiov, void *buf);
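
A quick illustration of the new helper for reviewers (a standalone sketch,
not part of the patch; the buffers are made up). The concatenated vector
shares the underlying data buffers with its inputs, so tearing it down means
destroying its iov array and then freeing the struct itself:

    QEMUIOVector a, b;
    QEMUIOVector *merged;
    char buf1[512], buf2[512];

    qemu_iovec_init(&a, 1);
    qemu_iovec_add(&a, buf1, sizeof(buf1));
    qemu_iovec_init(&b, 1);
    qemu_iovec_add(&b, buf2, sizeof(buf2));

    merged = qemu_iovec_concat(&a, &b);
    /* merged->niov == 2, entries point at buf1 and buf2 */

    qemu_iovec_destroy(merged);  /* frees the iov array */
    qemu_free(merged);           /* frees the QEMUIOVector itself */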