From patchwork Mon Aug 31 14:48:49 2009
X-Patchwork-Submitter: Kevin Wolf
X-Patchwork-Id: 32663
From: Kevin Wolf
To: qemu-devel@nongnu.org
Cc: Kevin Wolf
Date: Mon, 31 Aug 2009 16:48:49 +0200
Message-Id: <1251730129-18693-1-git-send-email-kwolf@redhat.com>
Subject: [Qemu-devel] [PATCH] qcow2: Order concurrent AIO requests on the same unallocated cluster

When two AIO requests write to the same cluster, and this cluster is
unallocated, currently both requests allocate a new cluster, and when the
second one completes it merges the first one's allocation into its own.
This means a cluster allocation, a read and a cluster deallocation, which
causes some overhead. If we simply let the second request wait until the
first one is done, we improve overall performance with AIO requests
(specifically with qcow2/virtio combinations).

This patch maintains a list of in-flight requests that have allocated new
clusters. A second request touching the same cluster is limited so that it
either doesn't touch the allocation of the first request (so it can have a
non-overlapping allocation) or it waits for the first request to complete.
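As a rough illustration of the scheme described above (not part of the patch;
all names below are invented for the example, and the real code operates on
QCowL2Meta and BDRVQcowState instead), the overlap check against the list of
in-flight allocations boils down to something like this standalone C sketch:

/* Illustrative sketch only; types and names are made up for this example. */
#include <stdint.h>
#include <stdio.h>

#define CLUSTER_SIZE 65536ULL

/* One in-flight allocating write: a range of clusters being allocated. */
struct alloc {
    uint64_t offset;        /* byte offset where the write starts */
    uint64_t nb_clusters;   /* number of clusters it is allocating */
    struct alloc *next;     /* next in-flight allocation */
};

static struct alloc *in_flight;   /* list of running allocating writes */

/*
 * Decide how a new allocating write at [offset, offset + *nb_clusters
 * clusters) may proceed. Returns the allocation it must wait for (and sets
 * *nb_clusters to 0), or NULL if it may go ahead with the possibly reduced
 * *nb_clusters.
 */
static struct alloc *check_overlap(uint64_t offset, uint64_t *nb_clusters)
{
    struct alloc *old;

    for (old = in_flight; old != NULL; old = old->next) {
        uint64_t end = offset + *nb_clusters * CLUSTER_SIZE;
        uint64_t old_start = old->offset;
        uint64_t old_end = old->offset + old->nb_clusters * CLUSTER_SIZE;

        if (end < old_start || offset > old_end) {
            continue;               /* no intersection with this allocation */
        }

        if (offset < old_start) {
            /* Stop right before the running allocation */
            *nb_clusters = (old_start - offset) / CLUSTER_SIZE;
        } else {
            *nb_clusters = 0;
        }

        if (*nb_clusters == 0) {
            return old;             /* must wait until 'old' has completed */
        }
    }

    return NULL;                    /* safe to allocate *nb_clusters now */
}

int main(void)
{
    /* One write is already allocating clusters 4..7 */
    struct alloc first = { 4 * CLUSTER_SIZE, 4, NULL };
    uint64_t nb = 8;                /* second write wants clusters 0..7 */

    in_flight = &first;
    if (check_overlap(0, &nb) == NULL) {
        printf("may allocate %llu clusters now\n", (unsigned long long)nb);
    } else {
        printf("must wait for the in-flight allocation\n");
    }
    return 0;
}

In the patch itself the "wait" case is realised by recording the dependency in
QCowL2Meta's depends_on field; once the earlier request has updated the L2
table, run_dependent_requests() re-enters qcow_aio_write_cb() for every queued
request.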
Signed-off-by: Kevin Wolf
---
 block/qcow2-cluster.c |   39 ++++++++++++++++++++++++++++++++++
 block/qcow2.c         |   55 ++++++++++++++++++++++++++++++++++++++++++++----
 block/qcow2.h         |    7 ++++++
 3 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 057dac5..d4631c3 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -684,6 +684,7 @@ uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs,
     int l2_index, ret;
     uint64_t l2_offset, *l2_table, cluster_offset;
     int nb_clusters, i = 0;
+    QCowL2Meta *old_alloc;
 
     ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
     if (ret == 0)
@@ -732,6 +733,44 @@ uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs,
     }
     nb_clusters = i;
 
+    /*
+     * Check if there already is an AIO write request in flight which allocates
+     * the same cluster. In this case we need to wait until the previous
+     * request has completed and updated the L2 table accordingly.
+     */
+    LIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
+
+        uint64_t end_offset = offset + nb_clusters * s->cluster_size;
+        uint64_t old_offset = old_alloc->offset;
+        uint64_t old_end_offset = old_alloc->offset +
+            old_alloc->nb_clusters * s->cluster_size;
+
+        if (end_offset < old_offset || offset > old_end_offset) {
+            /* No intersection */
+        } else {
+            if (offset < old_offset) {
+                /* Stop at the start of a running allocation */
+                nb_clusters = (old_offset - offset) >> s->cluster_bits;
+            } else {
+                nb_clusters = 0;
+            }
+
+            if (nb_clusters == 0) {
+                /* Set dependency and wait for a callback */
+                m->depends_on = old_alloc;
+                m->nb_clusters = 0;
+                *num = 0;
+                return 0;
+            }
+        }
+    }
+
+    if (!nb_clusters) {
+        abort();
+    }
+
+    LIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight);
+
     /* allocate a new cluster */
 
     cluster_offset = qcow2_alloc_clusters(bs, nb_clusters * s->cluster_size);
diff --git a/block/qcow2.c b/block/qcow2.c
index b8eae90..8579e01 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -219,6 +219,8 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
     if (qcow2_refcount_init(bs) < 0)
         goto fail;
 
+    LIST_INIT(&s->cluster_allocs);
+
     /* read qcow2 extensions */
     if (header.backing_file_offset)
         ext_end = header.backing_file_offset;
@@ -338,6 +340,7 @@ typedef struct QCowAIOCB {
     QEMUIOVector hd_qiov;
     QEMUBH *bh;
     QCowL2Meta l2meta;
+    LIST_ENTRY(QCowAIOCB) next_depend;
 } QCowAIOCB;
 
 static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
@@ -500,6 +503,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
     acb->n = 0;
     acb->cluster_offset = 0;
     acb->l2meta.nb_clusters = 0;
+    LIST_INIT(&acb->l2meta.dependent_requests);
     return acb;
 }
 
@@ -517,6 +521,33 @@ static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
     return &acb->common;
 }
 
+static void qcow_aio_write_cb(void *opaque, int ret);
+
+static void run_dependent_requests(QCowL2Meta *m)
+{
+    QCowAIOCB *req;
+    QCowAIOCB *next;
+
+    /* Take the request off the list of running requests */
+    if (m->nb_clusters != 0) {
+        LIST_REMOVE(m, next_in_flight);
+    }
+
+    /*
+     * Restart all dependent requests.
+     * Can't use LIST_FOREACH here - the next link might not be the same
+     * any more after the callback (request could depend on a different
+     * request now)
+     */
+    for (req = m->dependent_requests.lh_first; req != NULL; req = next) {
+        next = req->next_depend.le_next;
+        qcow_aio_write_cb(req, 0);
+    }
+
+    /* Empty the list for the next part of the request */
+    LIST_INIT(&m->dependent_requests);
+}
+
 static void qcow_aio_write_cb(void *opaque, int ret)
 {
     QCowAIOCB *acb = opaque;
@@ -528,14 +559,15 @@ static void qcow_aio_write_cb(void *opaque, int ret)
 
     acb->hd_aiocb = NULL;
 
+    if (ret >= 0) {
+        ret = qcow2_alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta);
+    }
+
+    run_dependent_requests(&acb->l2meta);
+
     if (ret < 0)
         goto done;
 
-    if (qcow2_alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
-        qcow2_free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
-        goto done;
-    }
-
     acb->nb_sectors -= acb->n;
     acb->sector_num += acb->n;
     acb->buf += acb->n * 512;
@@ -555,6 +587,14 @@ static void qcow_aio_write_cb(void *opaque, int ret)
     acb->cluster_offset = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9,
                                           index_in_cluster,
                                           n_end, &acb->n, &acb->l2meta);
+
+    /* Need to wait for another request? If so, we are done for now. */
+    if (!acb->cluster_offset && acb->l2meta.depends_on != NULL) {
+        LIST_INSERT_HEAD(&acb->l2meta.depends_on->dependent_requests,
+            acb, next_depend);
+        return;
+    }
+
     if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
         ret = -EIO;
         goto done;
@@ -650,6 +690,7 @@ static int preallocate(BlockDriverState *bs)
 
     nb_sectors = bdrv_getlength(bs) >> 9;
     offset = 0;
+    LIST_INIT(&meta.dependent_requests);
 
     while (nb_sectors) {
         num = MIN(nb_sectors, INT_MAX >> 9);
@@ -665,6 +706,10 @@ static int preallocate(BlockDriverState *bs)
             return -1;
         }
 
+        /* There are no dependent requests, but we need to remove our request
+         * from the list of in-flight requests */
+        run_dependent_requests(&meta);
+
         /* TODO Preallocate data if requested */
 
         nb_sectors -= num;
diff --git a/block/qcow2.h b/block/qcow2.h
index c99b374..965a2f4 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -98,6 +98,7 @@ typedef struct BDRVQcowState {
     uint8_t *cluster_cache;
     uint8_t *cluster_data;
     uint64_t cluster_cache_offset;
+    LIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs;
 
     uint64_t *refcount_table;
     uint64_t refcount_table_offset;
@@ -128,6 +129,8 @@ typedef struct QCowCreateState {
     int64_t refcount_block_offset;
 } QCowCreateState;
 
+struct QCowAIOCB;
+
 /* XXX This could be private for qcow2-cluster.c */
 typedef struct QCowL2Meta
 {
@@ -135,6 +138,10 @@ typedef struct QCowL2Meta
     int n_start;
     int nb_available;
     int nb_clusters;
+    struct QCowL2Meta *depends_on;
+    LIST_HEAD(QCowAioDependencies, QCowAIOCB) dependent_requests;
+
+    LIST_ENTRY(QCowL2Meta) next_in_flight;
 } QCowL2Meta;
 
 static inline int size_to_clusters(BDRVQcowState *s, int64_t size)