From patchwork Fri Jun 27 19:08:38 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: Kevin Wolf X-Patchwork-Id: 365126 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [IPv6:2001:4830:134:3::11]) (using TLSv1 with cipher AES256-SHA (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 2821E140087 for ; Sat, 28 Jun 2014 05:16:04 +1000 (EST) Received: from localhost ([::1]:52048 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1X0bcw-0000AO-7U for incoming@patchwork.ozlabs.org; Fri, 27 Jun 2014 15:16:02 -0400 Received: from eggs.gnu.org ([2001:4830:134:3::10]:34311) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1X0bWo-0005AK-3A for qemu-devel@nongnu.org; Fri, 27 Jun 2014 15:09:47 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1X0bWi-0006v8-Tw for qemu-devel@nongnu.org; Fri, 27 Jun 2014 15:09:41 -0400 Received: from mx1.redhat.com ([209.132.183.28]:8638) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1X0bWi-0006v0-MM for qemu-devel@nongnu.org; Fri, 27 Jun 2014 15:09:36 -0400 Received: from int-mx13.intmail.prod.int.phx2.redhat.com (int-mx13.intmail.prod.int.phx2.redhat.com [10.5.11.26]) by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id s5RJ9aZt005923 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK) for ; Fri, 27 Jun 2014 15:09:36 -0400 Received: from noname.redhat.com (ovpn-116-40.ams2.redhat.com [10.36.116.40]) by int-mx13.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id s5RJ97bb027690; Fri, 27 Jun 2014 15:09:35 -0400 From: Kevin Wolf To: qemu-devel@nongnu.org Date: Fri, 27 Jun 2014 21:08:38 +0200 Message-Id: <1403896146-3063-20-git-send-email-kwolf@redhat.com> In-Reply-To: <1403896146-3063-1-git-send-email-kwolf@redhat.com> References: <1403896146-3063-1-git-send-email-kwolf@redhat.com> MIME-Version: 1.0 X-Scanned-By: MIMEDefang 2.68 on 10.5.11.26 X-MIME-Autoconverted: from 8bit to quoted-printable by mx1.redhat.com id s5RJ9aZt005923 X-detected-operating-system: by eggs.gnu.org: GNU/Linux 3.x X-Received-From: 209.132.183.28 Cc: kwolf@redhat.com Subject: [Qemu-devel] [PULL 19/47] quorum: Add the rewrite-corrupted parameter to quorum X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org From: BenoƮt Canet On read operations when this parameter is set and some replicas are corrupted while quorum can be reached quorum will proceed to rewrite the correct version of the data to fix the corrupted replicas. This will shine with SSD where the FTL will remap the same block at another place on rewrite. Signed-off-by: Benoit Canet Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/quorum.c | 97 +++++++++++++++++++++++++++++++++++++++++++--- qapi/block-core.json | 5 ++- tests/qemu-iotests/081 | 15 ++++++- tests/qemu-iotests/081.out | 10 +++++ 4 files changed, 119 insertions(+), 8 deletions(-) diff --git a/block/quorum.c b/block/quorum.c index 86802d3..d5ee9c0 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -23,6 +23,7 @@ #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" #define QUORUM_OPT_BLKVERIFY "blkverify" +#define QUORUM_OPT_REWRITE "rewrite-corrupted" /* This union holds a vote hash value */ typedef union QuorumVoteValue { @@ -70,6 +71,9 @@ typedef struct BDRVQuorumState { * It is useful to debug other block drivers by * comparing them with a reference one. */ + bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted + * block if Quorum is reached. + */ } BDRVQuorumState; typedef struct QuorumAIOCB QuorumAIOCB; @@ -105,13 +109,17 @@ struct QuorumAIOCB { int count; /* number of completed AIOCB */ int success_count; /* number of successfully completed AIOCB */ + int rewrite_count; /* number of replica to rewrite: count down to + * zero once writes are fired + */ + QuorumVotes votes; bool is_read; int vote_ret; }; -static void quorum_vote(QuorumAIOCB *acb); +static bool quorum_vote(QuorumAIOCB *acb); static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) { @@ -183,6 +191,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, acb->qcrs = g_new0(QuorumChildRequest, s->num_children); acb->count = 0; acb->success_count = 0; + acb->rewrite_count = 0; acb->votes.compare = quorum_sha256_compare; QLIST_INIT(&acb->votes.vote_list); acb->is_read = false; @@ -232,11 +241,27 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) return false; } +static void quorum_rewrite_aio_cb(void *opaque, int ret) +{ + QuorumAIOCB *acb = opaque; + + /* one less rewrite to do */ + acb->rewrite_count--; + + /* wait until all rewrite callbacks have completed */ + if (acb->rewrite_count) { + return; + } + + quorum_aio_finalize(acb); +} + static void quorum_aio_cb(void *opaque, int ret) { QuorumChildRequest *sacb = opaque; QuorumAIOCB *acb = sacb->parent; BDRVQuorumState *s = acb->common.bs->opaque; + bool rewrite = false; sacb->ret = ret; acb->count++; @@ -253,12 +278,15 @@ static void quorum_aio_cb(void *opaque, int ret) /* Do the vote on read */ if (acb->is_read) { - quorum_vote(acb); + rewrite = quorum_vote(acb); } else { quorum_has_too_much_io_failed(acb); } - quorum_aio_finalize(acb); + /* if no rewrite is done the code will finish right away */ + if (!rewrite) { + quorum_aio_finalize(acb); + } } static void quorum_report_bad_versions(BDRVQuorumState *s, @@ -278,6 +306,43 @@ static void quorum_report_bad_versions(BDRVQuorumState *s, } } +static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, + QuorumVoteValue *value) +{ + QuorumVoteVersion *version; + QuorumVoteItem *item; + int count = 0; + + /* first count the number of bad versions: done first to avoid concurrency + * issues. + */ + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + count++; + } + } + + /* quorum_rewrite_aio_cb will count down this to zero */ + acb->rewrite_count = count; + + /* now fire the correcting rewrites */ + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov, + acb->nb_sectors, quorum_rewrite_aio_cb, acb); + } + } + + /* return true if any rewrite is done else false */ + return count; +} + static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) { int i; @@ -468,16 +533,17 @@ static int quorum_vote_error(QuorumAIOCB *acb) return ret; } -static void quorum_vote(QuorumAIOCB *acb) +static bool quorum_vote(QuorumAIOCB *acb) { bool quorum = true; + bool rewrite = false; int i, j, ret; QuorumVoteValue hash; BDRVQuorumState *s = acb->common.bs->opaque; QuorumVoteVersion *winner; if (quorum_has_too_much_io_failed(acb)) { - return; + return false; } /* get the index of the first successful read */ @@ -505,7 +571,7 @@ static void quorum_vote(QuorumAIOCB *acb) /* Every successful read agrees */ if (quorum) { quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); - return; + return false; } /* compute hashes for each successful read, also store indexes */ @@ -538,9 +604,15 @@ static void quorum_vote(QuorumAIOCB *acb) /* some versions are bad print them */ quorum_report_bad_versions(s, acb, &winner->value); + /* corruption correction is enabled */ + if (s->rewrite_corrupted) { + rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value); + } + free_exit: /* free lists */ quorum_free_vote_list(&acb->votes); + return rewrite; } static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, @@ -705,6 +777,11 @@ static QemuOptsList quorum_runtime_opts = { .type = QEMU_OPT_BOOL, .help = "Trigger block verify mode if set", }, + { + .name = QUORUM_OPT_REWRITE, + .type = QEMU_OPT_BOOL, + .help = "Rewrite corrupted block on read quorum", + }, { /* end of list */ } }, }; @@ -766,6 +843,14 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, "and using two files with vote_threshold=2\n"); } + s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false); + if (s->rewrite_corrupted && s->is_blkverify) { + error_setg(&local_err, + "rewrite-corrupted=on cannot be used with blkverify=on"); + ret = -EINVAL; + goto exit; + } + /* allocate the children BlockDriverState array */ s->bs = g_new0(BlockDriverState *, s->num_children); opened = g_new0(bool, s->num_children); diff --git a/qapi/block-core.json b/qapi/block-core.json index af6b436..de31f9f 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1329,12 +1329,15 @@ # # @vote-threshold: the vote limit under which a read will fail # +# @rewrite-corrupted: #optional rewrite corrupted data when quorum is reached +# (Since 2.1) +# # Since: 2.0 ## { 'type': 'BlockdevOptionsQuorum', 'data': { '*blkverify': 'bool', 'children': [ 'BlockdevRef' ], - 'vote-threshold': 'int' } } + 'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } } ## # @BlockdevOptions diff --git a/tests/qemu-iotests/081 b/tests/qemu-iotests/081 index b512d00..7ae4be2 100755 --- a/tests/qemu-iotests/081 +++ b/tests/qemu-iotests/081 @@ -134,15 +134,28 @@ run_qemu -drive "file=$TEST_DIR/2.raw,format=$IMGFMT,if=none,id=drive2" <