Message ID | 1401448724-27851-2-git-send-email-benoit.canet@irqsave.net |
---|---|
State | New |
Headers | show |
The Friday 30 May 2014 à 13:18:42 (+0200), Benoît Canet wrote : Oh sorry I made again commits lines ending with a dot. Will fix this in the next iteration. Best regards Benoît > On read operations when this parameter is set and some replicas are corrupted > while quorum can be reached quorum will proceed to rewrite the correct version > of the data to fix the corrupted replicas. > > This will shine with SSD where the FTL will remap the same block at another > place on rewrite. > > Signed-off-by: Benoit Canet <benoit@irqsave.net> > Reviewed-by: Max Reitz <mreitz@redhat.com> > --- > block/quorum.c | 97 +++++++++++++++++++++++++++++++++++++++++++--- > qapi-schema.json | 5 ++- > tests/qemu-iotests/081 | 15 ++++++- > tests/qemu-iotests/081.out | 10 +++++ > 4 files changed, 119 insertions(+), 8 deletions(-) > > diff --git a/block/quorum.c b/block/quorum.c > index ecec3a5..330029c 100644 > --- a/block/quorum.c > +++ b/block/quorum.c > @@ -22,6 +22,7 @@ > > #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" > #define QUORUM_OPT_BLKVERIFY "blkverify" > +#define QUORUM_OPT_REWRITE "rewrite-corrupted" > > /* This union holds a vote hash value */ > typedef union QuorumVoteValue { > @@ -69,6 +70,9 @@ typedef struct BDRVQuorumState { > * It is useful to debug other block drivers by > * comparing them with a reference one. > */ > + bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted > + * block if Quorum is reached. > + */ > } BDRVQuorumState; > > typedef struct QuorumAIOCB QuorumAIOCB; > @@ -104,13 +108,17 @@ struct QuorumAIOCB { > int count; /* number of completed AIOCB */ > int success_count; /* number of successfully completed AIOCB */ > > + int rewrite_count; /* number of replica to rewrite: count down to > + * zero once writes are fired > + */ > + > QuorumVotes votes; > > bool is_read; > int vote_ret; > }; > > -static void quorum_vote(QuorumAIOCB *acb); > +static bool quorum_vote(QuorumAIOCB *acb); > > static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) > { > @@ -182,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, > acb->qcrs = g_new0(QuorumChildRequest, s->num_children); > acb->count = 0; > acb->success_count = 0; > + acb->rewrite_count = 0; > acb->votes.compare = quorum_sha256_compare; > QLIST_INIT(&acb->votes.vote_list); > acb->is_read = false; > @@ -241,11 +250,27 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) > return false; > } > > +static void quorum_rewrite_aio_cb(void *opaque, int ret) > +{ > + QuorumAIOCB *acb = opaque; > + > + /* one less rewrite to do */ > + acb->rewrite_count--; > + > + /* wait until all rewrite callbacks have completed */ > + if (acb->rewrite_count) { > + return; > + } > + > + quorum_aio_finalize(acb); > +} > + > static void quorum_aio_cb(void *opaque, int ret) > { > QuorumChildRequest *sacb = opaque; > QuorumAIOCB *acb = sacb->parent; > BDRVQuorumState *s = acb->common.bs->opaque; > + bool rewrite = false; > > sacb->ret = ret; > acb->count++; > @@ -262,12 +287,15 @@ static void quorum_aio_cb(void *opaque, int ret) > > /* Do the vote on read */ > if (acb->is_read) { > - quorum_vote(acb); > + rewrite = quorum_vote(acb); > } else { > quorum_has_too_much_io_failed(acb); > } > > - quorum_aio_finalize(acb); > + /* if no rewrite is done the code will finish right away */ > + if (!rewrite) { > + quorum_aio_finalize(acb); > + } > } > > static void quorum_report_bad_versions(BDRVQuorumState *s, > @@ -287,6 +315,43 @@ static void quorum_report_bad_versions(BDRVQuorumState *s, > } > } > > +static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, > + QuorumVoteValue *value) > +{ > + QuorumVoteVersion *version; > + QuorumVoteItem *item; > + int count = 0; > + > + /* first count the number of bad versions: done first to avoid concurrency > + * issues. > + */ > + QLIST_FOREACH(version, &acb->votes.vote_list, next) { > + if (acb->votes.compare(&version->value, value)) { > + continue; > + } > + QLIST_FOREACH(item, &version->items, next) { > + count++; > + } > + } > + > + /* quorum_rewrite_aio_cb will count down this to zero */ > + acb->rewrite_count = count; > + > + /* now fire the correcting rewrites */ > + QLIST_FOREACH(version, &acb->votes.vote_list, next) { > + if (acb->votes.compare(&version->value, value)) { > + continue; > + } > + QLIST_FOREACH(item, &version->items, next) { > + bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov, > + acb->nb_sectors, quorum_rewrite_aio_cb, acb); > + } > + } > + > + /* return true if any rewrite is done else false */ > + return count; > +} > + > static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > { > int i; > @@ -477,16 +542,17 @@ static int quorum_vote_error(QuorumAIOCB *acb) > return ret; > } > > -static void quorum_vote(QuorumAIOCB *acb) > +static bool quorum_vote(QuorumAIOCB *acb) > { > bool quorum = true; > + bool rewrite = false; > int i, j, ret; > QuorumVoteValue hash; > BDRVQuorumState *s = acb->common.bs->opaque; > QuorumVoteVersion *winner; > > if (quorum_has_too_much_io_failed(acb)) { > - return; > + return false; > } > > /* get the index of the first successful read */ > @@ -514,7 +580,7 @@ static void quorum_vote(QuorumAIOCB *acb) > /* Every successful read agrees */ > if (quorum) { > quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); > - return; > + return false; > } > > /* compute hashes for each successful read, also store indexes */ > @@ -547,9 +613,15 @@ static void quorum_vote(QuorumAIOCB *acb) > /* some versions are bad print them */ > quorum_report_bad_versions(s, acb, &winner->value); > > + /* corruption correction is enabled */ > + if (s->rewrite_corrupted) { > + rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value); > + } > + > free_exit: > /* free lists */ > quorum_free_vote_list(&acb->votes); > + return rewrite; > } > > static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > @@ -714,6 +786,11 @@ static QemuOptsList quorum_runtime_opts = { > .type = QEMU_OPT_BOOL, > .help = "Trigger block verify mode if set", > }, > + { > + .name = QUORUM_OPT_REWRITE, > + .type = QEMU_OPT_BOOL, > + .help = "Rewrite corrupted block on read quorum", > + }, > { /* end of list */ } > }, > }; > @@ -775,6 +852,14 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, > "and using two files with vote_threshold=2\n"); > } > > + s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false); > + if (s->rewrite_corrupted && s->is_blkverify) { > + error_setg(&local_err, > + "rewrite-corrupted=on cannot be used with blkverify=on"); > + ret = -EINVAL; > + goto exit; > + } > + > /* allocate the children BlockDriverState array */ > s->bs = g_new0(BlockDriverState *, s->num_children); > opened = g_new0(bool, s->num_children); > diff --git a/qapi-schema.json b/qapi-schema.json > index 7bc33ea..7bb1d47 100644 > --- a/qapi-schema.json > +++ b/qapi-schema.json > @@ -4566,12 +4566,15 @@ > # > # @vote-threshold: the vote limit under which a read will fail > # > +# @rewrite-corrupted: #optional rewrite corrupted data when quorum is reached > +# (Since 2.1) > +# > # Since: 2.0 > ## > { 'type': 'BlockdevOptionsQuorum', > 'data': { '*blkverify': 'bool', > 'children': [ 'BlockdevRef' ], > - 'vote-threshold': 'int' } } > + 'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } } > > ## > # @BlockdevOptions > diff --git a/tests/qemu-iotests/081 b/tests/qemu-iotests/081 > index b512d00..7ae4be2 100755 > --- a/tests/qemu-iotests/081 > +++ b/tests/qemu-iotests/081 > @@ -134,15 +134,28 @@ run_qemu -drive "file=$TEST_DIR/2.raw,format=$IMGFMT,if=none,id=drive2" <<EOF > EOF > > echo > +echo "== using quorum rewrite corrupted mode ==" > + > +quorum="$quorum,file.rewrite-corrupted=on" > + > +$QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io > + > +echo > +echo "== checking that quorum has corrected the corrupted file ==" > + > +$QEMU_IO -c "read -P 0x32 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io > + > +echo > echo "== breaking quorum ==" > > $QEMU_IO -c "write -P 0x41 0 $size" "$TEST_DIR/1.raw" | _filter_qemu_io > +$QEMU_IO -c "write -P 0x42 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io > + > echo > echo "== checking that quorum is broken ==" > > $QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io > > - > # success, all done > echo "*** done" > rm -f $seq.full > diff --git a/tests/qemu-iotests/081.out b/tests/qemu-iotests/081.out > index 84aeb0c..1629d2a 100644 > --- a/tests/qemu-iotests/081.out > +++ b/tests/qemu-iotests/081.out > @@ -40,9 +40,19 @@ read 10485760/10485760 bytes at offset 0 > {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "DEVICE_TRAY_MOVED", "data": {"device": "floppy0", "tray-open": true}} > > > +== using quorum rewrite corrupted mode == > +read 10485760/10485760 bytes at offset 0 > +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > + > +== checking that quorum has corrected the corrupted file == > +read 10485760/10485760 bytes at offset 0 > +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > + > == breaking quorum == > wrote 10485760/10485760 bytes at offset 0 > 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > +wrote 10485760/10485760 bytes at offset 0 > +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > > == checking that quorum is broken == > qemu-io: can't open device (null): Could not read image for determining its format: Input/output error > -- > 1.9.1 >
On Fri, May 30, 2014 at 01:18:42PM +0200, Benoît Canet wrote: > On read operations when this parameter is set and some replicas are corrupted > while quorum can be reached quorum will proceed to rewrite the correct version > of the data to fix the corrupted replicas. > > This will shine with SSD where the FTL will remap the same block at another > place on rewrite. > > Signed-off-by: Benoit Canet <benoit@irqsave.net> > Reviewed-by: Max Reitz <mreitz@redhat.com> > --- > block/quorum.c | 97 +++++++++++++++++++++++++++++++++++++++++++--- > qapi-schema.json | 5 ++- > tests/qemu-iotests/081 | 15 ++++++- > tests/qemu-iotests/081.out | 10 +++++ > 4 files changed, 119 insertions(+), 8 deletions(-) Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Am 30.05.2014 um 13:18 hat Benoît Canet geschrieben: > On read operations when this parameter is set and some replicas are corrupted > while quorum can be reached quorum will proceed to rewrite the correct version > of the data to fix the corrupted replicas. > > This will shine with SSD where the FTL will remap the same block at another > place on rewrite. > > Signed-off-by: Benoit Canet <benoit@irqsave.net> > Reviewed-by: Max Reitz <mreitz@redhat.com> Reviewed-by: Kevin Wolf <kwolf@redhat.com>
diff --git a/block/quorum.c b/block/quorum.c index ecec3a5..330029c 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -22,6 +22,7 @@ #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" #define QUORUM_OPT_BLKVERIFY "blkverify" +#define QUORUM_OPT_REWRITE "rewrite-corrupted" /* This union holds a vote hash value */ typedef union QuorumVoteValue { @@ -69,6 +70,9 @@ typedef struct BDRVQuorumState { * It is useful to debug other block drivers by * comparing them with a reference one. */ + bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted + * block if Quorum is reached. + */ } BDRVQuorumState; typedef struct QuorumAIOCB QuorumAIOCB; @@ -104,13 +108,17 @@ struct QuorumAIOCB { int count; /* number of completed AIOCB */ int success_count; /* number of successfully completed AIOCB */ + int rewrite_count; /* number of replica to rewrite: count down to + * zero once writes are fired + */ + QuorumVotes votes; bool is_read; int vote_ret; }; -static void quorum_vote(QuorumAIOCB *acb); +static bool quorum_vote(QuorumAIOCB *acb); static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) { @@ -182,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, acb->qcrs = g_new0(QuorumChildRequest, s->num_children); acb->count = 0; acb->success_count = 0; + acb->rewrite_count = 0; acb->votes.compare = quorum_sha256_compare; QLIST_INIT(&acb->votes.vote_list); acb->is_read = false; @@ -241,11 +250,27 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) return false; } +static void quorum_rewrite_aio_cb(void *opaque, int ret) +{ + QuorumAIOCB *acb = opaque; + + /* one less rewrite to do */ + acb->rewrite_count--; + + /* wait until all rewrite callbacks have completed */ + if (acb->rewrite_count) { + return; + } + + quorum_aio_finalize(acb); +} + static void quorum_aio_cb(void *opaque, int ret) { QuorumChildRequest *sacb = opaque; QuorumAIOCB *acb = sacb->parent; BDRVQuorumState *s = acb->common.bs->opaque; + bool rewrite = false; sacb->ret = ret; acb->count++; @@ -262,12 +287,15 @@ static void quorum_aio_cb(void *opaque, int ret) /* Do the vote on read */ if (acb->is_read) { - quorum_vote(acb); + rewrite = quorum_vote(acb); } else { quorum_has_too_much_io_failed(acb); } - quorum_aio_finalize(acb); + /* if no rewrite is done the code will finish right away */ + if (!rewrite) { + quorum_aio_finalize(acb); + } } static void quorum_report_bad_versions(BDRVQuorumState *s, @@ -287,6 +315,43 @@ static void quorum_report_bad_versions(BDRVQuorumState *s, } } +static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, + QuorumVoteValue *value) +{ + QuorumVoteVersion *version; + QuorumVoteItem *item; + int count = 0; + + /* first count the number of bad versions: done first to avoid concurrency + * issues. + */ + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + count++; + } + } + + /* quorum_rewrite_aio_cb will count down this to zero */ + acb->rewrite_count = count; + + /* now fire the correcting rewrites */ + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov, + acb->nb_sectors, quorum_rewrite_aio_cb, acb); + } + } + + /* return true if any rewrite is done else false */ + return count; +} + static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) { int i; @@ -477,16 +542,17 @@ static int quorum_vote_error(QuorumAIOCB *acb) return ret; } -static void quorum_vote(QuorumAIOCB *acb) +static bool quorum_vote(QuorumAIOCB *acb) { bool quorum = true; + bool rewrite = false; int i, j, ret; QuorumVoteValue hash; BDRVQuorumState *s = acb->common.bs->opaque; QuorumVoteVersion *winner; if (quorum_has_too_much_io_failed(acb)) { - return; + return false; } /* get the index of the first successful read */ @@ -514,7 +580,7 @@ static void quorum_vote(QuorumAIOCB *acb) /* Every successful read agrees */ if (quorum) { quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); - return; + return false; } /* compute hashes for each successful read, also store indexes */ @@ -547,9 +613,15 @@ static void quorum_vote(QuorumAIOCB *acb) /* some versions are bad print them */ quorum_report_bad_versions(s, acb, &winner->value); + /* corruption correction is enabled */ + if (s->rewrite_corrupted) { + rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value); + } + free_exit: /* free lists */ quorum_free_vote_list(&acb->votes); + return rewrite; } static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, @@ -714,6 +786,11 @@ static QemuOptsList quorum_runtime_opts = { .type = QEMU_OPT_BOOL, .help = "Trigger block verify mode if set", }, + { + .name = QUORUM_OPT_REWRITE, + .type = QEMU_OPT_BOOL, + .help = "Rewrite corrupted block on read quorum", + }, { /* end of list */ } }, }; @@ -775,6 +852,14 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, "and using two files with vote_threshold=2\n"); } + s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false); + if (s->rewrite_corrupted && s->is_blkverify) { + error_setg(&local_err, + "rewrite-corrupted=on cannot be used with blkverify=on"); + ret = -EINVAL; + goto exit; + } + /* allocate the children BlockDriverState array */ s->bs = g_new0(BlockDriverState *, s->num_children); opened = g_new0(bool, s->num_children); diff --git a/qapi-schema.json b/qapi-schema.json index 7bc33ea..7bb1d47 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4566,12 +4566,15 @@ # # @vote-threshold: the vote limit under which a read will fail # +# @rewrite-corrupted: #optional rewrite corrupted data when quorum is reached +# (Since 2.1) +# # Since: 2.0 ## { 'type': 'BlockdevOptionsQuorum', 'data': { '*blkverify': 'bool', 'children': [ 'BlockdevRef' ], - 'vote-threshold': 'int' } } + 'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } } ## # @BlockdevOptions diff --git a/tests/qemu-iotests/081 b/tests/qemu-iotests/081 index b512d00..7ae4be2 100755 --- a/tests/qemu-iotests/081 +++ b/tests/qemu-iotests/081 @@ -134,15 +134,28 @@ run_qemu -drive "file=$TEST_DIR/2.raw,format=$IMGFMT,if=none,id=drive2" <<EOF EOF echo +echo "== using quorum rewrite corrupted mode ==" + +quorum="$quorum,file.rewrite-corrupted=on" + +$QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io + +echo +echo "== checking that quorum has corrected the corrupted file ==" + +$QEMU_IO -c "read -P 0x32 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io + +echo echo "== breaking quorum ==" $QEMU_IO -c "write -P 0x41 0 $size" "$TEST_DIR/1.raw" | _filter_qemu_io +$QEMU_IO -c "write -P 0x42 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io + echo echo "== checking that quorum is broken ==" $QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io - # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/081.out b/tests/qemu-iotests/081.out index 84aeb0c..1629d2a 100644 --- a/tests/qemu-iotests/081.out +++ b/tests/qemu-iotests/081.out @@ -40,9 +40,19 @@ read 10485760/10485760 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "DEVICE_TRAY_MOVED", "data": {"device": "floppy0", "tray-open": true}} +== using quorum rewrite corrupted mode == +read 10485760/10485760 bytes at offset 0 +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +== checking that quorum has corrected the corrupted file == +read 10485760/10485760 bytes at offset 0 +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + == breaking quorum == wrote 10485760/10485760 bytes at offset 0 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 10485760/10485760 bytes at offset 0 +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) == checking that quorum is broken == qemu-io: can't open device (null): Could not read image for determining its format: Input/output error