diff mbox

[1/3] quorum: Add the rewrite-corrupted parameter to quorum.

Message ID 1394555780-27665-2-git-send-email-benoit.canet@irqsave.net
State New
Headers show

Commit Message

Benoît Canet March 11, 2014, 4:36 p.m. UTC
On read operations when this parameter is set and some replicas are corrupted
while quorum can be reached quorum will proceed to rewrite the correct version
of the data to fix the corrupted replicas.

This will shine with SSD where the FTL will remap the same block at another
place on rewrite.

Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
 block/quorum.c             | 97 +++++++++++++++++++++++++++++++++++++++++++---
 qapi-schema.json           |  5 ++-
 tests/qemu-iotests/081     | 14 +++++++
 tests/qemu-iotests/081.out | 11 +++++-
 4 files changed, 119 insertions(+), 8 deletions(-)

Comments

Max Reitz March 11, 2014, 7:58 p.m. UTC | #1
On 11.03.2014 17:36, Benoît Canet wrote:
> On read operations when this parameter is set and some replicas are corrupted
> while quorum can be reached quorum will proceed to rewrite the correct version
> of the data to fix the corrupted replicas.
>
> This will shine with SSD where the FTL will remap the same block at another
> place on rewrite.
>
> Signed-off-by: Benoit Canet <benoit@irqsave.net>
> ---
>   block/quorum.c             | 97 +++++++++++++++++++++++++++++++++++++++++++---
>   qapi-schema.json           |  5 ++-
>   tests/qemu-iotests/081     | 14 +++++++
>   tests/qemu-iotests/081.out | 11 +++++-
>   4 files changed, 119 insertions(+), 8 deletions(-)
>
> diff --git a/block/quorum.c b/block/quorum.c
> index bd997b7..af4fd3c 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -22,6 +22,7 @@
>   
>   #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold"
>   #define QUORUM_OPT_BLKVERIFY      "blkverify"
> +#define QUORUM_OPT_REWRITE        "rewrite-corrupted"
>   
>   /* This union holds a vote hash value */
>   typedef union QuorumVoteValue {
> @@ -69,6 +70,9 @@ typedef struct BDRVQuorumState {
>                               * It is useful to debug other block drivers by
>                               * comparing them with a reference one.
>                               */
> +    bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted
> +                            * block if Quorum is reached.
> +                            */
>   } BDRVQuorumState;
>   
>   typedef struct QuorumAIOCB QuorumAIOCB;
> @@ -104,13 +108,17 @@ struct QuorumAIOCB {
>       int count;                  /* number of completed AIOCB */
>       int success_count;          /* number of successfully completed AIOCB */
>   
> +    int rewrite_count;          /* number of replica to rewrite: count down to
> +                                 * zero once writes are fired
> +                                 */
> +
>       QuorumVotes votes;
>   
>       bool is_read;
>       int vote_ret;
>   };
>   
> -static void quorum_vote(QuorumAIOCB *acb);
> +static bool quorum_vote(QuorumAIOCB *acb);
>   
>   static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
>   {
> @@ -182,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
>       acb->qcrs = g_new0(QuorumChildRequest, s->num_children);
>       acb->count = 0;
>       acb->success_count = 0;
> +    acb->rewrite_count = 0;
>       acb->votes.compare = quorum_sha256_compare;
>       QLIST_INIT(&acb->votes.vote_list);
>       acb->is_read = false;
> @@ -241,11 +250,27 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb)
>       return false;
>   }
>   
> +static void quorum_rewrite_aio_cb(void *opaque, int ret)
> +{
> +    QuorumAIOCB *acb = opaque;
> +
> +    /* one less rewrite to do */
> +    acb->rewrite_count--;
> +
> +    /* wait until all rewrite callback have completed */

*callbacks

> +    if (acb->rewrite_count) {
> +        return;
> +    }
> +
> +    quorum_aio_finalize(acb);
> +}
> +
>   static void quorum_aio_cb(void *opaque, int ret)
>   {
>       QuorumChildRequest *sacb = opaque;
>       QuorumAIOCB *acb = sacb->parent;
>       BDRVQuorumState *s = acb->common.bs->opaque;
> +    bool rewrite = false;
>   
>       sacb->ret = ret;
>       acb->count++;
> @@ -262,12 +287,15 @@ static void quorum_aio_cb(void *opaque, int ret)
>   
>       /* Do the vote on read */
>       if (acb->is_read) {
> -        quorum_vote(acb);
> +        rewrite = quorum_vote(acb);
>       } else {
>           quorum_has_too_much_io_failed(acb);
>       }
>   
> -    quorum_aio_finalize(acb);
> +    /* if no rewrite is done the code will finish right away */
> +    if (!rewrite) {
> +        quorum_aio_finalize(acb);
> +    }
>   }
>   
>   static void quorum_report_bad_versions(BDRVQuorumState *s,
> @@ -287,6 +315,43 @@ static void quorum_report_bad_versions(BDRVQuorumState *s,
>       }
>   }
>   
> +static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb,
> +                                        QuorumVoteValue *value)
> +{
> +    QuorumVoteVersion *version;
> +    QuorumVoteItem *item;
> +    int count = 0;
> +
> +    /* first count the number of bad versions: done first to avoid concurency

*concurrency

> +     * issues.
> +     */
> +    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
> +        if (acb->votes.compare(&version->value, value)) {
> +            continue;
> +        }
> +        QLIST_FOREACH(item, &version->items, next) {
> +            count++;
> +        }
> +    }
> +
> +    /* quorum_rewrite_aio_cb will count down this to zero */
> +    acb->rewrite_count = count;
> +
> +    /* now fire the correcting rewrites */
> +    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
> +        if (acb->votes.compare(&version->value, value)) {
> +            continue;
> +        }
> +        QLIST_FOREACH(item, &version->items, next) {
> +            bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov,
> +                            acb->nb_sectors, quorum_rewrite_aio_cb, acb);
> +        }
> +    }
> +
> +    /* return true if any rewrite is done else false */
> +    return count;
> +}
> +
>   static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
>   {
>       int i;
> @@ -477,16 +542,17 @@ static int quorum_vote_error(QuorumAIOCB *acb)
>       return ret;
>   }
>   
> -static void quorum_vote(QuorumAIOCB *acb)
> +static bool quorum_vote(QuorumAIOCB *acb)
>   {
>       bool quorum = true;
> +    bool rewrite = false;
>       int i, j, ret;
>       QuorumVoteValue hash;
>       BDRVQuorumState *s = acb->common.bs->opaque;
>       QuorumVoteVersion *winner;
>   
>       if (quorum_has_too_much_io_failed(acb)) {
> -        return;
> +        return false;;

Two semicolons here.

>       }
>   
>       /* get the index of the first successful read */
> @@ -514,7 +580,7 @@ static void quorum_vote(QuorumAIOCB *acb)
>       /* Every successful read agrees */
>       if (quorum) {
>           quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov);
> -        return;
> +        return false;;

And here.

>       }
>   
>       /* compute hashes for each successful read, also store indexes */
> @@ -547,9 +613,15 @@ static void quorum_vote(QuorumAIOCB *acb)
>       /* some versions are bad print them */
>       quorum_report_bad_versions(s, acb, &winner->value);
>   
> +    /* corruption correction is actived */

I'd vote for "enabled" rather than "actived". But I like "corruption 
correction". :-)

> +    if (s->rewrite_corrupted) {
> +        rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value);
> +    }
> +
>   free_exit:
>       /* free lists */
>       quorum_free_vote_list(&acb->votes);
> +    return rewrite;
>   }
>   
>   static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
> @@ -709,6 +781,11 @@ static QemuOptsList quorum_runtime_opts = {
>               .type = QEMU_OPT_BOOL,
>               .help = "Trigger block verify mode if set",
>           },
> +        {
> +            .name = QUORUM_OPT_REWRITE,
> +            .type = QEMU_OPT_BOOL,
> +            .help = "Rewrite corrupted block on read quorum",
> +        },
>           { /* end of list */ }
>       },
>   };
> @@ -770,6 +847,14 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
>                   "and using two files with vote_threshold=2\n");
>       }
>   
> +    s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false);
> +    if (s->rewrite_corrupted && s->is_blkverify) {
> +        error_setg(&local_err,
> +                   "rewrite-corrupted=on cannot be used with blkverify=on");
> +        ret = -EINVAL;
> +        goto exit;
> +    }
> +
>       /* allocate the children BlockDriverState array */
>       s->bs = g_new0(BlockDriverState *, s->num_children);
>       opened = g_new0(bool, s->num_children);
> diff --git a/qapi-schema.json b/qapi-schema.json
> index 6476d4a..f5a8767 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -4542,12 +4542,15 @@
>   #
>   # @vote-threshold: the vote limit under which a read will fail
>   #
> +# @rewrite-corrupted: #optional rewrite corrupted data when quorum is reached
> +#                     (Since 2.1)
> +#
>   # Since: 2.0
>   ##
>   { 'type': 'BlockdevOptionsQuorum',
>     'data': { '*blkverify': 'bool',
>               'children': [ 'BlockdevRef' ],
> -            'vote-threshold': 'int' } }
> +            'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } }
>   
>   ##
>   # @BlockdevOptions
> diff --git a/tests/qemu-iotests/081 b/tests/qemu-iotests/081
> index b512d00..bb211d6 100755
> --- a/tests/qemu-iotests/081
> +++ b/tests/qemu-iotests/081
> @@ -95,6 +95,18 @@ echo "== checking quorum correction =="
>   $QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io
>   
>   echo
> +echo "== using quorum rewrite corrupted mode =="
> +
> +quorum="$quorum,file.rewrite-corrupted=on"
> +
> +$QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io
> +
> +echo
> +echo "== checking that quorum has corrected the corrupted file =="
> +
> +$QEMU_IO -c "read -P 0x32 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io
> +

I'd prefer it if you could put these new test cases below the "checking 
mixed reference/option specification" test, just because I'd like that 
test to output the quorum warning - especially there is no other point 
in the test where we can see that QMP message.

But that's up to you and if you at least fix the double semicolons:

Reviewed-by: Max Reitz <mreitz@redhat.com>

> +echo
>   echo "== checking mixed reference/option specification =="
>   
>   run_qemu -drive "file=$TEST_DIR/2.raw,format=$IMGFMT,if=none,id=drive2" <<EOF
> @@ -137,6 +149,8 @@ echo
>   echo "== breaking quorum =="
>   
>   $QEMU_IO -c "write -P 0x41 0 $size" "$TEST_DIR/1.raw" | _filter_qemu_io
> +$QEMU_IO -c "write -P 0x42 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io
> +
>   echo
>   echo "== checking that quorum is broken =="
>   
> diff --git a/tests/qemu-iotests/081.out b/tests/qemu-iotests/081.out
> index 84aeb0c..2d8b290 100644
> --- a/tests/qemu-iotests/081.out
> +++ b/tests/qemu-iotests/081.out
> @@ -25,12 +25,19 @@ wrote 10485760/10485760 bytes at offset 0
>   read 10485760/10485760 bytes at offset 0
>   10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>   
> +== using quorum rewrite corrupted mode ==
> +read 10485760/10485760 bytes at offset 0
> +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> +
> +== checking that quorum has corrected the corrupted file ==
> +read 10485760/10485760 bytes at offset 0
> +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> +
>   == checking mixed reference/option specification ==
>   Testing: -drive file=TEST_DIR/2.IMGFMT,format=IMGFMT,if=none,id=drive2
>   QMP_VERSION
>   {"return": {}}
>   {"return": {}}
> -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "QUORUM_REPORT_BAD", "data": {"node-name": "", "sectors-count": 20480, "sector-num": 0}}
>   read 10485760/10485760 bytes at offset 0
>   10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>   {"return": ""}
> @@ -43,6 +50,8 @@ read 10485760/10485760 bytes at offset 0
>   == breaking quorum ==
>   wrote 10485760/10485760 bytes at offset 0
>   10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> +wrote 10485760/10485760 bytes at offset 0
> +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>   
>   == checking that quorum is broken ==
>   qemu-io: can't open device (null): Could not read image for determining its format: Input/output error
Benoît Canet March 11, 2014, 8:14 p.m. UTC | #2
The Tuesday 11 Mar 2014 à 20:58:06 (+0100), Max Reitz wrote :
> On 11.03.2014 17:36, Benoît Canet wrote:
> >On read operations when this parameter is set and some replicas are corrupted
> >while quorum can be reached quorum will proceed to rewrite the correct version
> >of the data to fix the corrupted replicas.
> >
> >This will shine with SSD where the FTL will remap the same block at another
> >place on rewrite.
> >
> >Signed-off-by: Benoit Canet <benoit@irqsave.net>
> >---
> >  block/quorum.c             | 97 +++++++++++++++++++++++++++++++++++++++++++---
> >  qapi-schema.json           |  5 ++-
> >  tests/qemu-iotests/081     | 14 +++++++
> >  tests/qemu-iotests/081.out | 11 +++++-
> >  4 files changed, 119 insertions(+), 8 deletions(-)
> >
> >diff --git a/block/quorum.c b/block/quorum.c
> >index bd997b7..af4fd3c 100644
> >--- a/block/quorum.c
> >+++ b/block/quorum.c
> >@@ -22,6 +22,7 @@
> >  #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold"
> >  #define QUORUM_OPT_BLKVERIFY      "blkverify"
> >+#define QUORUM_OPT_REWRITE        "rewrite-corrupted"
> >  /* This union holds a vote hash value */
> >  typedef union QuorumVoteValue {
> >@@ -69,6 +70,9 @@ typedef struct BDRVQuorumState {
> >                              * It is useful to debug other block drivers by
> >                              * comparing them with a reference one.
> >                              */
> >+    bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted
> >+                            * block if Quorum is reached.
> >+                            */
> >  } BDRVQuorumState;
> >  typedef struct QuorumAIOCB QuorumAIOCB;
> >@@ -104,13 +108,17 @@ struct QuorumAIOCB {
> >      int count;                  /* number of completed AIOCB */
> >      int success_count;          /* number of successfully completed AIOCB */
> >+    int rewrite_count;          /* number of replica to rewrite: count down to
> >+                                 * zero once writes are fired
> >+                                 */
> >+
> >      QuorumVotes votes;
> >      bool is_read;
> >      int vote_ret;
> >  };
> >-static void quorum_vote(QuorumAIOCB *acb);
> >+static bool quorum_vote(QuorumAIOCB *acb);
> >  static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
> >  {
> >@@ -182,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> >      acb->qcrs = g_new0(QuorumChildRequest, s->num_children);
> >      acb->count = 0;
> >      acb->success_count = 0;
> >+    acb->rewrite_count = 0;
> >      acb->votes.compare = quorum_sha256_compare;
> >      QLIST_INIT(&acb->votes.vote_list);
> >      acb->is_read = false;
> >@@ -241,11 +250,27 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb)
> >      return false;
> >  }
> >+static void quorum_rewrite_aio_cb(void *opaque, int ret)
> >+{
> >+    QuorumAIOCB *acb = opaque;
> >+
> >+    /* one less rewrite to do */
> >+    acb->rewrite_count--;
> >+
> >+    /* wait until all rewrite callback have completed */
> 
> *callbacks
> 
> >+    if (acb->rewrite_count) {
> >+        return;
> >+    }
> >+
> >+    quorum_aio_finalize(acb);
> >+}
> >+
> >  static void quorum_aio_cb(void *opaque, int ret)
> >  {
> >      QuorumChildRequest *sacb = opaque;
> >      QuorumAIOCB *acb = sacb->parent;
> >      BDRVQuorumState *s = acb->common.bs->opaque;
> >+    bool rewrite = false;
> >      sacb->ret = ret;
> >      acb->count++;
> >@@ -262,12 +287,15 @@ static void quorum_aio_cb(void *opaque, int ret)
> >      /* Do the vote on read */
> >      if (acb->is_read) {
> >-        quorum_vote(acb);
> >+        rewrite = quorum_vote(acb);
> >      } else {
> >          quorum_has_too_much_io_failed(acb);
> >      }
> >-    quorum_aio_finalize(acb);
> >+    /* if no rewrite is done the code will finish right away */
> >+    if (!rewrite) {
> >+        quorum_aio_finalize(acb);
> >+    }
> >  }
> >  static void quorum_report_bad_versions(BDRVQuorumState *s,
> >@@ -287,6 +315,43 @@ static void quorum_report_bad_versions(BDRVQuorumState *s,
> >      }
> >  }
> >+static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb,
> >+                                        QuorumVoteValue *value)
> >+{
> >+    QuorumVoteVersion *version;
> >+    QuorumVoteItem *item;
> >+    int count = 0;
> >+
> >+    /* first count the number of bad versions: done first to avoid concurency
> 
> *concurrency
> 
> >+     * issues.
> >+     */
> >+    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
> >+        if (acb->votes.compare(&version->value, value)) {
> >+            continue;
> >+        }
> >+        QLIST_FOREACH(item, &version->items, next) {
> >+            count++;
> >+        }
> >+    }
> >+
> >+    /* quorum_rewrite_aio_cb will count down this to zero */
> >+    acb->rewrite_count = count;
> >+
> >+    /* now fire the correcting rewrites */
> >+    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
> >+        if (acb->votes.compare(&version->value, value)) {
> >+            continue;
> >+        }
> >+        QLIST_FOREACH(item, &version->items, next) {
> >+            bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov,
> >+                            acb->nb_sectors, quorum_rewrite_aio_cb, acb);
> >+        }
> >+    }
> >+
> >+    /* return true if any rewrite is done else false */
> >+    return count;
> >+}
> >+
> >  static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
> >  {
> >      int i;
> >@@ -477,16 +542,17 @@ static int quorum_vote_error(QuorumAIOCB *acb)
> >      return ret;
> >  }
> >-static void quorum_vote(QuorumAIOCB *acb)
> >+static bool quorum_vote(QuorumAIOCB *acb)
> >  {
> >      bool quorum = true;
> >+    bool rewrite = false;
> >      int i, j, ret;
> >      QuorumVoteValue hash;
> >      BDRVQuorumState *s = acb->common.bs->opaque;
> >      QuorumVoteVersion *winner;
> >      if (quorum_has_too_much_io_failed(acb)) {
> >-        return;
> >+        return false;;
> 
> Two semicolons here.
> 
> >      }
> >      /* get the index of the first successful read */
> >@@ -514,7 +580,7 @@ static void quorum_vote(QuorumAIOCB *acb)
> >      /* Every successful read agrees */
> >      if (quorum) {
> >          quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov);
> >-        return;
> >+        return false;;
> 
> And here.
> 
> >      }
> >      /* compute hashes for each successful read, also store indexes */
> >@@ -547,9 +613,15 @@ static void quorum_vote(QuorumAIOCB *acb)
> >      /* some versions are bad print them */
> >      quorum_report_bad_versions(s, acb, &winner->value);
> >+    /* corruption correction is actived */
> 
> I'd vote for "enabled" rather than "actived". But I like "corruption
> correction". :-)
> 
> >+    if (s->rewrite_corrupted) {
> >+        rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value);
> >+    }
> >+
> >  free_exit:
> >      /* free lists */
> >      quorum_free_vote_list(&acb->votes);
> >+    return rewrite;
> >  }
> >  static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
> >@@ -709,6 +781,11 @@ static QemuOptsList quorum_runtime_opts = {
> >              .type = QEMU_OPT_BOOL,
> >              .help = "Trigger block verify mode if set",
> >          },
> >+        {
> >+            .name = QUORUM_OPT_REWRITE,
> >+            .type = QEMU_OPT_BOOL,
> >+            .help = "Rewrite corrupted block on read quorum",
> >+        },
> >          { /* end of list */ }
> >      },
> >  };
> >@@ -770,6 +847,14 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
> >                  "and using two files with vote_threshold=2\n");
> >      }
> >+    s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false);
> >+    if (s->rewrite_corrupted && s->is_blkverify) {
> >+        error_setg(&local_err,
> >+                   "rewrite-corrupted=on cannot be used with blkverify=on");
> >+        ret = -EINVAL;
> >+        goto exit;
> >+    }
> >+
> >      /* allocate the children BlockDriverState array */
> >      s->bs = g_new0(BlockDriverState *, s->num_children);
> >      opened = g_new0(bool, s->num_children);
> >diff --git a/qapi-schema.json b/qapi-schema.json
> >index 6476d4a..f5a8767 100644
> >--- a/qapi-schema.json
> >+++ b/qapi-schema.json
> >@@ -4542,12 +4542,15 @@
> >  #
> >  # @vote-threshold: the vote limit under which a read will fail
> >  #
> >+# @rewrite-corrupted: #optional rewrite corrupted data when quorum is reached
> >+#                     (Since 2.1)
> >+#
> >  # Since: 2.0
> >  ##
> >  { 'type': 'BlockdevOptionsQuorum',
> >    'data': { '*blkverify': 'bool',
> >              'children': [ 'BlockdevRef' ],
> >-            'vote-threshold': 'int' } }
> >+            'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } }
> >  ##
> >  # @BlockdevOptions
> >diff --git a/tests/qemu-iotests/081 b/tests/qemu-iotests/081
> >index b512d00..bb211d6 100755
> >--- a/tests/qemu-iotests/081
> >+++ b/tests/qemu-iotests/081
> >@@ -95,6 +95,18 @@ echo "== checking quorum correction =="
> >  $QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io
> >  echo
> >+echo "== using quorum rewrite corrupted mode =="
> >+
> >+quorum="$quorum,file.rewrite-corrupted=on"
> >+
> >+$QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io
> >+
> >+echo
> >+echo "== checking that quorum has corrected the corrupted file =="
> >+
> >+$QEMU_IO -c "read -P 0x32 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io
> >+
> 
> I'd prefer it if you could put these new test cases below the
> "checking mixed reference/option specification" test, just because
> I'd like that test to output the quorum warning - especially there
> is no other point in the test where we can see that QMP message.
> 
> But that's up to you and if you at least fix the double semicolons:

Ok I will do these changes.

Thanks for the review

Best regards

Benoît

> 
> Reviewed-by: Max Reitz <mreitz@redhat.com>
> 
> >+echo
> >  echo "== checking mixed reference/option specification =="
> >  run_qemu -drive "file=$TEST_DIR/2.raw,format=$IMGFMT,if=none,id=drive2" <<EOF
> >@@ -137,6 +149,8 @@ echo
> >  echo "== breaking quorum =="
> >  $QEMU_IO -c "write -P 0x41 0 $size" "$TEST_DIR/1.raw" | _filter_qemu_io
> >+$QEMU_IO -c "write -P 0x42 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io
> >+
> >  echo
> >  echo "== checking that quorum is broken =="
> >diff --git a/tests/qemu-iotests/081.out b/tests/qemu-iotests/081.out
> >index 84aeb0c..2d8b290 100644
> >--- a/tests/qemu-iotests/081.out
> >+++ b/tests/qemu-iotests/081.out
> >@@ -25,12 +25,19 @@ wrote 10485760/10485760 bytes at offset 0
> >  read 10485760/10485760 bytes at offset 0
> >  10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> >+== using quorum rewrite corrupted mode ==
> >+read 10485760/10485760 bytes at offset 0
> >+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> >+
> >+== checking that quorum has corrected the corrupted file ==
> >+read 10485760/10485760 bytes at offset 0
> >+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> >+
> >  == checking mixed reference/option specification ==
> >  Testing: -drive file=TEST_DIR/2.IMGFMT,format=IMGFMT,if=none,id=drive2
> >  QMP_VERSION
> >  {"return": {}}
> >  {"return": {}}
> >-{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "QUORUM_REPORT_BAD", "data": {"node-name": "", "sectors-count": 20480, "sector-num": 0}}
> >  read 10485760/10485760 bytes at offset 0
> >  10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> >  {"return": ""}
> >@@ -43,6 +50,8 @@ read 10485760/10485760 bytes at offset 0
> >  == breaking quorum ==
> >  wrote 10485760/10485760 bytes at offset 0
> >  10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> >+wrote 10485760/10485760 bytes at offset 0
> >+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> >  == checking that quorum is broken ==
> >  qemu-io: can't open device (null): Could not read image for determining its format: Input/output error
> 
>
diff mbox

Patch

diff --git a/block/quorum.c b/block/quorum.c
index bd997b7..af4fd3c 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -22,6 +22,7 @@ 
 
 #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold"
 #define QUORUM_OPT_BLKVERIFY      "blkverify"
+#define QUORUM_OPT_REWRITE        "rewrite-corrupted"
 
 /* This union holds a vote hash value */
 typedef union QuorumVoteValue {
@@ -69,6 +70,9 @@  typedef struct BDRVQuorumState {
                             * It is useful to debug other block drivers by
                             * comparing them with a reference one.
                             */
+    bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted
+                            * block if Quorum is reached.
+                            */
 } BDRVQuorumState;
 
 typedef struct QuorumAIOCB QuorumAIOCB;
@@ -104,13 +108,17 @@  struct QuorumAIOCB {
     int count;                  /* number of completed AIOCB */
     int success_count;          /* number of successfully completed AIOCB */
 
+    int rewrite_count;          /* number of replica to rewrite: count down to
+                                 * zero once writes are fired
+                                 */
+
     QuorumVotes votes;
 
     bool is_read;
     int vote_ret;
 };
 
-static void quorum_vote(QuorumAIOCB *acb);
+static bool quorum_vote(QuorumAIOCB *acb);
 
 static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
 {
@@ -182,6 +190,7 @@  static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
     acb->qcrs = g_new0(QuorumChildRequest, s->num_children);
     acb->count = 0;
     acb->success_count = 0;
+    acb->rewrite_count = 0;
     acb->votes.compare = quorum_sha256_compare;
     QLIST_INIT(&acb->votes.vote_list);
     acb->is_read = false;
@@ -241,11 +250,27 @@  static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb)
     return false;
 }
 
+static void quorum_rewrite_aio_cb(void *opaque, int ret)
+{
+    QuorumAIOCB *acb = opaque;
+
+    /* one less rewrite to do */
+    acb->rewrite_count--;
+
+    /* wait until all rewrite callback have completed */
+    if (acb->rewrite_count) {
+        return;
+    }
+
+    quorum_aio_finalize(acb);
+}
+
 static void quorum_aio_cb(void *opaque, int ret)
 {
     QuorumChildRequest *sacb = opaque;
     QuorumAIOCB *acb = sacb->parent;
     BDRVQuorumState *s = acb->common.bs->opaque;
+    bool rewrite = false;
 
     sacb->ret = ret;
     acb->count++;
@@ -262,12 +287,15 @@  static void quorum_aio_cb(void *opaque, int ret)
 
     /* Do the vote on read */
     if (acb->is_read) {
-        quorum_vote(acb);
+        rewrite = quorum_vote(acb);
     } else {
         quorum_has_too_much_io_failed(acb);
     }
 
-    quorum_aio_finalize(acb);
+    /* if no rewrite is done the code will finish right away */
+    if (!rewrite) {
+        quorum_aio_finalize(acb);
+    }
 }
 
 static void quorum_report_bad_versions(BDRVQuorumState *s,
@@ -287,6 +315,43 @@  static void quorum_report_bad_versions(BDRVQuorumState *s,
     }
 }
 
+static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb,
+                                        QuorumVoteValue *value)
+{
+    QuorumVoteVersion *version;
+    QuorumVoteItem *item;
+    int count = 0;
+
+    /* first count the number of bad versions: done first to avoid concurency
+     * issues.
+     */
+    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
+        if (acb->votes.compare(&version->value, value)) {
+            continue;
+        }
+        QLIST_FOREACH(item, &version->items, next) {
+            count++;
+        }
+    }
+
+    /* quorum_rewrite_aio_cb will count down this to zero */
+    acb->rewrite_count = count;
+
+    /* now fire the correcting rewrites */
+    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
+        if (acb->votes.compare(&version->value, value)) {
+            continue;
+        }
+        QLIST_FOREACH(item, &version->items, next) {
+            bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov,
+                            acb->nb_sectors, quorum_rewrite_aio_cb, acb);
+        }
+    }
+
+    /* return true if any rewrite is done else false */
+    return count;
+}
+
 static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
 {
     int i;
@@ -477,16 +542,17 @@  static int quorum_vote_error(QuorumAIOCB *acb)
     return ret;
 }
 
-static void quorum_vote(QuorumAIOCB *acb)
+static bool quorum_vote(QuorumAIOCB *acb)
 {
     bool quorum = true;
+    bool rewrite = false;
     int i, j, ret;
     QuorumVoteValue hash;
     BDRVQuorumState *s = acb->common.bs->opaque;
     QuorumVoteVersion *winner;
 
     if (quorum_has_too_much_io_failed(acb)) {
-        return;
+        return false;;
     }
 
     /* get the index of the first successful read */
@@ -514,7 +580,7 @@  static void quorum_vote(QuorumAIOCB *acb)
     /* Every successful read agrees */
     if (quorum) {
         quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov);
-        return;
+        return false;;
     }
 
     /* compute hashes for each successful read, also store indexes */
@@ -547,9 +613,15 @@  static void quorum_vote(QuorumAIOCB *acb)
     /* some versions are bad print them */
     quorum_report_bad_versions(s, acb, &winner->value);
 
+    /* corruption correction is actived */
+    if (s->rewrite_corrupted) {
+        rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value);
+    }
+
 free_exit:
     /* free lists */
     quorum_free_vote_list(&acb->votes);
+    return rewrite;
 }
 
 static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
@@ -709,6 +781,11 @@  static QemuOptsList quorum_runtime_opts = {
             .type = QEMU_OPT_BOOL,
             .help = "Trigger block verify mode if set",
         },
+        {
+            .name = QUORUM_OPT_REWRITE,
+            .type = QEMU_OPT_BOOL,
+            .help = "Rewrite corrupted block on read quorum",
+        },
         { /* end of list */ }
     },
 };
@@ -770,6 +847,14 @@  static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
                 "and using two files with vote_threshold=2\n");
     }
 
+    s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false);
+    if (s->rewrite_corrupted && s->is_blkverify) {
+        error_setg(&local_err,
+                   "rewrite-corrupted=on cannot be used with blkverify=on");
+        ret = -EINVAL;
+        goto exit;
+    }
+
     /* allocate the children BlockDriverState array */
     s->bs = g_new0(BlockDriverState *, s->num_children);
     opened = g_new0(bool, s->num_children);
diff --git a/qapi-schema.json b/qapi-schema.json
index 6476d4a..f5a8767 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -4542,12 +4542,15 @@ 
 #
 # @vote-threshold: the vote limit under which a read will fail
 #
+# @rewrite-corrupted: #optional rewrite corrupted data when quorum is reached
+#                     (Since 2.1)
+#
 # Since: 2.0
 ##
 { 'type': 'BlockdevOptionsQuorum',
   'data': { '*blkverify': 'bool',
             'children': [ 'BlockdevRef' ],
-            'vote-threshold': 'int' } }
+            'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } }
 
 ##
 # @BlockdevOptions
diff --git a/tests/qemu-iotests/081 b/tests/qemu-iotests/081
index b512d00..bb211d6 100755
--- a/tests/qemu-iotests/081
+++ b/tests/qemu-iotests/081
@@ -95,6 +95,18 @@  echo "== checking quorum correction =="
 $QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io
 
 echo
+echo "== using quorum rewrite corrupted mode =="
+
+quorum="$quorum,file.rewrite-corrupted=on"
+
+$QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io
+
+echo
+echo "== checking that quorum has corrected the corrupted file =="
+
+$QEMU_IO -c "read -P 0x32 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io
+
+echo
 echo "== checking mixed reference/option specification =="
 
 run_qemu -drive "file=$TEST_DIR/2.raw,format=$IMGFMT,if=none,id=drive2" <<EOF
@@ -137,6 +149,8 @@  echo
 echo "== breaking quorum =="
 
 $QEMU_IO -c "write -P 0x41 0 $size" "$TEST_DIR/1.raw" | _filter_qemu_io
+$QEMU_IO -c "write -P 0x42 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io
+
 echo
 echo "== checking that quorum is broken =="
 
diff --git a/tests/qemu-iotests/081.out b/tests/qemu-iotests/081.out
index 84aeb0c..2d8b290 100644
--- a/tests/qemu-iotests/081.out
+++ b/tests/qemu-iotests/081.out
@@ -25,12 +25,19 @@  wrote 10485760/10485760 bytes at offset 0
 read 10485760/10485760 bytes at offset 0
 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
+== using quorum rewrite corrupted mode ==
+read 10485760/10485760 bytes at offset 0
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== checking that quorum has corrected the corrupted file ==
+read 10485760/10485760 bytes at offset 0
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
 == checking mixed reference/option specification ==
 Testing: -drive file=TEST_DIR/2.IMGFMT,format=IMGFMT,if=none,id=drive2
 QMP_VERSION
 {"return": {}}
 {"return": {}}
-{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "QUORUM_REPORT_BAD", "data": {"node-name": "", "sectors-count": 20480, "sector-num": 0}}
 read 10485760/10485760 bytes at offset 0
 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 {"return": ""}
@@ -43,6 +50,8 @@  read 10485760/10485760 bytes at offset 0
 == breaking quorum ==
 wrote 10485760/10485760 bytes at offset 0
 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 10485760/10485760 bytes at offset 0
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
 == checking that quorum is broken ==
 qemu-io: can't open device (null): Could not read image for determining its format: Input/output error