Patchwork [135/241] md/raid10: close race that lose writes lost when replacement completes.

login
register
mail settings
Submitter Herton Ronaldo Krzesinski
Date Dec. 13, 2012, 1:58 p.m.
Message ID <1355407206-17100-136-git-send-email-herton.krzesinski@canonical.com>
Download mbox | patch
Permalink /patch/205985/
State New
Headers show

Comments

Herton Ronaldo Krzesinski - Dec. 13, 2012, 1:58 p.m.
3.5.7.2 -stable review patch.  If anyone has any objections, please let me know.

------------------

From: NeilBrown <neilb@suse.de>

commit e7c0c3fa29280d62aa5e11101a674bb3064bd791 upstream.

When a replacement operation completes there is a small window
when the original device is marked 'faulty' and the replacement
still looks like a replacement.  The faulty should be removed and
the replacement moved in place very quickly, bit it isn't instant.

So the code write out to the array must handle the possibility that
the only working device for some slot in the replacement - but it
doesn't.  If the primary device is faulty it just gives up.  This
can lead to corruption.

So make the code more robust: if either  the primary or the
replacement is present and working, write to them.  Only when
neither are present do we give up.

This bug has been present since replacement was introduced in
3.3, so it is suitable for any -stable kernel since then.

Reported-by: "George Spelvin" <linux@horizon.com>
Signed-off-by: NeilBrown <neilb@suse.de>
[ herton: hairy code adjustment on 3rd hunk (conf->copies for loop) ]
Signed-off-by: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com>
---
 drivers/md/raid10.c |  113 +++++++++++++++++++++++++++------------------------
 1 file changed, 59 insertions(+), 54 deletions(-)
Ben Hutchings - Dec. 15, 2012, 6:40 p.m.
On Thu, 2012-12-13 at 11:58 -0200, Herton Ronaldo Krzesinski wrote:
> 3.5.7.2 -stable review patch.  If anyone has any objections, please let me know.
> 
> ------------------
> 
> From: NeilBrown <neilb@suse.de>
> 
> commit e7c0c3fa29280d62aa5e11101a674bb3064bd791 upstream.
> 
> When a replacement operation completes there is a small window
> when the original device is marked 'faulty' and the replacement
> still looks like a replacement.  The faulty should be removed and
> the replacement moved in place very quickly, bit it isn't instant.
> 
> So the code write out to the array must handle the possibility that
> the only working device for some slot in the replacement - but it
> doesn't.  If the primary device is faulty it just gives up.  This
> can lead to corruption.
> 
> So make the code more robust: if either  the primary or the
> replacement is present and working, write to them.  Only when
> neither are present do we give up.
> 
> This bug has been present since replacement was introduced in
> 3.3, so it is suitable for any -stable kernel since then.

This is missing from 3.4, so Greg will presumably want to apply this (if
the backport is correct).

Ben.

> Reported-by: "George Spelvin" <linux@horizon.com>
> Signed-off-by: NeilBrown <neilb@suse.de>
> [ herton: hairy code adjustment on 3rd hunk (conf->copies for loop) ]
> Signed-off-by: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com>
> ---
>  drivers/md/raid10.c |  113 +++++++++++++++++++++++++++------------------------
>  1 file changed, 59 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 17fae37..0920adf 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1267,18 +1267,21 @@ retry_write:
>  			blocked_rdev = rrdev;
>  			break;
>  		}
> +		if (rdev && (test_bit(Faulty, &rdev->flags)
> +			     || test_bit(Unmerged, &rdev->flags)))
> +			rdev = NULL;
>  		if (rrdev && (test_bit(Faulty, &rrdev->flags)
>  			      || test_bit(Unmerged, &rrdev->flags)))
>  			rrdev = NULL;
>  
>  		r10_bio->devs[i].bio = NULL;
>  		r10_bio->devs[i].repl_bio = NULL;
> -		if (!rdev || test_bit(Faulty, &rdev->flags) ||
> -		    test_bit(Unmerged, &rdev->flags)) {
> +
> +		if (!rdev && !rrdev) {
>  			set_bit(R10BIO_Degraded, &r10_bio->state);
>  			continue;
>  		}
> -		if (test_bit(WriteErrorSeen, &rdev->flags)) {
> +		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
>  			sector_t first_bad;
>  			sector_t dev_sector = r10_bio->devs[i].addr;
>  			int bad_sectors;
> @@ -1320,8 +1323,10 @@ retry_write:
>  					max_sectors = good_sectors;
>  			}
>  		}
> -		r10_bio->devs[i].bio = bio;
> -		atomic_inc(&rdev->nr_pending);
> +		if (rdev) {
> +			r10_bio->devs[i].bio = bio;
> +			atomic_inc(&rdev->nr_pending);
> +		}
>  		if (rrdev) {
>  			r10_bio->devs[i].repl_bio = bio;
>  			atomic_inc(&rrdev->nr_pending);
> @@ -1377,58 +1382,58 @@ retry_write:
>  	for (i = 0; i < conf->copies; i++) {
>  		struct bio *mbio;
>  		int d = r10_bio->devs[i].devnum;
> -		if (!r10_bio->devs[i].bio)
> -			continue;
> -
> -		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> -		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> -			    max_sectors);
> -		r10_bio->devs[i].bio = mbio;
> -
> -		mbio->bi_sector	= (r10_bio->devs[i].addr+
> -				   choose_data_offset(r10_bio,
> -						      conf->mirrors[d].rdev));
> -		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
> -		mbio->bi_end_io	= raid10_end_write_request;
> -		mbio->bi_rw = WRITE | do_sync | do_fua;
> -		mbio->bi_private = r10_bio;
> -
> -		atomic_inc(&r10_bio->remaining);
> -		spin_lock_irqsave(&conf->device_lock, flags);
> -		bio_list_add(&conf->pending_bio_list, mbio);
> -		conf->pending_count++;
> -		spin_unlock_irqrestore(&conf->device_lock, flags);
> -		if (!mddev_check_plugged(mddev))
> -			md_wakeup_thread(mddev->thread);
> -
> -		if (!r10_bio->devs[i].repl_bio)
> -			continue;
> +		if (r10_bio->devs[i].bio) {
> +			struct md_rdev *rdev = conf->mirrors[d].rdev;
> +			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> +			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> +				    max_sectors);
> +			r10_bio->devs[i].bio = mbio;
> +
> +			mbio->bi_sector	= (r10_bio->devs[i].addr+
> +					   choose_data_offset(r10_bio,
> +							      rdev));
> +			mbio->bi_bdev = rdev->bdev;
> +			mbio->bi_end_io = raid10_end_write_request;
> +			mbio->bi_rw = WRITE | do_sync | do_fua;
> +			mbio->bi_private = r10_bio;
>  
> -		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> -		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> -			    max_sectors);
> -		r10_bio->devs[i].repl_bio = mbio;
> +			atomic_inc(&r10_bio->remaining);
> +			spin_lock_irqsave(&conf->device_lock, flags);
> +			bio_list_add(&conf->pending_bio_list, mbio);
> +			conf->pending_count++;
> +			spin_unlock_irqrestore(&conf->device_lock, flags);
> +			if (!mddev_check_plugged(mddev))
> +				md_wakeup_thread(mddev->thread);
> +		}
>  
> -		/* We are actively writing to the original device
> -		 * so it cannot disappear, so the replacement cannot
> -		 * become NULL here
> -		 */
> -		mbio->bi_sector	= (r10_bio->devs[i].addr +
> -				   choose_data_offset(
> -					   r10_bio,
> -					   conf->mirrors[d].replacement));
> -		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
> -		mbio->bi_end_io	= raid10_end_write_request;
> -		mbio->bi_rw = WRITE | do_sync | do_fua;
> -		mbio->bi_private = r10_bio;
> +		if (r10_bio->devs[i].repl_bio) {
> +			struct md_rdev *rdev = conf->mirrors[d].replacement;
> +			if (rdev == NULL) {
> +				/* Replacement just got moved to main 'rdev' */
> +				smp_mb();
> +				rdev = conf->mirrors[d].rdev;
> +			}
> +			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> +			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> +				    max_sectors);
> +			r10_bio->devs[i].repl_bio = mbio;
> +
> +			mbio->bi_sector = (r10_bio->devs[i].addr +
> +					   choose_data_offset(
> +						   r10_bio, rdev));
> +			mbio->bi_bdev = rdev->bdev;
> +			mbio->bi_end_io = raid10_end_write_request;
> +			mbio->bi_rw = WRITE | do_sync | do_fua;
> +			mbio->bi_private = r10_bio;
>  
> -		atomic_inc(&r10_bio->remaining);
> -		spin_lock_irqsave(&conf->device_lock, flags);
> -		bio_list_add(&conf->pending_bio_list, mbio);
> -		conf->pending_count++;
> -		spin_unlock_irqrestore(&conf->device_lock, flags);
> -		if (!mddev_check_plugged(mddev))
> -			md_wakeup_thread(mddev->thread);
> +			atomic_inc(&r10_bio->remaining);
> +			spin_lock_irqsave(&conf->device_lock, flags);
> +			bio_list_add(&conf->pending_bio_list, mbio);
> +			conf->pending_count++;
> +			spin_unlock_irqrestore(&conf->device_lock, flags);
> +			if (!mddev_check_plugged(mddev))
> +				md_wakeup_thread(mddev->thread);
> +		}
>  	}
>  
>  	/* Don't remove the bias on 'remaining' (one_write_done) until

Patch

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 17fae37..0920adf 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1267,18 +1267,21 @@  retry_write:
 			blocked_rdev = rrdev;
 			break;
 		}
+		if (rdev && (test_bit(Faulty, &rdev->flags)
+			     || test_bit(Unmerged, &rdev->flags)))
+			rdev = NULL;
 		if (rrdev && (test_bit(Faulty, &rrdev->flags)
 			      || test_bit(Unmerged, &rrdev->flags)))
 			rrdev = NULL;
 
 		r10_bio->devs[i].bio = NULL;
 		r10_bio->devs[i].repl_bio = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags) ||
-		    test_bit(Unmerged, &rdev->flags)) {
+
+		if (!rdev && !rrdev) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 			continue;
 		}
-		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
 			sector_t first_bad;
 			sector_t dev_sector = r10_bio->devs[i].addr;
 			int bad_sectors;
@@ -1320,8 +1323,10 @@  retry_write:
 					max_sectors = good_sectors;
 			}
 		}
-		r10_bio->devs[i].bio = bio;
-		atomic_inc(&rdev->nr_pending);
+		if (rdev) {
+			r10_bio->devs[i].bio = bio;
+			atomic_inc(&rdev->nr_pending);
+		}
 		if (rrdev) {
 			r10_bio->devs[i].repl_bio = bio;
 			atomic_inc(&rrdev->nr_pending);
@@ -1377,58 +1382,58 @@  retry_write:
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
-		if (!r10_bio->devs[i].bio)
-			continue;
-
-		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-			    max_sectors);
-		r10_bio->devs[i].bio = mbio;
-
-		mbio->bi_sector	= (r10_bio->devs[i].addr+
-				   choose_data_offset(r10_bio,
-						      conf->mirrors[d].rdev));
-		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
-		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync | do_fua;
-		mbio->bi_private = r10_bio;
-
-		atomic_inc(&r10_bio->remaining);
-		spin_lock_irqsave(&conf->device_lock, flags);
-		bio_list_add(&conf->pending_bio_list, mbio);
-		conf->pending_count++;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!mddev_check_plugged(mddev))
-			md_wakeup_thread(mddev->thread);
-
-		if (!r10_bio->devs[i].repl_bio)
-			continue;
+		if (r10_bio->devs[i].bio) {
+			struct md_rdev *rdev = conf->mirrors[d].rdev;
+			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+				    max_sectors);
+			r10_bio->devs[i].bio = mbio;
+
+			mbio->bi_sector	= (r10_bio->devs[i].addr+
+					   choose_data_offset(r10_bio,
+							      rdev));
+			mbio->bi_bdev = rdev->bdev;
+			mbio->bi_end_io = raid10_end_write_request;
+			mbio->bi_rw = WRITE | do_sync | do_fua;
+			mbio->bi_private = r10_bio;
 
-		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-			    max_sectors);
-		r10_bio->devs[i].repl_bio = mbio;
+			atomic_inc(&r10_bio->remaining);
+			spin_lock_irqsave(&conf->device_lock, flags);
+			bio_list_add(&conf->pending_bio_list, mbio);
+			conf->pending_count++;
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			if (!mddev_check_plugged(mddev))
+				md_wakeup_thread(mddev->thread);
+		}
 
-		/* We are actively writing to the original device
-		 * so it cannot disappear, so the replacement cannot
-		 * become NULL here
-		 */
-		mbio->bi_sector	= (r10_bio->devs[i].addr +
-				   choose_data_offset(
-					   r10_bio,
-					   conf->mirrors[d].replacement));
-		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
-		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync | do_fua;
-		mbio->bi_private = r10_bio;
+		if (r10_bio->devs[i].repl_bio) {
+			struct md_rdev *rdev = conf->mirrors[d].replacement;
+			if (rdev == NULL) {
+				/* Replacement just got moved to main 'rdev' */
+				smp_mb();
+				rdev = conf->mirrors[d].rdev;
+			}
+			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+				    max_sectors);
+			r10_bio->devs[i].repl_bio = mbio;
+
+			mbio->bi_sector = (r10_bio->devs[i].addr +
+					   choose_data_offset(
+						   r10_bio, rdev));
+			mbio->bi_bdev = rdev->bdev;
+			mbio->bi_end_io = raid10_end_write_request;
+			mbio->bi_rw = WRITE | do_sync | do_fua;
+			mbio->bi_private = r10_bio;
 
-		atomic_inc(&r10_bio->remaining);
-		spin_lock_irqsave(&conf->device_lock, flags);
-		bio_list_add(&conf->pending_bio_list, mbio);
-		conf->pending_count++;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!mddev_check_plugged(mddev))
-			md_wakeup_thread(mddev->thread);
+			atomic_inc(&r10_bio->remaining);
+			spin_lock_irqsave(&conf->device_lock, flags);
+			bio_list_add(&conf->pending_bio_list, mbio);
+			conf->pending_count++;
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			if (!mddev_check_plugged(mddev))
+				md_wakeup_thread(mddev->thread);
+		}
 	}
 
 	/* Don't remove the bias on 'remaining' (one_write_done) until