Message ID | 1355407206-17100-136-git-send-email-herton.krzesinski@canonical.com |
---|---|
State | New |
Headers | show |
On Thu, 2012-12-13 at 11:58 -0200, Herton Ronaldo Krzesinski wrote: > 3.5.7.2 -stable review patch. If anyone has any objections, please let me know. > > ------------------ > > From: NeilBrown <neilb@suse.de> > > commit e7c0c3fa29280d62aa5e11101a674bb3064bd791 upstream. > > When a replacement operation completes there is a small window > when the original device is marked 'faulty' and the replacement > still looks like a replacement. The faulty should be removed and > the replacement moved in place very quickly, bit it isn't instant. > > So the code write out to the array must handle the possibility that > the only working device for some slot in the replacement - but it > doesn't. If the primary device is faulty it just gives up. This > can lead to corruption. > > So make the code more robust: if either the primary or the > replacement is present and working, write to them. Only when > neither are present do we give up. > > This bug has been present since replacement was introduced in > 3.3, so it is suitable for any -stable kernel since then. This is missing from 3.4, so Greg will presumably want to apply this (if the backport is correct). Ben. > Reported-by: "George Spelvin" <linux@horizon.com> > Signed-off-by: NeilBrown <neilb@suse.de> > [ herton: hairy code adjustment on 3rd hunk (conf->copies for loop) ] > Signed-off-by: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> > --- > drivers/md/raid10.c | 113 +++++++++++++++++++++++++++------------------------ > 1 file changed, 59 insertions(+), 54 deletions(-) > > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c > index 17fae37..0920adf 100644 > --- a/drivers/md/raid10.c > +++ b/drivers/md/raid10.c > @@ -1267,18 +1267,21 @@ retry_write: > blocked_rdev = rrdev; > break; > } > + if (rdev && (test_bit(Faulty, &rdev->flags) > + || test_bit(Unmerged, &rdev->flags))) > + rdev = NULL; > if (rrdev && (test_bit(Faulty, &rrdev->flags) > || test_bit(Unmerged, &rrdev->flags))) > rrdev = NULL; > > r10_bio->devs[i].bio = NULL; > r10_bio->devs[i].repl_bio = NULL; > - if (!rdev || test_bit(Faulty, &rdev->flags) || > - test_bit(Unmerged, &rdev->flags)) { > + > + if (!rdev && !rrdev) { > set_bit(R10BIO_Degraded, &r10_bio->state); > continue; > } > - if (test_bit(WriteErrorSeen, &rdev->flags)) { > + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { > sector_t first_bad; > sector_t dev_sector = r10_bio->devs[i].addr; > int bad_sectors; > @@ -1320,8 +1323,10 @@ retry_write: > max_sectors = good_sectors; > } > } > - r10_bio->devs[i].bio = bio; > - atomic_inc(&rdev->nr_pending); > + if (rdev) { > + r10_bio->devs[i].bio = bio; > + atomic_inc(&rdev->nr_pending); > + } > if (rrdev) { > r10_bio->devs[i].repl_bio = bio; > atomic_inc(&rrdev->nr_pending); > @@ -1377,58 +1382,58 @@ retry_write: > for (i = 0; i < conf->copies; i++) { > struct bio *mbio; > int d = r10_bio->devs[i].devnum; > - if (!r10_bio->devs[i].bio) > - continue; > - > - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); > - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, > - max_sectors); > - r10_bio->devs[i].bio = mbio; > - > - mbio->bi_sector = (r10_bio->devs[i].addr+ > - choose_data_offset(r10_bio, > - conf->mirrors[d].rdev)); > - mbio->bi_bdev = conf->mirrors[d].rdev->bdev; > - mbio->bi_end_io = raid10_end_write_request; > - mbio->bi_rw = WRITE | do_sync | do_fua; > - mbio->bi_private = r10_bio; > - > - atomic_inc(&r10_bio->remaining); > - spin_lock_irqsave(&conf->device_lock, flags); > - bio_list_add(&conf->pending_bio_list, mbio); > - conf->pending_count++; > - spin_unlock_irqrestore(&conf->device_lock, flags); > - if (!mddev_check_plugged(mddev)) > - md_wakeup_thread(mddev->thread); > - > - if (!r10_bio->devs[i].repl_bio) > - continue; > + if (r10_bio->devs[i].bio) { > + struct md_rdev *rdev = conf->mirrors[d].rdev; > + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); > + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, > + max_sectors); > + r10_bio->devs[i].bio = mbio; > + > + mbio->bi_sector = (r10_bio->devs[i].addr+ > + choose_data_offset(r10_bio, > + rdev)); > + mbio->bi_bdev = rdev->bdev; > + mbio->bi_end_io = raid10_end_write_request; > + mbio->bi_rw = WRITE | do_sync | do_fua; > + mbio->bi_private = r10_bio; > > - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); > - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, > - max_sectors); > - r10_bio->devs[i].repl_bio = mbio; > + atomic_inc(&r10_bio->remaining); > + spin_lock_irqsave(&conf->device_lock, flags); > + bio_list_add(&conf->pending_bio_list, mbio); > + conf->pending_count++; > + spin_unlock_irqrestore(&conf->device_lock, flags); > + if (!mddev_check_plugged(mddev)) > + md_wakeup_thread(mddev->thread); > + } > > - /* We are actively writing to the original device > - * so it cannot disappear, so the replacement cannot > - * become NULL here > - */ > - mbio->bi_sector = (r10_bio->devs[i].addr + > - choose_data_offset( > - r10_bio, > - conf->mirrors[d].replacement)); > - mbio->bi_bdev = conf->mirrors[d].replacement->bdev; > - mbio->bi_end_io = raid10_end_write_request; > - mbio->bi_rw = WRITE | do_sync | do_fua; > - mbio->bi_private = r10_bio; > + if (r10_bio->devs[i].repl_bio) { > + struct md_rdev *rdev = conf->mirrors[d].replacement; > + if (rdev == NULL) { > + /* Replacement just got moved to main 'rdev' */ > + smp_mb(); > + rdev = conf->mirrors[d].rdev; > + } > + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); > + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, > + max_sectors); > + r10_bio->devs[i].repl_bio = mbio; > + > + mbio->bi_sector = (r10_bio->devs[i].addr + > + choose_data_offset( > + r10_bio, rdev)); > + mbio->bi_bdev = rdev->bdev; > + mbio->bi_end_io = raid10_end_write_request; > + mbio->bi_rw = WRITE | do_sync | do_fua; > + mbio->bi_private = r10_bio; > > - atomic_inc(&r10_bio->remaining); > - spin_lock_irqsave(&conf->device_lock, flags); > - bio_list_add(&conf->pending_bio_list, mbio); > - conf->pending_count++; > - spin_unlock_irqrestore(&conf->device_lock, flags); > - if (!mddev_check_plugged(mddev)) > - md_wakeup_thread(mddev->thread); > + atomic_inc(&r10_bio->remaining); > + spin_lock_irqsave(&conf->device_lock, flags); > + bio_list_add(&conf->pending_bio_list, mbio); > + conf->pending_count++; > + spin_unlock_irqrestore(&conf->device_lock, flags); > + if (!mddev_check_plugged(mddev)) > + md_wakeup_thread(mddev->thread); > + } > } > > /* Don't remove the bias on 'remaining' (one_write_done) until
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 17fae37..0920adf 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1267,18 +1267,21 @@ retry_write: blocked_rdev = rrdev; break; } + if (rdev && (test_bit(Faulty, &rdev->flags) + || test_bit(Unmerged, &rdev->flags))) + rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags) || test_bit(Unmerged, &rrdev->flags))) rrdev = NULL; r10_bio->devs[i].bio = NULL; r10_bio->devs[i].repl_bio = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags) || - test_bit(Unmerged, &rdev->flags)) { + + if (!rdev && !rrdev) { set_bit(R10BIO_Degraded, &r10_bio->state); continue; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; int bad_sectors; @@ -1320,8 +1323,10 @@ retry_write: max_sectors = good_sectors; } } - r10_bio->devs[i].bio = bio; - atomic_inc(&rdev->nr_pending); + if (rdev) { + r10_bio->devs[i].bio = bio; + atomic_inc(&rdev->nr_pending); + } if (rrdev) { r10_bio->devs[i].repl_bio = bio; atomic_inc(&rrdev->nr_pending); @@ -1377,58 +1382,58 @@ retry_write: for (i = 0; i < conf->copies; i++) { struct bio *mbio; int d = r10_bio->devs[i].devnum; - if (!r10_bio->devs[i].bio) - continue; - - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].bio = mbio; - - mbio->bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, - conf->mirrors[d].rdev)); - mbio->bi_bdev = conf->mirrors[d].rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; - mbio->bi_private = r10_bio; - - atomic_inc(&r10_bio->remaining); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) - md_wakeup_thread(mddev->thread); - - if (!r10_bio->devs[i].repl_bio) - continue; + if (r10_bio->devs[i].bio) { + struct md_rdev *rdev = conf->mirrors[d].rdev; + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].bio = mbio; + + mbio->bi_sector = (r10_bio->devs[i].addr+ + choose_data_offset(r10_bio, + rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_private = r10_bio; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].repl_bio = mbio; + atomic_inc(&r10_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); + } - /* We are actively writing to the original device - * so it cannot disappear, so the replacement cannot - * become NULL here - */ - mbio->bi_sector = (r10_bio->devs[i].addr + - choose_data_offset( - r10_bio, - conf->mirrors[d].replacement)); - mbio->bi_bdev = conf->mirrors[d].replacement->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; - mbio->bi_private = r10_bio; + if (r10_bio->devs[i].repl_bio) { + struct md_rdev *rdev = conf->mirrors[d].replacement; + if (rdev == NULL) { + /* Replacement just got moved to main 'rdev' */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].repl_bio = mbio; + + mbio->bi_sector = (r10_bio->devs[i].addr + + choose_data_offset( + r10_bio, rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_private = r10_bio; - atomic_inc(&r10_bio->remaining); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) - md_wakeup_thread(mddev->thread); + atomic_inc(&r10_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); + } } /* Don't remove the bias on 'remaining' (one_write_done) until