diff mbox

[1/2] mtd: brcmnand: Add check for erased page bitflips

Message ID 1461961285-24159-1-git-send-email-kdasu.kdev@gmail.com
State Superseded
Headers show

Commit Message

Kamal Dasu April 29, 2016, 8:21 p.m. UTC
Check for erased page bitflips in a page. And if well within
threshold return data as all 0xff.

Signed-off-by: Kamal Dasu <kdasu.kdev@gmail.com>
---
 drivers/mtd/nand/brcmnand/brcmnand.c | 83 +++++++++++++++++++++++++++++++++---
 1 file changed, 78 insertions(+), 5 deletions(-)

Comments

Boris Brezillon May 30, 2016, 8:42 a.m. UTC | #1
Hi Kamal,

On Fri, 29 Apr 2016 16:21:24 -0400
Kamal Dasu <kdasu.kdev@gmail.com> wrote:

> Check for erased page bitflips in a page. And if well within
> threshold return data as all 0xff.
> 
> Signed-off-by: Kamal Dasu <kdasu.kdev@gmail.com>
> ---
>  drivers/mtd/nand/brcmnand/brcmnand.c | 83 +++++++++++++++++++++++++++++++++---
>  1 file changed, 78 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/mtd/nand/brcmnand/brcmnand.c b/drivers/mtd/nand/brcmnand/brcmnand.c
> index e052839..29a9abd 100644
> --- a/drivers/mtd/nand/brcmnand/brcmnand.c
> +++ b/drivers/mtd/nand/brcmnand/brcmnand.c
> @@ -1490,6 +1490,64 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>  	return ret;
>  }
>  
> +/*
> + * Check a page to see if it is erased (w/ bitflips) after an uncorrectable ECC
> + * error
> + *
> + * Because the HW ECC signals an ECC error if an erase paged has even a single
> + * bitflip, we must check each ECC error to see if it is actually an erased
> + * page with bitflips, not a truly corrupted page.
> + *
> + * On a real error, return a negative error code (-EBADMSG for ECC error), and
> + * buf will contain raw data.
> + * Otherwise, fill buf with 0xff and return the maximum number of
> + * bitflips-per-ECC-sector to the caller.
> + *
> + */
> +static int brcmstb_nand_verify_erased_page(struct mtd_info *mtd,
> +		  struct nand_chip *chip, void *buf, u64 addr)
> +{
> +	int i, sas, oob_nbits, data_nbits;
> +	void *oob = chip->oob_poi;
> +	unsigned int max_bitflips = 0;
> +	int page = addr >> chip->page_shift;
> +	int ret;
> +
> +	if (!buf) {
> +		buf = chip->buffers->databuf;
> +		/* Invalidate page cache */
> +		chip->pagebuf = -1;
> +	}
> +
> +	sas = mtd->oobsize / chip->ecc.steps;
> +	oob_nbits = sas << 3;
> +	data_nbits = chip->ecc.size << 3;
> +
> +	/* read without ecc for verification */
> +	chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
> +	ret = chip->ecc.read_page_raw(mtd, chip, buf, true, page);

Do you really need to read the whole page in raw mode? Usually, only
reading the OOB sections is enough.

> +	if (ret)
> +		return ret;
> +
> +	for (i = 0; i < chip->ecc.steps; i++, oob += sas) {
> +		unsigned int bitflips = 0;
> +
> +		bitflips += oob_nbits - bitmap_weight(oob, oob_nbits);
> +		bitflips += data_nbits - bitmap_weight(buf, data_nbits);
> +
> +		buf += chip->ecc.size;
> +		addr += chip->ecc.size;

You seem to duplicate nand_check_erased_ecc_chunk() here. Do you have a
good reason for doing that?

Regards,

Boris
Kamal Dasu June 1, 2016, 4:46 p.m. UTC | #2
Boris,



On Mon, May 30, 2016 at 4:42 AM, Boris Brezillon
<boris.brezillon@free-electrons.com> wrote:
> Hi Kamal,
>
> On Fri, 29 Apr 2016 16:21:24 -0400
> Kamal Dasu <kdasu.kdev@gmail.com> wrote:
>
>> Check for erased page bitflips in a page. And if well within
>> threshold return data as all 0xff.
>>
>> Signed-off-by: Kamal Dasu <kdasu.kdev@gmail.com>
>> ---
>>  drivers/mtd/nand/brcmnand/brcmnand.c | 83 +++++++++++++++++++++++++++++++++---
>>  1 file changed, 78 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/mtd/nand/brcmnand/brcmnand.c b/drivers/mtd/nand/brcmnand/brcmnand.c
>> index e052839..29a9abd 100644
>> --- a/drivers/mtd/nand/brcmnand/brcmnand.c
>> +++ b/drivers/mtd/nand/brcmnand/brcmnand.c
>> @@ -1490,6 +1490,64 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>       return ret;
>>  }
>>
>> +/*
>> + * Check a page to see if it is erased (w/ bitflips) after an uncorrectable ECC
>> + * error
>> + *
>> + * Because the HW ECC signals an ECC error if an erase paged has even a single
>> + * bitflip, we must check each ECC error to see if it is actually an erased
>> + * page with bitflips, not a truly corrupted page.
>> + *
>> + * On a real error, return a negative error code (-EBADMSG for ECC error), and
>> + * buf will contain raw data.
>> + * Otherwise, fill buf with 0xff and return the maximum number of
>> + * bitflips-per-ECC-sector to the caller.
>> + *
>> + */
>> +static int brcmstb_nand_verify_erased_page(struct mtd_info *mtd,
>> +               struct nand_chip *chip, void *buf, u64 addr)
>> +{
>> +     int i, sas, oob_nbits, data_nbits;
>> +     void *oob = chip->oob_poi;
>> +     unsigned int max_bitflips = 0;
>> +     int page = addr >> chip->page_shift;
>> +     int ret;
>> +
>> +     if (!buf) {
>> +             buf = chip->buffers->databuf;
>> +             /* Invalidate page cache */
>> +             chip->pagebuf = -1;
>> +     }
>> +
>> +     sas = mtd->oobsize / chip->ecc.steps;
>> +     oob_nbits = sas << 3;
>> +     data_nbits = chip->ecc.size << 3;
>> +
>> +     /* read without ecc for verification */
>> +     chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
>> +     ret = chip->ecc.read_page_raw(mtd, chip, buf, true, page);
>
> Do you really need to read the whole page in raw mode? Usually, only
> reading the OOB sections is enough.
>

Just so that the HW ecc algo does not kick in we read the page in raw
mode. OOB registers are filled as part of this read. Also just the
data might have bit flips and OOB might be all FFs, we still need to
read  page data. Generally we read again to make sure that the hw ecc
algo did not change the data after calculations in the original  read
where it reported uncorrectable error.  I will double check this.

>> +     if (ret)
>> +             return ret;
>> +
>> +     for (i = 0; i < chip->ecc.steps; i++, oob += sas) {
>> +             unsigned int bitflips = 0;
>> +
>> +             bitflips += oob_nbits - bitmap_weight(oob, oob_nbits);
>> +             bitflips += data_nbits - bitmap_weight(buf, data_nbits);
>> +
>> +             buf += chip->ecc.size;
>> +             addr += chip->ecc.size;
>
> You seem to duplicate nand_check_erased_ecc_chunk() here. Do you have a
> good reason for doing that?
>

Hmmm I see what you are saying. Let me try setting the
NAND_ECC_GENERIC_ERASED_CHECK option and see if we can get away
without having to read raw.  I will have to test and make sure on
uncorrectable error the hw leaves the return page data buffers and oob
buffers in raw state.

If that works as expected I will get rid of this duplication and send
a revised change which shall make use of the
NAND_ECC_GENERIC_ERASED_CHECK option.

Thanks
Kamal
Brian Norris June 1, 2016, 5:14 p.m. UTC | #3
On Wed, Jun 01, 2016 at 12:46:18PM -0400, Kamal Dasu wrote:
> On Mon, May 30, 2016 at 4:42 AM, Boris Brezillon
> <boris.brezillon@free-electrons.com> wrote:
> > On Fri, 29 Apr 2016 16:21:24 -0400
> > Kamal Dasu <kdasu.kdev@gmail.com> wrote:

> >> +     if (ret)
> >> +             return ret;
> >> +
> >> +     for (i = 0; i < chip->ecc.steps; i++, oob += sas) {
> >> +             unsigned int bitflips = 0;
> >> +
> >> +             bitflips += oob_nbits - bitmap_weight(oob, oob_nbits);
> >> +             bitflips += data_nbits - bitmap_weight(buf, data_nbits);
> >> +
> >> +             buf += chip->ecc.size;
> >> +             addr += chip->ecc.size;
> >
> > You seem to duplicate nand_check_erased_ecc_chunk() here. Do you have a
> > good reason for doing that?
> >
> 
> Hmmm I see what you are saying. Let me try setting the
> NAND_ECC_GENERIC_ERASED_CHECK option and see if we can get away
> without having to read raw.  I will have to test and make sure on
> uncorrectable error the hw leaves the return page data buffers and oob
> buffers in raw state.

I'm quite sure you can't make use of NAND_ECC_GENERIC_ERASED_CHECK
unless you do a substantial rewrite; brcmnand doesn't use any of the
nand_base ecc.read_{page,subpage} callbacks.

> If that works as expected I will get rid of this duplication and send
> a revised change which shall make use of the
> NAND_ECC_GENERIC_ERASED_CHECK option.

I suspect he was just suggesting calling the
nand_check_erased_ecc_chunk() helper instead of doing your own
bitmap_weight() calls.

Brian
Boris Brezillon June 1, 2016, 5:22 p.m. UTC | #4
On Wed, 1 Jun 2016 10:14:40 -0700
Brian Norris <computersforpeace@gmail.com> wrote:

> On Wed, Jun 01, 2016 at 12:46:18PM -0400, Kamal Dasu wrote:
> > On Mon, May 30, 2016 at 4:42 AM, Boris Brezillon
> > <boris.brezillon@free-electrons.com> wrote:  
> > > On Fri, 29 Apr 2016 16:21:24 -0400
> > > Kamal Dasu <kdasu.kdev@gmail.com> wrote:  
> 
> > >> +     if (ret)
> > >> +             return ret;
> > >> +
> > >> +     for (i = 0; i < chip->ecc.steps; i++, oob += sas) {
> > >> +             unsigned int bitflips = 0;
> > >> +
> > >> +             bitflips += oob_nbits - bitmap_weight(oob, oob_nbits);
> > >> +             bitflips += data_nbits - bitmap_weight(buf, data_nbits);
> > >> +
> > >> +             buf += chip->ecc.size;
> > >> +             addr += chip->ecc.size;  
> > >
> > > You seem to duplicate nand_check_erased_ecc_chunk() here. Do you have a
> > > good reason for doing that?
> > >  
> > 
> > Hmmm I see what you are saying. Let me try setting the
> > NAND_ECC_GENERIC_ERASED_CHECK option and see if we can get away
> > without having to read raw.  I will have to test and make sure on
> > uncorrectable error the hw leaves the return page data buffers and oob
> > buffers in raw state.  
> 
> I'm quite sure you can't make use of NAND_ECC_GENERIC_ERASED_CHECK
> unless you do a substantial rewrite; brcmnand doesn't use any of the
> nand_base ecc.read_{page,subpage} callbacks.
> 
> > If that works as expected I will get rid of this duplication and send
> > a revised change which shall make use of the
> > NAND_ECC_GENERIC_ERASED_CHECK option.  
> 
> I suspect he was just suggesting calling the
> nand_check_erased_ecc_chunk() helper instead of doing your own
> bitmap_weight() calls.

Exactly.
Kamal Dasu June 1, 2016, 5:27 p.m. UTC | #5
Boris,

>> I'm quite sure you can't make use of NAND_ECC_GENERIC_ERASED_CHECK
>> unless you do a substantial rewrite; brcmnand doesn't use any of the
>> nand_base ecc.read_{page,subpage} callbacks.
>>
>> > If that works as expected I will get rid of this duplication and send
>> > a revised change which shall make use of the
>> > NAND_ECC_GENERIC_ERASED_CHECK option.
>>
>> I suspect he was just suggesting calling the
>> nand_check_erased_ecc_chunk() helper instead of doing your own
>> bitmap_weight() calls.
>
> Exactly.
>
>

Ok got it. Yes I did realize that using the option is not straight
forward as far as brcmnand driver is concerned.

Thanks
Kamal
diff mbox

Patch

diff --git a/drivers/mtd/nand/brcmnand/brcmnand.c b/drivers/mtd/nand/brcmnand/brcmnand.c
index e052839..29a9abd 100644
--- a/drivers/mtd/nand/brcmnand/brcmnand.c
+++ b/drivers/mtd/nand/brcmnand/brcmnand.c
@@ -1490,6 +1490,64 @@  static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
 	return ret;
 }
 
+/*
+ * Check a page to see if it is erased (w/ bitflips) after an uncorrectable ECC
+ * error
+ *
+ * Because the HW ECC signals an ECC error if an erase paged has even a single
+ * bitflip, we must check each ECC error to see if it is actually an erased
+ * page with bitflips, not a truly corrupted page.
+ *
+ * On a real error, return a negative error code (-EBADMSG for ECC error), and
+ * buf will contain raw data.
+ * Otherwise, fill buf with 0xff and return the maximum number of
+ * bitflips-per-ECC-sector to the caller.
+ *
+ */
+static int brcmstb_nand_verify_erased_page(struct mtd_info *mtd,
+		  struct nand_chip *chip, void *buf, u64 addr)
+{
+	int i, sas, oob_nbits, data_nbits;
+	void *oob = chip->oob_poi;
+	unsigned int max_bitflips = 0;
+	int page = addr >> chip->page_shift;
+	int ret;
+
+	if (!buf) {
+		buf = chip->buffers->databuf;
+		/* Invalidate page cache */
+		chip->pagebuf = -1;
+	}
+
+	sas = mtd->oobsize / chip->ecc.steps;
+	oob_nbits = sas << 3;
+	data_nbits = chip->ecc.size << 3;
+
+	/* read without ecc for verification */
+	chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
+	ret = chip->ecc.read_page_raw(mtd, chip, buf, true, page);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < chip->ecc.steps; i++, oob += sas) {
+		unsigned int bitflips = 0;
+
+		bitflips += oob_nbits - bitmap_weight(oob, oob_nbits);
+		bitflips += data_nbits - bitmap_weight(buf, data_nbits);
+
+		buf += chip->ecc.size;
+		addr += chip->ecc.size;
+
+		/* Too many bitflips */
+		if (bitflips > chip->ecc.strength)
+			return -EBADMSG;
+
+		max_bitflips = max(max_bitflips, bitflips);
+	}
+
+	return max_bitflips;
+}
+
 static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
 			 u64 addr, unsigned int trans, u32 *buf, u8 *oob)
 {
@@ -1520,11 +1578,26 @@  static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
 	}
 
 	if (mtd_is_eccerr(err)) {
-		dev_dbg(ctrl->dev, "uncorrectable error at 0x%llx\n",
-			(unsigned long long)err_addr);
-		mtd->ecc_stats.failed++;
-		/* NAND layer expects zero on ECC errors */
-		return 0;
+		int ret;
+
+		ret = brcmstb_nand_verify_erased_page(mtd, chip, buf, addr);
+		if (ret < 0) {
+			dev_dbg(ctrl->dev, "uncorrectable error at 0x%llx\n",
+				(unsigned long long)err_addr);
+			mtd->ecc_stats.failed++;
+			/* NAND layer expects zero on ECC errors */
+			return 0;
+		} else {
+			if (buf)
+				memset(buf, 0xff, FC_BYTES * trans);
+			if (oob)
+				memset(oob, 0xff, mtd->oobsize);
+
+			dev_info(&host->pdev->dev,
+					"corrected %d bitflips in blank page at 0x%llx\n",
+					ret, (unsigned long long)addr);
+			return ret;
+		}
 	}
 
 	if (mtd_is_bitflip(err)) {