diff mbox

fsl_ifc_nand: are blank pages protected by ECC?

Message ID 20170421133721.GA15332@amd
State Not Applicable
Delegated to: Boris Brezillon
Headers show

Commit Message

Pavel Machek April 21, 2017, 1:37 p.m. UTC
Hi!

> (Added driver author to the cc list, maybe he can help).

> > > UBIFS complains:
> > > 
> > > UBIFS error (pid 931): ubifs_scan: corrupt empty space at LEB 282:252630
> > > UBIFS error (pid 931): ubifs_scanned_corruption: corruption at LEB 282:252630
> > > UBIFS error (pid 931): ubifs_scanned_corruption: first 1322 bytes from LEB 282:252630
> > > UBIFS error (pid 931): ubifs_scan: LEB 282 scanning failed
...
> > > is_blank() checks for all 0xff's, so single-bit 0xfe in the data will
> > > result in_blank() == 0 and uncorrectable error being signaled.
> > > 
> > > Should the driver be modified somehow?
> > 
> > Yep, nand_check_erased_ecc_chunk() [1] is here to help you check this
> > case, unfortunately, it's not directly applicable here, because this

Maybe I figured it out. Unfortunately, it is only compile tested. Does
it look approximately right?

Thanks,
								Pavel

Comments

Boris Brezillon April 21, 2017, 1:49 p.m. UTC | #1
On Fri, 21 Apr 2017 15:37:21 +0200
Pavel Machek <pavel@ucw.cz> wrote:

> Hi!
> 
> > (Added driver author to the cc list, maybe he can help).  
> 
> > > > UBIFS complains:
> > > > 
> > > > UBIFS error (pid 931): ubifs_scan: corrupt empty space at LEB 282:252630
> > > > UBIFS error (pid 931): ubifs_scanned_corruption: corruption at LEB 282:252630
> > > > UBIFS error (pid 931): ubifs_scanned_corruption: first 1322 bytes from LEB 282:252630
> > > > UBIFS error (pid 931): ubifs_scan: LEB 282 scanning failed  
> ...
> > > > is_blank() checks for all 0xff's, so single-bit 0xfe in the data will
> > > > result in_blank() == 0 and uncorrectable error being signaled.
> > > > 
> > > > Should the driver be modified somehow?  
> > > 
> > > Yep, nand_check_erased_ecc_chunk() [1] is here to help you check this
> > > case, unfortunately, it's not directly applicable here, because this  
> 
> Maybe I figured it out. Unfortunately, it is only compile tested. Does
> it look approximately right?

Yep that's definitely better. Just one thing missing (see below),
otherwise it looks good.

> 
> Thanks,
> 								Pavel
> 
> --- a/drivers/mtd/nand/fsl_ifc_nand.c
> +++ b/drivers/mtd/nand/fsl_ifc_nand.c
> @@ -171,32 +171,6 @@ static void set_addr(struct mtd_info *mtd, int column, int page_addr, int oob)
>  		ifc_nand_ctrl->index += mtd->writesize;
>  }
>  
> -static int is_blank(struct mtd_info *mtd, unsigned int bufnum)
> -{
> -	struct nand_chip *chip = mtd_to_nand(mtd);
> -	struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
> -	u8 __iomem *addr = priv->vbase + bufnum * (mtd->writesize * 2);
> -	u32 __iomem *mainarea = (u32 __iomem *)addr;
> -	u8 __iomem *oob = addr + mtd->writesize;
> -	struct mtd_oob_region oobregion = { };
> -	int i, section = 0;
> -
> -	i = nand_check_erased_buf(&mainarea[i], mtd->writesize, 0);
> -	if (i)
> -		return 0;
> -
> -	mtd_ooblayout_ecc(mtd, section++, &oobregion);
> -	while (oobregion.length) {
> -		i = nand_check_erased_buf(&oob[oobregion.offset], oobregion.length, 0);
> -		if (i)
> -			return 0;
> -
> -		mtd_ooblayout_ecc(mtd, section++, &oobregion);
> -	}
> -
> -	return 1;
> -}
> -
>  /* returns nonzero if entire page is blank */
>  static int check_read_ecc(struct mtd_info *mtd, struct fsl_ifc_ctrl *ctrl,
>  			  u32 *eccstat, unsigned int bufnum)
> @@ -272,16 +246,14 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
>  			if (errors == 15) {
>  				/*
>  				 * Uncorrectable error.
> -				 * OK only if the whole page is blank.
> +				 * We'll check for blank pages later.
>  				 *
>  				 * We disable ECCER reporting due to...
>  				 * erratum IFC-A002770 -- so report it now if we
>  				 * see an uncorrectable error in ECCSTAT.
>  				 */
> -				if (!is_blank(mtd, bufnum))
> -					ctrl->nand_stat |=
> -						IFC_NAND_EVTER_STAT_ECCER;
> -				break;
> +				ctrl->nand_stat |= IFC_NAND_EVTER_STAT_ECCER;
> +				printk("NAND flash read: error but continuing.\n");
>  			}
>  
>  			mtd->ecc_stats.corrected += errors;
> @@ -676,6 +648,40 @@ static int fsl_ifc_wait(struct mtd_info *mtd, struct nand_chip *chip)
>  	return nand_fsr | NAND_STATUS_WP;
>  }
>  
> +/*
> + * The controller does not check for bitflips in erased pages,
> + * therefore software must check instead.
> + */
> +static int check_erased_page(struct nand_chip *chip, u8 *buf)
> +{
> +	struct mtd_info *mtd = nand_to_mtd(chip);
> +	u8 *ecc = chip->oob_poi;
> +	const int ecc_size = chip->ecc.bytes;
> +	const int pkt_size = chip->ecc.size;
> +	int i, res, bitflips = 0;
> +	struct mtd_oob_region oobregion = { };
> +	
> +	mtd_ooblayout_ecc(mtd, 0, &oobregion);
> +	ecc += oobregion.offset;
> +
> +	for (i = 0; i < chip->ecc.steps; ++i) {
> +		res = nand_check_erased_ecc_chunk(buf, pkt_size, ecc, ecc_size,
> +						  NULL, 0,
> +						  chip->ecc.strength);
> +		if (res < 0)
> +			mtd->ecc_stats.failed++;

		else
			mtd->ecc_stats.corrected += res;


> +
> +		bitflips = max(res, bitflips);
> +		buf += pkt_size;
> +		ecc += ecc_size;
> +	}
> +
> +	mtd_ooblayout_ecc(mtd, 1, &oobregion);
> +	BUG_ON(oobregion.length);

Probably something you should check at registration time only.

> +
> +	return bitflips;
> +}
> +
>  static int fsl_ifc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
>  			     uint8_t *buf, int oob_required, int page)
>  {
> @@ -687,11 +693,23 @@ static int fsl_ifc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
>  	if (oob_required)
>  		fsl_ifc_read_buf(mtd, chip->oob_poi, mtd->oobsize);
>  
> -	if (ctrl->nand_stat & IFC_NAND_EVTER_STAT_ECCER)
> -		dev_err(priv->dev, "NAND Flash ECC Uncorrectable Error\n");
> -
>  	if (ctrl->nand_stat != IFC_NAND_EVTER_STAT_OPC)
>  		mtd->ecc_stats.failed++;
> +	
> +	if (ctrl->nand_stat & IFC_NAND_EVTER_STAT_ECCER) {
> +		int res;
> +
> +		if (!oob_required)
> +			fsl_ifc_read_buf(mtd, chip->oob_poi, mtd->oobsize);
> +
> +		printk("NAND flash: have error (%d)\n", nctrl->max_bitflips);
> +		res = check_erased_page(chip, buf);
> +		printk("NAND flash: but erased page says %d\n", res);
> +
> +		dev_err(priv->dev, "NAND Flash ECC Uncorrectable Error or erased page\n");
> +		return res;
> +	}
> +
>  
>  	return nctrl->max_bitflips;
>  }
>
Pavel Machek April 22, 2017, 7:01 a.m. UTC | #2
Hi!

> > Maybe I figured it out. Unfortunately, it is only compile tested. Does
> > it look approximately right?
> 
> Yep that's definitely better. Just one thing missing (see below),
> otherwise it looks good.

Thanks for review. Unfortunately it is still untested, so...

> > +		if (res < 0)
> > +			mtd->ecc_stats.failed++;
> 
> 		else
> 			mtd->ecc_stats.corrected += res;

Ok, I copied this from tango_nand. I'll submit a patch to fix it
there...

> > +
> > +		bitflips = max(res, bitflips);
> > +		buf += pkt_size;
> > +		ecc += ecc_size;
> > +	}
> > +
> > +	mtd_ooblayout_ecc(mtd, 1, &oobregion);
> > +	BUG_ON(oobregion.length);
> 
> Probably something you should check at registration time only.

Drive defensively, buy a tank ;-). Ok, I'll delete this one in final
version.
									Pavel
Pavel Machek April 23, 2017, 9:58 a.m. UTC | #3
Hi!

> > Maybe I figured it out. Unfortunately, it is only compile tested. Does
> > it look approximately right?
> 
> Yep that's definitely better. Just one thing missing (see below),
> otherwise it looks good.

I'm copying from tango_nand, therefore I had to check tango_nand, too.

static int check_erased_page(struct nand_chip *chip, u8 *buf)
{
...
                res = nand_check_erased_ecc_chunk(buf, pkt_size, ecc, ecc_size,
                                                  meta, meta_len,
                                                  chip->ecc.strength);
                if (res < 0)
                        mtd->ecc_stats.failed++;
                else
                        mtd->ecc_stats.corrected += res;

                bitflips = max(res, bitflips);
...
        return bitflips;
}

static int tango_read_page(struct mtd_info *mtd, struct nand_chip *chip,
                           u8 *buf, int oob_required, int page)
{
...
        res = decode_error_report(nfc);
        if (res < 0) {
                chip->ecc.read_oob_raw(mtd, chip, page);
                res = check_erased_page(chip, buf);
        }

        return res;
}


So nand_check_erased_ecc_chunk() returns < 0 (failed ECC), but then we
perform max() with bitflips (lets say 1, correctable ECC) and return
1? tango_read_page then returns 1 (correctable ECC) forgetting about
failed ECC...?

Best regards,
								Pavel
Boris Brezillon April 24, 2017, 7:12 a.m. UTC | #4
On Sun, 23 Apr 2017 11:58:45 +0200
Pavel Machek <pavel@ucw.cz> wrote:

> Hi!
> 
> > > Maybe I figured it out. Unfortunately, it is only compile tested. Does
> > > it look approximately right?  
> > 
> > Yep that's definitely better. Just one thing missing (see below),
> > otherwise it looks good.  
> 
> I'm copying from tango_nand, therefore I had to check tango_nand, too.
> 
> static int check_erased_page(struct nand_chip *chip, u8 *buf)
> {
> ...
>                 res = nand_check_erased_ecc_chunk(buf, pkt_size, ecc, ecc_size,
>                                                   meta, meta_len,
>                                                   chip->ecc.strength);
>                 if (res < 0)
>                         mtd->ecc_stats.failed++;
>                 else
>                         mtd->ecc_stats.corrected += res;
> 
>                 bitflips = max(res, bitflips);
> ...
>         return bitflips;
> }
> 
> static int tango_read_page(struct mtd_info *mtd, struct nand_chip *chip,
>                            u8 *buf, int oob_required, int page)
> {
> ...
>         res = decode_error_report(nfc);
>         if (res < 0) {
>                 chip->ecc.read_oob_raw(mtd, chip, page);
>                 res = check_erased_page(chip, buf);
>         }
> 
>         return res;
> }
> 
> 
> So nand_check_erased_ecc_chunk() returns < 0 (failed ECC), but then we
> perform max() with bitflips (lets say 1, correctable ECC) and return
> 1? tango_read_page then returns 1 (correctable ECC) forgetting about
> failed ECC...?

Yep, that's expected, see what's done in the core to detect
uncorrectable errors and return EBADMSG when appropriate [1].

[1]http://lxr.free-electrons.com/source/drivers/mtd/nand/nand_base.c#L2033
diff mbox

Patch

--- a/drivers/mtd/nand/fsl_ifc_nand.c
+++ b/drivers/mtd/nand/fsl_ifc_nand.c
@@ -171,32 +171,6 @@  static void set_addr(struct mtd_info *mtd, int column, int page_addr, int oob)
 		ifc_nand_ctrl->index += mtd->writesize;
 }
 
-static int is_blank(struct mtd_info *mtd, unsigned int bufnum)
-{
-	struct nand_chip *chip = mtd_to_nand(mtd);
-	struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
-	u8 __iomem *addr = priv->vbase + bufnum * (mtd->writesize * 2);
-	u32 __iomem *mainarea = (u32 __iomem *)addr;
-	u8 __iomem *oob = addr + mtd->writesize;
-	struct mtd_oob_region oobregion = { };
-	int i, section = 0;
-
-	i = nand_check_erased_buf(&mainarea[i], mtd->writesize, 0);
-	if (i)
-		return 0;
-
-	mtd_ooblayout_ecc(mtd, section++, &oobregion);
-	while (oobregion.length) {
-		i = nand_check_erased_buf(&oob[oobregion.offset], oobregion.length, 0);
-		if (i)
-			return 0;
-
-		mtd_ooblayout_ecc(mtd, section++, &oobregion);
-	}
-
-	return 1;
-}
-
 /* returns nonzero if entire page is blank */
 static int check_read_ecc(struct mtd_info *mtd, struct fsl_ifc_ctrl *ctrl,
 			  u32 *eccstat, unsigned int bufnum)
@@ -272,16 +246,14 @@  static void fsl_ifc_run_command(struct mtd_info *mtd)
 			if (errors == 15) {
 				/*
 				 * Uncorrectable error.
-				 * OK only if the whole page is blank.
+				 * We'll check for blank pages later.
 				 *
 				 * We disable ECCER reporting due to...
 				 * erratum IFC-A002770 -- so report it now if we
 				 * see an uncorrectable error in ECCSTAT.
 				 */
-				if (!is_blank(mtd, bufnum))
-					ctrl->nand_stat |=
-						IFC_NAND_EVTER_STAT_ECCER;
-				break;
+				ctrl->nand_stat |= IFC_NAND_EVTER_STAT_ECCER;
+				printk("NAND flash read: error but continuing.\n");
 			}
 
 			mtd->ecc_stats.corrected += errors;
@@ -676,6 +648,40 @@  static int fsl_ifc_wait(struct mtd_info *mtd, struct nand_chip *chip)
 	return nand_fsr | NAND_STATUS_WP;
 }
 
+/*
+ * The controller does not check for bitflips in erased pages,
+ * therefore software must check instead.
+ */
+static int check_erased_page(struct nand_chip *chip, u8 *buf)
+{
+	struct mtd_info *mtd = nand_to_mtd(chip);
+	u8 *ecc = chip->oob_poi;
+	const int ecc_size = chip->ecc.bytes;
+	const int pkt_size = chip->ecc.size;
+	int i, res, bitflips = 0;
+	struct mtd_oob_region oobregion = { };
+	
+	mtd_ooblayout_ecc(mtd, 0, &oobregion);
+	ecc += oobregion.offset;
+
+	for (i = 0; i < chip->ecc.steps; ++i) {
+		res = nand_check_erased_ecc_chunk(buf, pkt_size, ecc, ecc_size,
+						  NULL, 0,
+						  chip->ecc.strength);
+		if (res < 0)
+			mtd->ecc_stats.failed++;
+
+		bitflips = max(res, bitflips);
+		buf += pkt_size;
+		ecc += ecc_size;
+	}
+
+	mtd_ooblayout_ecc(mtd, 1, &oobregion);
+	BUG_ON(oobregion.length);
+
+	return bitflips;
+}
+
 static int fsl_ifc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
 			     uint8_t *buf, int oob_required, int page)
 {
@@ -687,11 +693,23 @@  static int fsl_ifc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
 	if (oob_required)
 		fsl_ifc_read_buf(mtd, chip->oob_poi, mtd->oobsize);
 
-	if (ctrl->nand_stat & IFC_NAND_EVTER_STAT_ECCER)
-		dev_err(priv->dev, "NAND Flash ECC Uncorrectable Error\n");
-
 	if (ctrl->nand_stat != IFC_NAND_EVTER_STAT_OPC)
 		mtd->ecc_stats.failed++;
+	
+	if (ctrl->nand_stat & IFC_NAND_EVTER_STAT_ECCER) {
+		int res;
+
+		if (!oob_required)
+			fsl_ifc_read_buf(mtd, chip->oob_poi, mtd->oobsize);
+
+		printk("NAND flash: have error (%d)\n", nctrl->max_bitflips);
+		res = check_erased_page(chip, buf);
+		printk("NAND flash: but erased page says %d\n", res);
+
+		dev_err(priv->dev, "NAND Flash ECC Uncorrectable Error or erased page\n");
+		return res;
+	}
+
 
 	return nctrl->max_bitflips;
 }