diff mbox

[15/39] mtd: nand: denali: improve readability of handle_ecc()

Message ID 1480183585-592-16-git-send-email-yamada.masahiro@socionext.com
State Deferred
Headers show

Commit Message

Masahiro Yamada Nov. 26, 2016, 6:06 p.m. UTC
This function is unreadable due to the deep nesting.  Note this
function does a job only when INTR_STATUS__ECC_ERR is set.
So, if the flag is not set, let it bail-out.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---

 drivers/mtd/nand/denali.c | 119 +++++++++++++++++++++++-----------------------
 1 file changed, 59 insertions(+), 60 deletions(-)

Comments

Boris Brezillon Nov. 27, 2016, 3:34 p.m. UTC | #1
On Sun, 27 Nov 2016 03:06:01 +0900
Masahiro Yamada <yamada.masahiro@socionext.com> wrote:

> This function is unreadable due to the deep nesting.  Note this
> function does a job only when INTR_STATUS__ECC_ERR is set.
> So, if the flag is not set, let it bail-out.
> 
> Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
> ---
> 
>  drivers/mtd/nand/denali.c | 119 +++++++++++++++++++++++-----------------------
>  1 file changed, 59 insertions(+), 60 deletions(-)
> 
> diff --git a/drivers/mtd/nand/denali.c b/drivers/mtd/nand/denali.c
> index a7dc692..b577560 100644
> --- a/drivers/mtd/nand/denali.c
> +++ b/drivers/mtd/nand/denali.c
> @@ -908,69 +908,68 @@ static bool handle_ecc(struct denali_nand_info *denali, u8 *buf,
>  {
>  	bool check_erased_page = false;
>  	unsigned int bitflips = 0;
> +	u32 err_address, err_correction_info, err_byte, err_sector, err_device,
> +	    err_correction_value;

Please use single line definitions for local variables:

	u32 err_address, err_correction_info, err_byte, err_sector;
	u32 err_device, err_correction_value;

Also, maybe you should use shorter names to avoid 80 chars wrapping as
much as possible.

>  
> -	if (irq_status & INTR_STATUS__ECC_ERR) {
> -		/* read the ECC errors. we'll ignore them for now */
> -		u32 err_address, err_correction_info, err_byte,
> -			err_sector, err_device, err_correction_value;
> -		denali_set_intr_modes(denali, false);
> -
> -		do {
> -			err_address = ioread32(denali->flash_reg +
> -						ECC_ERROR_ADDRESS);
> -			err_sector = ECC_SECTOR(err_address);
> -			err_byte = ECC_BYTE(err_address);
> -
> -			err_correction_info = ioread32(denali->flash_reg +
> -						ERR_CORRECTION_INFO);
> -			err_correction_value =
> +	if (!(irq_status & INTR_STATUS__ECC_ERR))
> +		goto out;
> +
> +	/* read the ECC errors. we'll ignore them for now */
> +	denali_set_intr_modes(denali, false);
> +
> +	do {
> +		err_address = ioread32(denali->flash_reg + ECC_ERROR_ADDRESS);
> +		err_sector = ECC_SECTOR(err_address);
> +		err_byte = ECC_BYTE(err_address);
> +
> +		err_correction_info = ioread32(denali->flash_reg +
> +					       ERR_CORRECTION_INFO);
> +		err_correction_value =
>  				ECC_CORRECTION_VALUE(err_correction_info);
> -			err_device = ECC_ERR_DEVICE(err_correction_info);
> -
> -			if (ECC_ERROR_CORRECTABLE(err_correction_info)) {
> -				/*
> -				 * If err_byte is larger than ECC_SECTOR_SIZE,
> -				 * means error happened in OOB, so we ignore
> -				 * it. It's no need for us to correct it
> -				 * err_device is represented the NAND error
> -				 * bits are happened in if there are more
> -				 * than one NAND connected.
> -				 */
> -				if (err_byte < ECC_SECTOR_SIZE) {
> -					struct mtd_info *mtd =
> -						nand_to_mtd(&denali->nand);
> -					int offset;
> -
> -					offset = (err_sector *
> -							ECC_SECTOR_SIZE +
> -							err_byte) *
> -							denali->devnum +
> -							err_device;
> -					/* correct the ECC error */
> -					buf[offset] ^= err_correction_value;
> -					mtd->ecc_stats.corrected++;
> -					bitflips++;
> -				}
> -			} else {
> -				/*
> -				 * if the error is not correctable, need to
> -				 * look at the page to see if it is an erased
> -				 * page. if so, then it's not a real ECC error
> -				 */
> -				check_erased_page = true;
> +		err_device = ECC_ERR_DEVICE(err_correction_info);
> +
> +		if (ECC_ERROR_CORRECTABLE(err_correction_info)) {
> +			/*
> +			 * If err_byte is larger than ECC_SECTOR_SIZE, means error
> +			 * happened in OOB, so we ignore it. It's no need for
> +			 * us to correct it err_device is represented the NAND
> +			 * error bits are happened in if there are more than
> +			 * one NAND connected.
> +			 */
> +			if (err_byte < ECC_SECTOR_SIZE) {
> +				struct mtd_info *mtd =
> +					nand_to_mtd(&denali->nand);
> +				int offset;
> +
> +				offset = (err_sector * ECC_SECTOR_SIZE + err_byte) *
> +					denali->devnum + err_device;
> +				/* correct the ECC error */
> +				buf[offset] ^= err_correction_value;
> +				mtd->ecc_stats.corrected++;
> +				bitflips++;
>  			}
> -		} while (!ECC_LAST_ERR(err_correction_info));
> -		/*
> -		 * Once handle all ecc errors, controller will triger
> -		 * a ECC_TRANSACTION_DONE interrupt, so here just wait
> -		 * for a while for this interrupt
> -		 */
> -		while (!(read_interrupt_status(denali) &
> -				INTR_STATUS__ECC_TRANSACTION_DONE))
> -			cpu_relax();
> -		clear_interrupts(denali);
> -		denali_set_intr_modes(denali, true);
> -	}
> +		} else {
> +			/*
> +			 * if the error is not correctable, need to look at the
> +			 * page to see if it is an erased page. if so, then
> +			 * it's not a real ECC error
> +			 */
> +			check_erased_page = true;
> +		}
> +	} while (!ECC_LAST_ERR(err_correction_info));
> +
> +	/*
> +	 * Once handle all ecc errors, controller will triger a
> +	 * ECC_TRANSACTION_DONE interrupt, so here just wait for
> +	 * a while for this interrupt
> +	 */
> +	while (!(read_interrupt_status(denali) &
> +		 INTR_STATUS__ECC_TRANSACTION_DONE))
> +		cpu_relax();
> +	clear_interrupts(denali);
> +	denali_set_intr_modes(denali, true);
> +
> + out:
>  	*max_bitflips = bitflips;
>  	return check_erased_page;
>  }
Boris Brezillon Nov. 27, 2016, 3:42 p.m. UTC | #2
On Sun, 27 Nov 2016 03:06:01 +0900
Masahiro Yamada <yamada.masahiro@socionext.com> wrote:

> This function is unreadable due to the deep nesting.  Note this
> function does a job only when INTR_STATUS__ECC_ERR is set.
> So, if the flag is not set, let it bail-out.
> 
> Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
> ---
> 
>  drivers/mtd/nand/denali.c | 119 +++++++++++++++++++++++-----------------------
>  1 file changed, 59 insertions(+), 60 deletions(-)
> 
> diff --git a/drivers/mtd/nand/denali.c b/drivers/mtd/nand/denali.c
> index a7dc692..b577560 100644
> --- a/drivers/mtd/nand/denali.c
> +++ b/drivers/mtd/nand/denali.c
> @@ -908,69 +908,68 @@ static bool handle_ecc(struct denali_nand_info *denali, u8 *buf,
>  {
>  	bool check_erased_page = false;
>  	unsigned int bitflips = 0;
> +	u32 err_address, err_correction_info, err_byte, err_sector, err_device,
> +	    err_correction_value;
>  
> -	if (irq_status & INTR_STATUS__ECC_ERR) {
> -		/* read the ECC errors. we'll ignore them for now */
> -		u32 err_address, err_correction_info, err_byte,
> -			err_sector, err_device, err_correction_value;
> -		denali_set_intr_modes(denali, false);
> -
> -		do {
> -			err_address = ioread32(denali->flash_reg +
> -						ECC_ERROR_ADDRESS);
> -			err_sector = ECC_SECTOR(err_address);
> -			err_byte = ECC_BYTE(err_address);
> -
> -			err_correction_info = ioread32(denali->flash_reg +
> -						ERR_CORRECTION_INFO);
> -			err_correction_value =
> +	if (!(irq_status & INTR_STATUS__ECC_ERR))
> +		goto out;
> +
> +	/* read the ECC errors. we'll ignore them for now */
> +	denali_set_intr_modes(denali, false);
> +
> +	do {
> +		err_address = ioread32(denali->flash_reg + ECC_ERROR_ADDRESS);
> +		err_sector = ECC_SECTOR(err_address);
> +		err_byte = ECC_BYTE(err_address);
> +
> +		err_correction_info = ioread32(denali->flash_reg +
> +					       ERR_CORRECTION_INFO);
> +		err_correction_value =
>  				ECC_CORRECTION_VALUE(err_correction_info);
> -			err_device = ECC_ERR_DEVICE(err_correction_info);
> -
> -			if (ECC_ERROR_CORRECTABLE(err_correction_info)) {
> -				/*
> -				 * If err_byte is larger than ECC_SECTOR_SIZE,
> -				 * means error happened in OOB, so we ignore
> -				 * it. It's no need for us to correct it
> -				 * err_device is represented the NAND error
> -				 * bits are happened in if there are more
> -				 * than one NAND connected.
> -				 */
> -				if (err_byte < ECC_SECTOR_SIZE) {
> -					struct mtd_info *mtd =
> -						nand_to_mtd(&denali->nand);
> -					int offset;
> -
> -					offset = (err_sector *
> -							ECC_SECTOR_SIZE +
> -							err_byte) *
> -							denali->devnum +
> -							err_device;
> -					/* correct the ECC error */
> -					buf[offset] ^= err_correction_value;
> -					mtd->ecc_stats.corrected++;
> -					bitflips++;
> -				}
> -			} else {
> -				/*
> -				 * if the error is not correctable, need to
> -				 * look at the page to see if it is an erased
> -				 * page. if so, then it's not a real ECC error
> -				 */
> -				check_erased_page = true;
> +		err_device = ECC_ERR_DEVICE(err_correction_info);
> +
> +		if (ECC_ERROR_CORRECTABLE(err_correction_info)) {
> +			/*
> +			 * If err_byte is larger than ECC_SECTOR_SIZE, means error
> +			 * happened in OOB, so we ignore it. It's no need for
> +			 * us to correct it err_device is represented the NAND
> +			 * error bits are happened in if there are more than
> +			 * one NAND connected.
> +			 */
> +			if (err_byte < ECC_SECTOR_SIZE) {
> +				struct mtd_info *mtd =
> +					nand_to_mtd(&denali->nand);
> +				int offset;
> +
> +				offset = (err_sector * ECC_SECTOR_SIZE + err_byte) *
> +					denali->devnum + err_device;
> +				/* correct the ECC error */
> +				buf[offset] ^= err_correction_value;
> +				mtd->ecc_stats.corrected++;
> +				bitflips++;

Hm, bitflips is what is set in max_bitflips, and apparently the
implementation (which is not yours) is not doing what the core expects.

You should first count bitflips per sector with something like that:

				bitflips[err_sector]++;


And then once you've iterated over all errors do:

	for (i = 0; i < nsectors; i++)
		max_bitflips = max(bitflips[err_sector], max_bitflips);

>  			}
> -		} while (!ECC_LAST_ERR(err_correction_info));
> -		/*
> -		 * Once handle all ecc errors, controller will triger
> -		 * a ECC_TRANSACTION_DONE interrupt, so here just wait
> -		 * for a while for this interrupt
> -		 */
> -		while (!(read_interrupt_status(denali) &
> -				INTR_STATUS__ECC_TRANSACTION_DONE))
> -			cpu_relax();
> -		clear_interrupts(denali);
> -		denali_set_intr_modes(denali, true);
> -	}
> +		} else {
> +			/*
> +			 * if the error is not correctable, need to look at the
> +			 * page to see if it is an erased page. if so, then
> +			 * it's not a real ECC error
> +			 */
> +			check_erased_page = true;
> +		}
> +	} while (!ECC_LAST_ERR(err_correction_info));
> +
> +	/*
> +	 * Once handle all ecc errors, controller will triger a
> +	 * ECC_TRANSACTION_DONE interrupt, so here just wait for
> +	 * a while for this interrupt
> +	 */
> +	while (!(read_interrupt_status(denali) &
> +		 INTR_STATUS__ECC_TRANSACTION_DONE))
> +		cpu_relax();
> +	clear_interrupts(denali);
> +	denali_set_intr_modes(denali, true);
> +
> + out:
>  	*max_bitflips = bitflips;
>  	return check_erased_page;
>  }
Masahiro Yamada Dec. 2, 2016, 4:26 a.m. UTC | #3
Hi Boris,


2016-11-28 0:42 GMT+09:00 Boris Brezillon <boris.brezillon@free-electrons.com>:
>> +                     if (err_byte < ECC_SECTOR_SIZE) {
>> +                             struct mtd_info *mtd =
>> +                                     nand_to_mtd(&denali->nand);
>> +                             int offset;
>> +
>> +                             offset = (err_sector * ECC_SECTOR_SIZE + err_byte) *
>> +                                     denali->devnum + err_device;
>> +                             /* correct the ECC error */
>> +                             buf[offset] ^= err_correction_value;
>> +                             mtd->ecc_stats.corrected++;
>> +                             bitflips++;
>
> Hm, bitflips is what is set in max_bitflips, and apparently the
> implementation (which is not yours) is not doing what the core expects.
>
> You should first count bitflips per sector with something like that:
>
>                                 bitflips[err_sector]++;
>
>
> And then once you've iterated over all errors do:
>
>         for (i = 0; i < nsectors; i++)
>                 max_bitflips = max(bitflips[err_sector], max_bitflips);


I see.

For soft ECC fixup, we can calculate bitflips
for each ECC sector, so I can fix the max_bitflips
as the core framework expects.

For hard ECC fixup, the register only reports
the number of corrected bit-flips
in the whole page (sum from all ECC sectors).
We cannot calculate max_bitflips, I think.



BTW, I noticed another problem of the current code.

      buf[offset] ^= err_correction_value;
      mtd->ecc_stats.corrected++;
      bitflips++;

This code is counting the number of corrected bytes,
not the number of corrected bits.


I think multiple bit-flips within one byte can happen.


Perhaps, we should add

  hweight8(buf[offset] ^ err_correction_value)

to ecc_stats.corrected and bitflips.
Boris Brezillon Dec. 2, 2016, 7:55 a.m. UTC | #4
On Fri, 2 Dec 2016 13:26:27 +0900
Masahiro Yamada <yamada.masahiro@socionext.com> wrote:

> Hi Boris,
> 
> 
> 2016-11-28 0:42 GMT+09:00 Boris Brezillon <boris.brezillon@free-electrons.com>:
> >> +                     if (err_byte < ECC_SECTOR_SIZE) {
> >> +                             struct mtd_info *mtd =
> >> +                                     nand_to_mtd(&denali->nand);
> >> +                             int offset;
> >> +
> >> +                             offset = (err_sector * ECC_SECTOR_SIZE + err_byte) *
> >> +                                     denali->devnum + err_device;
> >> +                             /* correct the ECC error */
> >> +                             buf[offset] ^= err_correction_value;
> >> +                             mtd->ecc_stats.corrected++;
> >> +                             bitflips++;  
> >
> > Hm, bitflips is what is set in max_bitflips, and apparently the
> > implementation (which is not yours) is not doing what the core expects.
> >
> > You should first count bitflips per sector with something like that:
> >
> >                                 bitflips[err_sector]++;
> >
> >
> > And then once you've iterated over all errors do:
> >
> >         for (i = 0; i < nsectors; i++)
> >                 max_bitflips = max(bitflips[err_sector], max_bitflips);  
> 
> 
> I see.
> 
> For soft ECC fixup, we can calculate bitflips
> for each ECC sector, so I can fix the max_bitflips
> as the core framework expects.
> 
> For hard ECC fixup, the register only reports
> the number of corrected bit-flips
> in the whole page (sum from all ECC sectors).
> We cannot calculate max_bitflips, I think.
> 

That's unfortunate. This means you'll return -EUCLEAN more quickly
(which will trigger UBI eraseblock move), since the NAND framework is
basing its 'too many bitflips' detection logic on the max_bitflips per
ECC chunk and the bitflips threshold (by default 3/4 of the ECC
strength).

That doesn't mean it won't work, you'll just wear your NAND more
quickly :-(.

ITOH, doing max_bitflips = nbitflips / nsteps is not good either,
because the bitflips might be all concentrated in the same ECC chunk,
and in this case you really want to return -EUCLEAN.

> 
> 
> BTW, I noticed another problem of the current code.
> 
>       buf[offset] ^= err_correction_value;
>       mtd->ecc_stats.corrected++;
>       bitflips++;
> 
> This code is counting the number of corrected bytes,
> not the number of corrected bits.
> 
> 
> I think multiple bit-flips within one byte can happen.

Yes.

> 
> 
> Perhaps, we should add
> 
>   hweight8(buf[offset] ^ err_correction_value)
> 
> to ecc_stats.corrected and bitflips.
> 

Looks good.
diff mbox

Patch

diff --git a/drivers/mtd/nand/denali.c b/drivers/mtd/nand/denali.c
index a7dc692..b577560 100644
--- a/drivers/mtd/nand/denali.c
+++ b/drivers/mtd/nand/denali.c
@@ -908,69 +908,68 @@  static bool handle_ecc(struct denali_nand_info *denali, u8 *buf,
 {
 	bool check_erased_page = false;
 	unsigned int bitflips = 0;
+	u32 err_address, err_correction_info, err_byte, err_sector, err_device,
+	    err_correction_value;
 
-	if (irq_status & INTR_STATUS__ECC_ERR) {
-		/* read the ECC errors. we'll ignore them for now */
-		u32 err_address, err_correction_info, err_byte,
-			err_sector, err_device, err_correction_value;
-		denali_set_intr_modes(denali, false);
-
-		do {
-			err_address = ioread32(denali->flash_reg +
-						ECC_ERROR_ADDRESS);
-			err_sector = ECC_SECTOR(err_address);
-			err_byte = ECC_BYTE(err_address);
-
-			err_correction_info = ioread32(denali->flash_reg +
-						ERR_CORRECTION_INFO);
-			err_correction_value =
+	if (!(irq_status & INTR_STATUS__ECC_ERR))
+		goto out;
+
+	/* read the ECC errors. we'll ignore them for now */
+	denali_set_intr_modes(denali, false);
+
+	do {
+		err_address = ioread32(denali->flash_reg + ECC_ERROR_ADDRESS);
+		err_sector = ECC_SECTOR(err_address);
+		err_byte = ECC_BYTE(err_address);
+
+		err_correction_info = ioread32(denali->flash_reg +
+					       ERR_CORRECTION_INFO);
+		err_correction_value =
 				ECC_CORRECTION_VALUE(err_correction_info);
-			err_device = ECC_ERR_DEVICE(err_correction_info);
-
-			if (ECC_ERROR_CORRECTABLE(err_correction_info)) {
-				/*
-				 * If err_byte is larger than ECC_SECTOR_SIZE,
-				 * means error happened in OOB, so we ignore
-				 * it. It's no need for us to correct it
-				 * err_device is represented the NAND error
-				 * bits are happened in if there are more
-				 * than one NAND connected.
-				 */
-				if (err_byte < ECC_SECTOR_SIZE) {
-					struct mtd_info *mtd =
-						nand_to_mtd(&denali->nand);
-					int offset;
-
-					offset = (err_sector *
-							ECC_SECTOR_SIZE +
-							err_byte) *
-							denali->devnum +
-							err_device;
-					/* correct the ECC error */
-					buf[offset] ^= err_correction_value;
-					mtd->ecc_stats.corrected++;
-					bitflips++;
-				}
-			} else {
-				/*
-				 * if the error is not correctable, need to
-				 * look at the page to see if it is an erased
-				 * page. if so, then it's not a real ECC error
-				 */
-				check_erased_page = true;
+		err_device = ECC_ERR_DEVICE(err_correction_info);
+
+		if (ECC_ERROR_CORRECTABLE(err_correction_info)) {
+			/*
+			 * If err_byte is larger than ECC_SECTOR_SIZE, means error
+			 * happened in OOB, so we ignore it. It's no need for
+			 * us to correct it err_device is represented the NAND
+			 * error bits are happened in if there are more than
+			 * one NAND connected.
+			 */
+			if (err_byte < ECC_SECTOR_SIZE) {
+				struct mtd_info *mtd =
+					nand_to_mtd(&denali->nand);
+				int offset;
+
+				offset = (err_sector * ECC_SECTOR_SIZE + err_byte) *
+					denali->devnum + err_device;
+				/* correct the ECC error */
+				buf[offset] ^= err_correction_value;
+				mtd->ecc_stats.corrected++;
+				bitflips++;
 			}
-		} while (!ECC_LAST_ERR(err_correction_info));
-		/*
-		 * Once handle all ecc errors, controller will triger
-		 * a ECC_TRANSACTION_DONE interrupt, so here just wait
-		 * for a while for this interrupt
-		 */
-		while (!(read_interrupt_status(denali) &
-				INTR_STATUS__ECC_TRANSACTION_DONE))
-			cpu_relax();
-		clear_interrupts(denali);
-		denali_set_intr_modes(denali, true);
-	}
+		} else {
+			/*
+			 * if the error is not correctable, need to look at the
+			 * page to see if it is an erased page. if so, then
+			 * it's not a real ECC error
+			 */
+			check_erased_page = true;
+		}
+	} while (!ECC_LAST_ERR(err_correction_info));
+
+	/*
+	 * Once handle all ecc errors, controller will triger a
+	 * ECC_TRANSACTION_DONE interrupt, so here just wait for
+	 * a while for this interrupt
+	 */
+	while (!(read_interrupt_status(denali) &
+		 INTR_STATUS__ECC_TRANSACTION_DONE))
+		cpu_relax();
+	clear_interrupts(denali);
+	denali_set_intr_modes(denali, true);
+
+ out:
 	*max_bitflips = bitflips;
 	return check_erased_page;
 }