diff mbox series

[RESEND,V2,2/2] mtd: core: NAND filling block

Message ID BYAPR08MB4533401FB969CA9A8D254F34DB9C0@BYAPR08MB4533.namprd08.prod.outlook.com
State Changes Requested
Delegated to: Miquel Raynal
Headers show
Series mtd: core: NAND erase preparation | expand

Commit Message

Bean Huo Jan. 18, 2019, 10:12 p.m. UTC
On some legacy planar 2D Micron NAND devices when a
block erase command is issued, occasionally even
though a block erase operation successfully completes
and returns a pass status, the flash block may not be
completely erased. Subsequent operations to this block
on very rare cases can result in subtle failures or
corruption. These extremely rare cases should nevertheless
be considered.

These rare occurrences have been observed on partially
written blocks. Partially written blocks are not uncommon
with UBI/UBIFS.

To avoid this rare occurrence, we make sure that at least
15 pages have been programmed to a block before it is erased.
In case we find that less than 15 pages have been programmed,
additional pages are programmed in the block. The observation
is that additional pages rarely need to be written and most of
the time UBI/UBIFS erases blocks that contain more programmed
pages.

Signed-off-by: Bean Huo <beanhuo@micron.com>
Reviewed-by: ZOLTAN SZUBBOCSEV <zszubbocsev@micron.com>
---
 drivers/mtd/nand/raw/nand_micron.c | 119 +++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

Comments

Boris Brezillon Jan. 18, 2019, 10:39 p.m. UTC | #1
Subject prefix should be "mtd: rawnand: " not "mtd: core: "

On Fri, 18 Jan 2019 22:12:04 +0000
"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:

> On some legacy planar 2D Micron NAND devices when a
> block erase command is issued, occasionally even
> though a block erase operation successfully completes
> and returns a pass status, the flash block may not be
> completely erased. Subsequent operations to this block
> on very rare cases can result in subtle failures or
> corruption. These extremely rare cases should nevertheless
> be considered.
> 
> These rare occurrences have been observed on partially
> written blocks. Partially written blocks are not uncommon
> with UBI/UBIFS.
> 
> To avoid this rare occurrence, we make sure that at least
> 15 pages have been programmed to a block before it is erased.
> In case we find that less than 15 pages have been programmed,
> additional pages are programmed in the block. The observation
> is that additional pages rarely need to be written and most of
> the time UBI/UBIFS erases blocks that contain more programmed
> pages.
> 
> Signed-off-by: Bean Huo <beanhuo@micron.com>
> Reviewed-by: ZOLTAN SZUBBOCSEV <zszubbocsev@micron.com>
> ---
>  drivers/mtd/nand/raw/nand_micron.c | 119 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 119 insertions(+)
> 
> diff --git a/drivers/mtd/nand/raw/nand_micron.c b/drivers/mtd/nand/raw/nand_micron.c
> index b85e1c1..f52e072 100644
> --- a/drivers/mtd/nand/raw/nand_micron.c
> +++ b/drivers/mtd/nand/raw/nand_micron.c
> @@ -541,8 +541,127 @@ static void micron_fixup_onfi_param_page(struct nand_chip *chip,
>  		p->revision = cpu_to_le16(ONFI_VERSION_1_0);
>  }
>  
> +static int check_page_if_emtpy(struct nand_chip *chip, char *data)
> +{
> +	struct mtd_info *mtd = nand_to_mtd(chip);
> +	int ret, i;
> +	void *databuf, *eccbuf;
> +	int max_bitflips;
> +	struct mtd_oob_region oobregion;
> +
> +	mtd_ooblayout_ecc(mtd, 0, &oobregion);
> +	eccbuf = chip->oob_poi + oobregion.offset;
> +	databuf = data;
> +	max_bitflips = 0;
> +
> +	for (i = 0; i < chip->ecc.steps; i++) {
> +		ret = nand_check_erased_ecc_chunk(data,
> +						  chip->ecc.size,
> +						  eccbuf,
> +						  chip->ecc.bytes,
> +						  NULL, 0,
> +						  chip->ecc.strength);
> +		if (ret >= 0)
> +			max_bitflips = max(ret, max_bitflips);
> +		else
> +			return false;
> +
> +		databuf += chip->ecc.size;
> +		eccbuf += chip->ecc.bytes;
> +
> +	}
> +	/*
> +	 * As for the empty/erased page, bitflips number should be zero or
> +	 * at least less than the bitflip_threshold.
> +	 */
> +	if (max_bitflips > mtd->bitflip_threshold)
> +		return false;
> +	else
> +		return true;
> +}
> +
> +#define LAST_CHECKPU_PAGE 13
> +
> +static int before_erase_checkup(struct nand_chip *chip, int page)
> +{
> +	struct mtd_info *mtd = nand_to_mtd(chip);
> +	u8 *data_buf;
> +	int ret;
> +	uint32_t empty_page_mask = 0;
> +
> +	 /* Only for legacy planar 2D parallels NAND. */
> +	if ((mtd->type != MTD_NANDFLASH) || (chip->bits_per_cell != 1))
> +		return 0;

Are you implying that all your SLC chips are impacted by this issue,
really?

> +
> +	data_buf =  kmalloc(mtd->writesize, GFP_KERNEL);
> +	if (!data_buf)
> +		return -ENOMEM;
> +
> +	memset(data_buf, 0xFF, mtd->writesize);
> +	/*
> +	 * pgs[] contains pages need to be checked. We firstly check
> +	 * page13,because, for the most of cases, PEB being erased is
> +	 * not partially programmed. If page13 contains data, we directly
> +	 * return since it is not a partially programmed PEB.

So now you say the minimum amount of written pages is 14 not 15? In
your previous patch, the magic number was 15 (and the commit log
says 15 too). 

> Otherwise,
> +	 * then we check page0. And if page0 is programmed, and page13
> +	 * is not programmed, then we start to check from page11, page9,
> +	 * page7, page5, page3 respectively since the pages of PEB are
> +	 * programmed sequentially.

Looks overly complex for only a small gain. Did you try writing X (X
being 14 in this case) pages all the time? If you did, how does it
compare to this version (perf-wise). I suspect that reading pages
before potentially overwriting them will actually take more time than
blindly overwriting 14 pages with 0x00.

> We olny check odd page.

Why?!

> +	 */
> +
> +	char pgs[] = {13, 0, 11, 9, 7, 5, 3};
> +	int  c = ARRAY_SIZE(pgs) - 1;
> +	int i;
> +
> +	for (i = 0 ; i <=  c; i++) {
> +		ret = nand_read_page_raw(chip, data_buf, 1, page + pgs[i]);
> +		if (ret)
> +			continue; /* Read error, we just skip it now. */
> +		if (check_page_if_emtpy(chip, data_buf) == true) {
> +			if (pgs[i] == 0)
> +			/* page0 is not programmed. */
> +				goto free_buf;
> +
> +			/* Mark page need to program later */
> +			empty_page_mask |= (1 << pgs[i]);
> +		} else {
> +			if (pgs[i] == LAST_CHECKPU_PAGE)
> +			/*
> +			 * If page13 is programmed, this block has
> +			 * met the requirement of robust erase.
> +			 */
> +				goto free_buf;
> +		}
> +	}
> +	/* Corrupt page0 and page1, in order to simulate an
> +	 * uncompleted eraseing scenario. Just for case of
> +	 * power loss hits while below programming. in this
> +	 * way, the PEB will be re-erased again.

Looks like the "erase the first 2 pages" thing is here to corrupt
UBI's VID and EC headers. Is this really needed? We should definitely
not assume the NAND user is UBI, and that's exactly what you're doing
here.

> +	 */
> +	empty_page_mask |= 0x3;
> +	memset(data_buf, 0xAA,  mtd->writesize);
> +
> +	for (i = 0; i <= LAST_CHECKPU_PAGE; ) {
> +
> +	/* Start to program empty page, we only program the odd page. */
> +		if (empty_page_mask & (1 << i))
> +			nand_write_page_raw(chip, data_buf, 1, page + i);
> +
> +
> +		if (i == 0)
> +			i = 1;
> +		else
> +			i += 2;
> +	}
> +
> +free_buf:
> +	kfree(data_buf);
> +	return 0;
> +}
> +
>  const struct nand_manufacturer_ops micron_nand_manuf_ops = {
>  	.init = micron_nand_init,
>  	.cleanup = micron_nand_cleanup,
>  	.fixup_onfi_param_page = micron_fixup_onfi_param_page,
> +	.erase_pre = before_erase_checkup,
>  };
Boris Brezillon Jan. 18, 2019, 11:01 p.m. UTC | #2
On Fri, 18 Jan 2019 22:12:04 +0000
"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:

> +	/* Corrupt page0 and page1, in order to simulate an
> +	 * uncompleted eraseing scenario. Just for case of
> +	 * power loss hits while below programming. in this
> +	 * way, the PEB will be re-erased again.
> +	 */
> +	empty_page_mask |= 0x3;
> +	memset(data_buf, 0xAA,  mtd->writesize);

Why do you use the 0xaa pattern BTW?
Miquel Raynal Jan. 20, 2019, 3:59 p.m. UTC | #3
Hi Bean,

"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote on Fri, 18 Jan 2019
22:12:04 +0000:

> On some legacy planar 2D Micron NAND devices when a
> block erase command is issued, occasionally even
> though a block erase operation successfully completes
> and returns a pass status, the flash block may not be
> completely erased. Subsequent operations to this block
> on very rare cases can result in subtle failures or
> corruption. These extremely rare cases should nevertheless
> be considered.
> 
> These rare occurrences have been observed on partially
> written blocks. Partially written blocks are not uncommon
> with UBI/UBIFS.
> 
> To avoid this rare occurrence, we make sure that at least
> 15 pages have been programmed to a block before it is erased.
> In case we find that less than 15 pages have been programmed,
> additional pages are programmed in the block. The observation
> is that additional pages rarely need to be written

I would stop the commit message here and remove the end of the sentence
which, I believe, is inaccurate.

> and most of
> the time UBI/UBIFS erases blocks that contain more programmed
> pages.
> 
> Signed-off-by: Bean Huo <beanhuo@micron.com>
> Reviewed-by: ZOLTAN SZUBBOCSEV <zszubbocsev@micron.com>

Could you write this name in usual upper/lower case like "Zoltan
Szubbocsev"?

> ---
>  drivers/mtd/nand/raw/nand_micron.c | 119 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 119 insertions(+)
> 
> diff --git a/drivers/mtd/nand/raw/nand_micron.c b/drivers/mtd/nand/raw/nand_micron.c
> index b85e1c1..f52e072 100644
> --- a/drivers/mtd/nand/raw/nand_micron.c
> +++ b/drivers/mtd/nand/raw/nand_micron.c
> @@ -541,8 +541,127 @@ static void micron_fixup_onfi_param_page(struct nand_chip *chip,
>  		p->revision = cpu_to_le16(ONFI_VERSION_1_0);
>  }
>  
> +static int check_page_if_emtpy(struct nand_chip *chip, char *data)

s/if/is
s/emtpy/empty/
s/char *data/void *data/

I would rename this function "nand_check_erased_page" to follow the
current naming and move it to nand_base.c right after
nand_check_erased_ecc_chunk(). Please also add kernel doc following the
other functions pattern.

> +{
> +	struct mtd_info *mtd = nand_to_mtd(chip);
> +	int ret, i;
> +	void *databuf, *eccbuf;
> +	int max_bitflips;
> +	struct mtd_oob_region oobregion;
> +
> +	mtd_ooblayout_ecc(mtd, 0, &oobregion);
> +	eccbuf = chip->oob_poi + oobregion.offset;
> +	databuf = data;
> +	max_bitflips = 0;
> +
> +	for (i = 0; i < chip->ecc.steps; i++) {
> +		ret = nand_check_erased_ecc_chunk(data,
> +						  chip->ecc.size,
> +						  eccbuf,
> +						  chip->ecc.bytes,
> +						  NULL, 0,
> +						  chip->ecc.strength);
> +		if (ret >= 0)
> +			max_bitflips = max(ret, max_bitflips);
> +		else
> +			return false;

		if (ret < 0)
			return false

		max_bitflips = max(ret, max_bitflips);

> +
> +		databuf += chip->ecc.size;
> +		eccbuf += chip->ecc.bytes;
> +
> +	}
> +	/*
> +	 * As for the empty/erased page, bitflips number should be zero or
> +	 * at least less than the bitflip_threshold.
> +	 */
> +	if (max_bitflips > mtd->bitflip_threshold)
> +		return false;
> +	else
> +		return true;

Why no just:

	return max_bitflips < mtd->bitflip_threshold;

Mind that currently, the check for returning -EUCLEAN to the upper
layer (meaning, there are too much bitflips) is "max_bitflips >=
mtd->bitflip_threshold", hence the use of '<' in my proposal.

> +}
> +
> +#define LAST_CHECKPU_PAGE 13

I would place this at the top of the micron_nand.c file

> +
> +static int before_erase_checkup(struct nand_chip *chip, int page)

Can you prefix this function with "micron_nand_" and use the same name
as what you will choose for the callback entry in nand_chip?

> +{
> +	struct mtd_info *mtd = nand_to_mtd(chip);
> +	u8 *data_buf;

"u8 *data" is enough

> +	int ret;
> +	uint32_t empty_page_mask = 0;

use u32 instead of uint32_t
> +
> +	 /* Only for legacy planar 2D parallels NAND. */

Extra space before /*, please run checkpatch.pl --strict and fix all
the warnings/checks that it reports.

> +	if ((mtd->type != MTD_NANDFLASH) || (chip->bits_per_cell != 1))
> +		return 0;
> +
> +	data_buf =  kmalloc(mtd->writesize, GFP_KERNEL);

Extra space before kmalloc.

You allocate just enough for storing in-band data (mtd->writesize) and
then you pass data_buf to the function checking for the page being
empty, which will then check behind the end of the allocated area. You
want to check if the page is empty, so you must allocate the oobsize
too.

However, I agree with Boris, why not just writing the pages directly?

> +	if (!data_buf)
> +		return -ENOMEM;
> +
> +	memset(data_buf, 0xFF, mtd->writesize);

Is this really needed?

> +	/*
> +	 * pgs[] contains pages need to be checked. We firstly check
                         ^the  ^that                   ^first
> +	 * page13,because, for the most of cases, PEB being erased is
> +	 * not partially programmed. If page13 contains data, we directly
> +	 * return since it is not a partially programmed PEB. Otherwise,
> +	 * then we check page0. And if page0 is programmed, and page13
> +	 * is not programmed, then we start to check from page11, page9,
> +	 * page7, page5, page3 respectively since the pages of PEB are
> +	 * programmed sequentially. We olny check odd page.
> +	 */
> +

Is this a hard rule that pages are programmed sequentially?

Why only odd pages?

[...]

Thanks,
Miquèl
Boris Brezillon Jan. 20, 2019, 5:31 p.m. UTC | #4
On Fri, 18 Jan 2019 23:39:43 +0100
Boris Brezillon <bbrezillon@kernel.org> wrote:


> > Otherwise,
> > +	 * then we check page0. And if page0 is programmed, and page13
> > +	 * is not programmed, then we start to check from page11, page9,
> > +	 * page7, page5, page3 respectively since the pages of PEB are
> > +	 * programmed sequentially.  
> 
> Looks overly complex for only a small gain. Did you try writing X (X
> being 14 in this case) pages all the time? If you did, how does it
> compare to this version (perf-wise). I suspect that reading pages
> before potentially overwriting them will actually take more time than
> blindly overwriting 14 pages with 0x00.

I looked at various datasheets and PROG time is indeed much bigger than
READ time, so the benefit of reading before writing is mainly dependent
on the page transfer time on the bus, which is highly dependent on the
controller and the page size. Maybe it's not such a bad idea to try to
figure out which pages have been written before overwriting them (but
in some cases it might be worse than directly overwriting the 16
first pages).

In any case, I think it'd be good to keep track of which pages have been
programmed at runtime. Assuming you only want to track the 16 first
pages, all you'll need is an u16 (bitmap) per block. When any of the 16
first pages of a block is written you set the corresponding bit, when
the block is erased you clear the u16 entry. This way, you only have to
figure out which blocks are partially written once after a cold boot.
Boris Brezillon Jan. 20, 2019, 5:36 p.m. UTC | #5
On Fri, 18 Jan 2019 22:12:04 +0000
"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:

> +static int check_page_if_emtpy(struct nand_chip *chip, char *data)
> +{
> +	struct mtd_info *mtd = nand_to_mtd(chip);
> +	int ret, i;
> +	void *databuf, *eccbuf;
> +	int max_bitflips;
> +	struct mtd_oob_region oobregion;
> +
> +	mtd_ooblayout_ecc(mtd, 0, &oobregion);
> +	eccbuf = chip->oob_poi + oobregion.offset;
> +	databuf = data;
> +	max_bitflips = 0;
> +
> +	for (i = 0; i < chip->ecc.steps; i++) {
> +		ret = nand_check_erased_ecc_chunk(data,
> +						  chip->ecc.size,
> +						  eccbuf,
> +						  chip->ecc.bytes,
> +						  NULL, 0,
> +						  chip->ecc.strength);
> +		if (ret >= 0)
> +			max_bitflips = max(ret, max_bitflips);
> +		else
> +			return false;
> +
> +		databuf += chip->ecc.size;
> +		eccbuf += chip->ecc.bytes;
> +
> +	}

You should check the whole page and not only in-band-data+ECC bytes.
Plus, the "overwrite page" trigger is not dependent on the ECC
strength, but more something related to the NAND chip itself so using
ecc->strength as a threshold sounds like a bad idea to me.
Bean Huo Jan. 21, 2019, 10:04 a.m. UTC | #6
Hi, Boris

>On Fri, 18 Jan 2019 22:12:04 +0000
>"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:
>
>> +	/* Corrupt page0 and page1, in order to simulate an
>> +	 * uncompleted eraseing scenario. Just for case of
>> +	 * power loss hits while below programming. in this
>> +	 * way, the PEB will be re-erased again.
>> +	 */
>> +	empty_page_mask |= 0x3;
>> +	memset(data_buf, 0xAA,  mtd->writesize);
>
>Why do you use the 0xaa pattern BTW?

Random pattern or any pattern is ok. Just to fill in page.

//Bean
Boris Brezillon Jan. 21, 2019, 10:12 a.m. UTC | #7
On Mon, 21 Jan 2019 10:04:15 +0000
"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:

> Hi, Boris
> 
> >On Fri, 18 Jan 2019 22:12:04 +0000
> >"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:
> >  
> >> +	/* Corrupt page0 and page1, in order to simulate an
> >> +	 * uncompleted eraseing scenario. Just for case of
> >> +	 * power loss hits while below programming. in this
> >> +	 * way, the PEB will be re-erased again.
> >> +	 */
> >> +	empty_page_mask |= 0x3;
> >> +	memset(data_buf, 0xAA,  mtd->writesize);  
> >
> >Why do you use the 0xaa pattern BTW?  
> 
> Random pattern or any pattern is ok. Just to fill in page.

Let's use 0x0 then. This way all cells are actually in a "programmed"
state. BTW, I'd still be interested in knowing more about the root
cause of this issue. What causes this wrong "cell is erased" detection
in your chips? I thought the ERASE operation was an iterative process
and cells were being tested after each step to know whether they are
erased or not in order to decide to do another step or stop.
Am I wrong? What happens here to cause this erroneous detection?
Bean Huo Jan. 21, 2019, 12:28 p.m. UTC | #8
Hi, Boris

>> +
>> +	 /* Only for legacy planar 2D parallels NAND. */
>> +	if ((mtd->type != MTD_NANDFLASH) || (chip->bits_per_cell != 1))
>> +		return 0;
>
>Are you implying that all your SLC chips are impacted by this issue, really?
>

In principle, all SLC NAND devices can show a sensitivity of the erase operation 
to block filling prior to erase. On certain devices it is almost undetectable. 
Nonetheless, given the expected improvement in robustness and the negligible impact
of the patch on performance, we recommend it to be applied to all SLC NAND devices. 
This will simplify maintenance and minimize the risk of mistakes with parts requiring
the patch not getting it.

>> +
>> +	data_buf =  kmalloc(mtd->writesize, GFP_KERNEL);
>> +	if (!data_buf)
>> +		return -ENOMEM;
>> +
>> +	memset(data_buf, 0xFF, mtd->writesize);
>> +	/*
>> +	 * pgs[] contains pages need to be checked. We firstly check
>> +	 * page13,because, for the most of cases, PEB being erased is
>> +	 * not partially programmed. If page13 contains data, we directly
>> +	 * return since it is not a partially programmed PEB.
>
>So now you say the minimum amount of written pages is 14 not 15? In your
>previous patch, the magic number was 15 (and the commit log says 15 too).
>
it is still 15 pages, page0~page14. But it is now odd pages, there will one less odd pages than
even pages.

>> Otherwise,
>> +	 * then we check page0. And if page0 is programmed, and page13
>> +	 * is not programmed, then we start to check from page11, page9,
>> +	 * page7, page5, page3 respectively since the pages of PEB are
>> +	 * programmed sequentially.
>
>Looks overly complex for only a small gain. Did you try writing X (X being 14 in this
>case) pages all the time? If you did, how does it compare to this version (perf-
>wise). I suspect that reading pages before potentially overwriting them will
>actually take more time than blindly overwriting 14 pages with 0x00.
>
>> We olny check odd page.
>
>Why?!
>
>> +	 */
>> +
>> +	char pgs[] = {13, 0, 11, 9, 7, 5, 3};
>> +	int  c = ARRAY_SIZE(pgs) - 1;
>> +	int i;
>> +
>> +	for (i = 0 ; i <=  c; i++) {
>> +		ret = nand_read_page_raw(chip, data_buf, 1, page + pgs[i]);
>> +		if (ret)
>> +			continue; /* Read error, we just skip it now. */
>> +		if (check_page_if_emtpy(chip, data_buf) == true) {
>> +			if (pgs[i] == 0)
>> +			/* page0 is not programmed. */
>> +				goto free_buf;
>> +
>> +			/* Mark page need to program later */
>> +			empty_page_mask |= (1 << pgs[i]);
>> +		} else {
>> +			if (pgs[i] == LAST_CHECKPU_PAGE)
>> +			/*
>> +			 * If page13 is programmed, this block has
>> +			 * met the requirement of robust erase.
>> +			 */
>> +				goto free_buf;
>> +		}
>> +	}
>> +	/* Corrupt page0 and page1, in order to simulate an
>> +	 * uncompleted eraseing scenario. Just for case of
>> +	 * power loss hits while below programming. in this
>> +	 * way, the PEB will be re-erased again.
>
>Looks like the "erase the first 2 pages" thing is here to corrupt UBI's VID and EC
>headers. Is this really needed? We should definitely not assume the NAND user is
>UBI, and that's exactly what you're doing here.
>
We must deal with all sorts of power loss, if other NAND based file systems
Have the same behavior with UBIFS to deal with un-completed erasing.
This is not just for UBIFS.

//Bean
Boris Brezillon Jan. 21, 2019, 12:44 p.m. UTC | #9
On Mon, 21 Jan 2019 12:28:17 +0000
"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:

> Hi, Boris
> 
> >> +
> >> +	 /* Only for legacy planar 2D parallels NAND. */
> >> +	if ((mtd->type != MTD_NANDFLASH) || (chip->bits_per_cell != 1))
> >> +		return 0;  
> >
> >Are you implying that all your SLC chips are impacted by this issue, really?
> >  
> 
> In principle, all SLC NAND devices can show a sensitivity of the erase operation 
> to block filling prior to erase. On certain devices it is almost undetectable.

Can you point me to docs/papers describing this effect? It's still
something I do not understand so I'd like to have more background on
what this effect is and what makes it more likely to happen on modern
SLC NANDs.
 
> Nonetheless, given the expected improvement in robustness and the negligible impact
> of the patch on performance, we recommend it to be applied to all SLC NAND devices.

Again, the impact will vary greatly depending on how efficient the
controller is WRT to data transfer on the bus, so saying the impact is
negligible is a lie. I do agree on the "reliability is more important
than perf" aspect though.

> This will simplify maintenance and minimize the risk of mistakes with parts requiring
> the patch not getting it.

That's also true.

> 
> >> +
> >> +	data_buf =  kmalloc(mtd->writesize, GFP_KERNEL);
> >> +	if (!data_buf)
> >> +		return -ENOMEM;
> >> +
> >> +	memset(data_buf, 0xFF, mtd->writesize);
> >> +	/*
> >> +	 * pgs[] contains pages need to be checked. We firstly check
> >> +	 * page13,because, for the most of cases, PEB being erased is
> >> +	 * not partially programmed. If page13 contains data, we directly
> >> +	 * return since it is not a partially programmed PEB.  
> >
> >So now you say the minimum amount of written pages is 14 not 15? In your
> >previous patch, the magic number was 15 (and the commit log says 15 too).
> >  
> it is still 15 pages, page0~page14. But it is now odd pages, there will one less odd pages than
> even pages.

Why only odd pages? You did not reply to this question yet, and I
really want to understand why.

> >> +	 */
> >> +
> >> +	char pgs[] = {13, 0, 11, 9, 7, 5, 3};
> >> +	int  c = ARRAY_SIZE(pgs) - 1;
> >> +	int i;
> >> +
> >> +	for (i = 0 ; i <=  c; i++) {
> >> +		ret = nand_read_page_raw(chip, data_buf, 1, page + pgs[i]);
> >> +		if (ret)
> >> +			continue; /* Read error, we just skip it now. */
> >> +		if (check_page_if_emtpy(chip, data_buf) == true) {
> >> +			if (pgs[i] == 0)
> >> +			/* page0 is not programmed. */
> >> +				goto free_buf;
> >> +
> >> +			/* Mark page need to program later */
> >> +			empty_page_mask |= (1 << pgs[i]);
> >> +		} else {
> >> +			if (pgs[i] == LAST_CHECKPU_PAGE)
> >> +			/*
> >> +			 * If page13 is programmed, this block has
> >> +			 * met the requirement of robust erase.
> >> +			 */
> >> +				goto free_buf;
> >> +		}
> >> +	}
> >> +	/* Corrupt page0 and page1, in order to simulate an
> >> +	 * uncompleted eraseing scenario. Just for case of
> >> +	 * power loss hits while below programming. in this
> >> +	 * way, the PEB will be re-erased again.  
> >
> >Looks like the "erase the first 2 pages" thing is here to corrupt UBI's VID and EC
> >headers. Is this really needed? We should definitely not assume the NAND user is
> >UBI, and that's exactly what you're doing here.
> >  
> We must deal with all sorts of power loss, if other NAND based file systems
> Have the same behavior with UBIFS to deal with un-completed erasing.
> This is not just for UBIFS.

I do agree with that, except you're doing exactly the opposite: assuming
UBI is used and corrupting the first 2 pages is enough. That's just
wrong, and it's a typical example of UBI behavior leaking into the lower
layers of the MTD stack. To make it clear, incomplete erase can happen,
and wear-leveling/FS layers should be able to deal with that without
expecting the NAND driver to corrupt the first 2 pages of the block.

> 
> //Bean
Bean Huo Jan. 24, 2019, 3:47 p.m. UTC | #10
Hi, Miquel

>
>> On some legacy planar 2D Micron NAND devices when a block erase
>> command is issued, occasionally even though a block erase operation
>> successfully completes and returns a pass status, the flash block may
>> not be completely erased. Subsequent operations to this block on very
>> rare cases can result in subtle failures or corruption. These
>> extremely rare cases should nevertheless be considered.
>>
>> These rare occurrences have been observed on partially written blocks.
>> Partially written blocks are not uncommon with UBI/UBIFS.
>>
>> To avoid this rare occurrence, we make sure that at least
>> 15 pages have been programmed to a block before it is erased.
>> In case we find that less than 15 pages have been programmed,
>> additional pages are programmed in the block. The observation is that
>> additional pages rarely need to be written
>
>I would stop the commit message here and remove the end of the sentence
>which, I believe, is inaccurate.
>
I don't understand where is inaccurate.

>> and most of
>> the time UBI/UBIFS erases blocks that contain more programmed pages.
>>
>> Signed-off-by: Bean Huo <beanhuo@micron.com>
>> Reviewed-by: ZOLTAN SZUBBOCSEV <zszubbocsev@micron.com>
>
>Could you write this name in usual upper/lower case like "Zoltan Szubbocsev"?
>

Sorry for this typo, fixed in next version.

.....
>>
>> +static int check_page_if_emtpy(struct nand_chip *chip, char *data)
>
>s/if/is
>s/emtpy/empty/
>s/char *data/void *data/
>
>I would rename this function "nand_check_erased_page" to follow the current
>naming and move it to nand_base.c right after nand_check_erased_ecc_chunk().
>Please also add kernel doc following the other functions pattern.
>
Thanks, will add in next version.

>> +{
>> +	struct mtd_info *mtd = nand_to_mtd(chip);
>> +	int ret, i;
>> +	void *databuf, *eccbuf;
>> +	int max_bitflips;
>> +	struct mtd_oob_region oobregion;
>> +
>> +	mtd_ooblayout_ecc(mtd, 0, &oobregion);
>> +	eccbuf = chip->oob_poi + oobregion.offset;
>> +	databuf = data;
>> +	max_bitflips = 0;
>> +
>> +	for (i = 0; i < chip->ecc.steps; i++) {
>> +		ret = nand_check_erased_ecc_chunk(data,
>> +						  chip->ecc.size,
>> +						  eccbuf,
>> +						  chip->ecc.bytes,
>> +						  NULL, 0,
>> +						  chip->ecc.strength);
>> +		if (ret >= 0)
>> +			max_bitflips = max(ret, max_bitflips);
>> +		else
>> +			return false;
>
>		if (ret < 0)
>			return false
>
>		max_bitflips = max(ret, max_bitflips);
>
>> +
>> +		databuf += chip->ecc.size;
>> +		eccbuf += chip->ecc.bytes;
>> +
>> +	}
>> +	/*
>> +	 * As for the empty/erased page, bitflips number should be zero or
>> +	 * at least less than the bitflip_threshold.
>> +	 */
>> +	if (max_bitflips > mtd->bitflip_threshold)
>> +		return false;
>> +	else
>> +		return true;
>
>Why no just:
>
>	return max_bitflips < mtd->bitflip_threshold;
>
>Mind that currently, the check for returning -EUCLEAN to the upper layer
>(meaning, there are too much bitflips) is "max_bitflips >=
>mtd->bitflip_threshold", hence the use of '<' in my proposal.
>


Ok, will add.

>> +}
>> +
>> +#define LAST_CHECKPU_PAGE 13
>
>I would place this at the top of the micron_nand.c file
>
>> +
>> +static int before_erase_checkup(struct nand_chip *chip, int page)
>
>Can you prefix this function with "micron_nand_" and use the same name as what
>you will choose for the callback entry in nand_chip?
>

Will fix.

//Bean
Miquel Raynal Jan. 24, 2019, 8:02 p.m. UTC | #11
Hi Bean,

"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote on Thu, 24 Jan 2019
15:47:25 +0000:

> Hi, Miquel
> 
> >  
> >> On some legacy planar 2D Micron NAND devices when a block erase
> >> command is issued, occasionally even though a block erase operation
> >> successfully completes and returns a pass status, the flash block may
> >> not be completely erased. Subsequent operations to this block on very
> >> rare cases can result in subtle failures or corruption. These
> >> extremely rare cases should nevertheless be considered.
> >>
> >> These rare occurrences have been observed on partially written blocks.
> >> Partially written blocks are not uncommon with UBI/UBIFS.
> >>
> >> To avoid this rare occurrence, we make sure that at least
> >> 15 pages have been programmed to a block before it is erased.
> >> In case we find that less than 15 pages have been programmed,
> >> additional pages are programmed in the block. The observation is that
> >> additional pages rarely need to be written  
> >
> >I would stop the commit message here and remove the end of the sentence
> >which, I believe, is inaccurate.
> >  
> I don't understand where is inaccurate.
> 
> >> and most of
> >> the time UBI/UBIFS erases blocks that contain more programmed pages.

Because UBI/UBIFS is *one* user of MTD. This patch has nothing to do
with UBI/UBIFS. Plus, the sentence is not very clear to me anyway.


Thanks,
Miquèl
Bean Huo Jan. 25, 2019, 1:31 p.m. UTC | #12
Hi, Boris

>
>On Mon, 21 Jan 2019 10:04:15 +0000
>"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:
>
>> Hi, Boris
>>
>> >On Fri, 18 Jan 2019 22:12:04 +0000
>> >"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:
>> >
>> >> +	/* Corrupt page0 and page1, in order to simulate an
>> >> +	 * uncompleted eraseing scenario. Just for case of
>> >> +	 * power loss hits while below programming. in this
>> >> +	 * way, the PEB will be re-erased again.
>> >> +	 */
>> >> +	empty_page_mask |= 0x3;
>> >> +	memset(data_buf, 0xAA,  mtd->writesize);
>> >
>> >Why do you use the 0xaa pattern BTW?
>>
>> Random pattern or any pattern is ok. Just to fill in page.
>
>Let's use 0x0 then. This way all cells are actually in a "programmed"
>state. 
The proposed solution is effective in addressing the problem. A 00h pattern would also
work but it would add unnecessary stress to the device, by programming more memory cells.

>BTW, I'd still be interested in knowing more about the root cause of this
>issue. What causes this wrong "cell is erased" detection in your chips? I thought
>the ERASE operation was an iterative process and cells were being tested after
>each step to know whether they are erased or not in order to decide to do
>another step or stop.
>Am I wrong? What happens here to cause this erroneous detection?

I cannot comment on the details of the detection procedure, which is proprietary algorithm. The patch ensures that the erase detection is accurate.

//Bean
Bean Huo Jan. 25, 2019, 1:39 p.m. UTC | #13
>> >
>> it is still 15 pages, page0~page14. But it is now odd pages, there
>> will one less odd pages than even pages.
>
>Why only odd pages? You did not reply to this question yet, and I really want to
>understand why.
>

Programming only odd pages will be effective, while reducing the number of programmed pages.
This is related to the proprietary device architecture and erase procedure.
Boris Brezillon Jan. 25, 2019, 2:10 p.m. UTC | #14
On Fri, 25 Jan 2019 13:39:13 +0000
"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:

> >> >  
> >> it is still 15 pages, page0~page14. But it is now odd pages, there
> >> will one less odd pages than even pages.  
> >
> >Why only odd pages? You did not reply to this question yet, and I really want to
> >understand why.
> >  
> 
> Programming only odd pages will be effective, while reducing the number of programmed pages.
> This is related to the proprietary device architecture and erase procedure. 
> 

I'm clearly not happy with this explanation, but it looks like I don't
have a choice here. Please add a comment explaining that this is Micron
black-magic and we have no choice but to trust you on that one because
it's *confidential*. I hope you're right in your analysis...
Boris Brezillon Jan. 25, 2019, 2:15 p.m. UTC | #15
On Fri, 25 Jan 2019 13:31:17 +0000
"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:

> Hi, Boris
> 
> >
> >On Mon, 21 Jan 2019 10:04:15 +0000
> >"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:
> >  
> >> Hi, Boris
> >>  
> >> >On Fri, 18 Jan 2019 22:12:04 +0000
> >> >"Bean Huo (beanhuo)" <beanhuo@micron.com> wrote:
> >> >  
> >> >> +	/* Corrupt page0 and page1, in order to simulate an
> >> >> +	 * uncompleted eraseing scenario. Just for case of
> >> >> +	 * power loss hits while below programming. in this
> >> >> +	 * way, the PEB will be re-erased again.
> >> >> +	 */
> >> >> +	empty_page_mask |= 0x3;
> >> >> +	memset(data_buf, 0xAA,  mtd->writesize);  
> >> >
> >> >Why do you use the 0xaa pattern BTW?  
> >>
> >> Random pattern or any pattern is ok. Just to fill in page.  
> >
> >Let's use 0x0 then. This way all cells are actually in a "programmed"
> >state.   
> The proposed solution is effective in addressing the problem. A 00h pattern would also
> work but it would add unnecessary stress to the device, by programming more memory cells.

Then add a comment explaining that. Oh, BTW, you'll anyway wear the
block out pretty quickly since you're always programming the same cells
to 0, right?

> 
> >BTW, I'd still be interested in knowing more about the root cause of this
> >issue. What causes this wrong "cell is erased" detection in your chips? I thought
> >the ERASE operation was an iterative process and cells were being tested after
> >each step to know whether they are erased or not in order to decide to do
> >another step or stop.
> >Am I wrong? What happens here to cause this erroneous detection?  
> 
> I cannot comment on the details of the detection procedure, which is proprietary algorithm.

Which is a shame, but enough on that, I'm pissed off by Micron's
behavior and you already know it.

> The patch ensures that the erase detection is accurate.

As said in the other mail, I hope you're right...
diff mbox series

Patch

diff --git a/drivers/mtd/nand/raw/nand_micron.c b/drivers/mtd/nand/raw/nand_micron.c
index b85e1c1..f52e072 100644
--- a/drivers/mtd/nand/raw/nand_micron.c
+++ b/drivers/mtd/nand/raw/nand_micron.c
@@ -541,8 +541,127 @@  static void micron_fixup_onfi_param_page(struct nand_chip *chip,
 		p->revision = cpu_to_le16(ONFI_VERSION_1_0);
 }
 
+static int check_page_if_emtpy(struct nand_chip *chip, char *data)
+{
+	struct mtd_info *mtd = nand_to_mtd(chip);
+	int ret, i;
+	void *databuf, *eccbuf;
+	int max_bitflips;
+	struct mtd_oob_region oobregion;
+
+	mtd_ooblayout_ecc(mtd, 0, &oobregion);
+	eccbuf = chip->oob_poi + oobregion.offset;
+	databuf = data;
+	max_bitflips = 0;
+
+	for (i = 0; i < chip->ecc.steps; i++) {
+		ret = nand_check_erased_ecc_chunk(data,
+						  chip->ecc.size,
+						  eccbuf,
+						  chip->ecc.bytes,
+						  NULL, 0,
+						  chip->ecc.strength);
+		if (ret >= 0)
+			max_bitflips = max(ret, max_bitflips);
+		else
+			return false;
+
+		databuf += chip->ecc.size;
+		eccbuf += chip->ecc.bytes;
+
+	}
+	/*
+	 * As for the empty/erased page, bitflips number should be zero or
+	 * at least less than the bitflip_threshold.
+	 */
+	if (max_bitflips > mtd->bitflip_threshold)
+		return false;
+	else
+		return true;
+}
+
+#define LAST_CHECKPU_PAGE 13
+
+static int before_erase_checkup(struct nand_chip *chip, int page)
+{
+	struct mtd_info *mtd = nand_to_mtd(chip);
+	u8 *data_buf;
+	int ret;
+	uint32_t empty_page_mask = 0;
+
+	 /* Only for legacy planar 2D parallels NAND. */
+	if ((mtd->type != MTD_NANDFLASH) || (chip->bits_per_cell != 1))
+		return 0;
+
+	data_buf =  kmalloc(mtd->writesize, GFP_KERNEL);
+	if (!data_buf)
+		return -ENOMEM;
+
+	memset(data_buf, 0xFF, mtd->writesize);
+	/*
+	 * pgs[] contains pages need to be checked. We firstly check
+	 * page13,because, for the most of cases, PEB being erased is
+	 * not partially programmed. If page13 contains data, we directly
+	 * return since it is not a partially programmed PEB. Otherwise,
+	 * then we check page0. And if page0 is programmed, and page13
+	 * is not programmed, then we start to check from page11, page9,
+	 * page7, page5, page3 respectively since the pages of PEB are
+	 * programmed sequentially. We olny check odd page.
+	 */
+
+	char pgs[] = {13, 0, 11, 9, 7, 5, 3};
+	int  c = ARRAY_SIZE(pgs) - 1;
+	int i;
+
+	for (i = 0 ; i <=  c; i++) {
+		ret = nand_read_page_raw(chip, data_buf, 1, page + pgs[i]);
+		if (ret)
+			continue; /* Read error, we just skip it now. */
+		if (check_page_if_emtpy(chip, data_buf) == true) {
+			if (pgs[i] == 0)
+			/* page0 is not programmed. */
+				goto free_buf;
+
+			/* Mark page need to program later */
+			empty_page_mask |= (1 << pgs[i]);
+		} else {
+			if (pgs[i] == LAST_CHECKPU_PAGE)
+			/*
+			 * If page13 is programmed, this block has
+			 * met the requirement of robust erase.
+			 */
+				goto free_buf;
+		}
+	}
+	/* Corrupt page0 and page1, in order to simulate an
+	 * uncompleted eraseing scenario. Just for case of
+	 * power loss hits while below programming. in this
+	 * way, the PEB will be re-erased again.
+	 */
+	empty_page_mask |= 0x3;
+	memset(data_buf, 0xAA,  mtd->writesize);
+
+	for (i = 0; i <= LAST_CHECKPU_PAGE; ) {
+
+	/* Start to program empty page, we only program the odd page. */
+		if (empty_page_mask & (1 << i))
+			nand_write_page_raw(chip, data_buf, 1, page + i);
+
+
+		if (i == 0)
+			i = 1;
+		else
+			i += 2;
+	}
+
+free_buf:
+	kfree(data_buf);
+	return 0;
+}
+
 const struct nand_manufacturer_ops micron_nand_manuf_ops = {
 	.init = micron_nand_init,
 	.cleanup = micron_nand_cleanup,
 	.fixup_onfi_param_page = micron_fixup_onfi_param_page,
+	.erase_pre = before_erase_checkup,
 };