diff mbox

[RFC,1/2] mtd: nand: add nand_check_erased helper functions

Message ID 1438277694-23763-2-git-send-email-boris.brezillon@free-electrons.com
State Superseded
Headers show

Commit Message

Boris Brezillon July 30, 2015, 5:34 p.m. UTC
Add two helper functions to help NAND controller drivers test whether a
specific NAND region is erased or not.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
 drivers/mtd/nand/nand_base.c | 104 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/nand.h     |   8 ++++
 2 files changed, 112 insertions(+)

Comments

Boris Brezillon July 31, 2015, 7:10 a.m. UTC | #1
On Thu, 30 Jul 2015 19:34:53 +0200
Boris Brezillon <boris.brezillon@free-electrons.com> wrote:

> Add two helper functions to help NAND controller drivers test whether a
> specific NAND region is erased or not.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
> ---
>  drivers/mtd/nand/nand_base.c | 104 +++++++++++++++++++++++++++++++++++++++++++
>  include/linux/mtd/nand.h     |   8 ++++
>  2 files changed, 112 insertions(+)
> 
> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
> index ceb68ca..1542ea7 100644
> --- a/drivers/mtd/nand/nand_base.c
> +++ b/drivers/mtd/nand/nand_base.c
> @@ -1101,6 +1101,110 @@ out:
>  EXPORT_SYMBOL(nand_lock);
>  
>  /**
> + * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data
> + * @buf: buffer to test
> + * @len: buffer length
> + * @bitflips_threshold:maximum number of bitflips
> + *
> + * Check if a buffer contains only 0xff, which means the underlying region
> + * has been erased and is ready to be programmed.
> + * The bitflips_threshold specify the maximum number of bitflips before
> + * considering the region is not erased.
> + * Note: The logic of this function has been extracted from the memweight
> + * implementation, except that nand_check_erased_buf function exit before
> + * testing the whole buffer if the number of bitflips exceed the
> + * bitflips_threshold value.
> + *
> + * Returns a positive number of bitflips or -ERROR_CODE.
> + */
> +int nand_check_erased_buf(void *buf, int len, int bitflips_threshold)
> +{
> +	const unsigned char *bitmap = buf;
> +	int bitflips = 0;
> +	int weight;
> +	int longs;
> +
> +	for (; len && ((unsigned long)bitmap) % sizeof(long);
> +	     len--, bitmap++) {
> +		weight = hweight8(*bitmap);
> +
> +		bitflips += sizeof(u8) - weight;
> +		if (bitflips > bitflips_threshold)
> +			return -EINVAL;
> +	}
> +
> +
> +	for (longs = len / sizeof(long); longs;
> +	     longs--, bitmap += sizeof(long)) {
> +		BUG_ON(longs >= INT_MAX / BITS_PER_LONG);
> +		weight = hweight_long(*((unsigned long *)bitmap));
> +
> +		bitflips += sizeof(long) - weight;
> +		if (bitflips > bitflips_threshold)
> +			return -EINVAL;
> +	}
> +
> +	len %= sizeof(long);
> +
> +	for (; len > 0; len--, bitmap++) {
> +		weight = hweight8(*bitmap);
> +		bitflips += sizeof(u8) - weight;
> +	}
> +
> +	return bitflips;
> +}
> +EXPORT_SYMBOL(nand_check_erased_buf);
> +
> +/**
> + * nand_check_erased_ecc_chunk - check if an ECC chunk contains (almost) only
> + *				 0xff data
> + * @data: data buffer to test
> + * @datalen: data length
> + * @ecc: ECC buffer
> + * @ecclen: ECC length
> + * @extraoob: extra OOB buffer
> + * @extraooblen: extra OOB length
> + * @bitflips_threshold: maximum number of bitflips
> + *
> + * Check if a data buffer and its associated ECC and OOB data contains only
> + * 0xff pattern, which means the underlying region has been erased and is
> + * ready to be programmed.
> + * The bitflips_threshold specify the maximum number of bitflips before
> + * considering the region as not erased.
> + *
> + * Returns a positive number of bitflips or -ERROR_CODE.
> + */
> +int nand_check_erased_ecc_chunk(void *data, int datalen,
> +				void *ecc, int ecclen,
> +				void *extraoob, int extraooblen,
> +				int bitflips_threshold)
> +{
> +	int bitflips = 0;
> +	int ret;
> +
> +	ret = nand_check_erased_buf(data, datalen, bitflips_threshold);
> +	if (ret < 0)
> +		return ret;
> +
> +	bitflips += ret;
> +	bitflips_threshold -= ret;
> +
> +	ret = nand_check_erased_buf(ecc, ecclen, bitflips_threshold);
> +	if (ret < 0)
> +		return ret;
> +
> +	bitflips += ret;
> +	bitflips_threshold -= ret;
> +
> +	ret = nand_check_erased_buf(extraoob, extraooblen, bitflips_threshold);
> +	if (ret < 0)
> +		return ret;
> +

Forgot the memset operations here:

	memset(data, 0xff, datalen);
	memset(ecc, 0xff, ecclen);
	memset(extraoob, 0xff, extraooblen);

> +	return bitflips + ret;
> +}
> +EXPORT_SYMBOL(nand_check_erased_ecc_chunk);
> +
> +/**
>   * nand_read_page_raw - [INTERN] read raw page data without ecc
>   * @mtd: mtd info structure
>   * @chip: nand chip info structure
> diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
> index 272f429..ae06a07 100644
> --- a/include/linux/mtd/nand.h
> +++ b/include/linux/mtd/nand.h
> @@ -1030,4 +1030,12 @@ struct nand_sdr_timings {
>  
>  /* get timing characteristics from ONFI timing mode. */
>  const struct nand_sdr_timings *onfi_async_timing_mode_to_sdr_timings(int mode);
> +
> +int nand_check_erased_buf(void *data, int datalen,
> +			  int threshold);
> +
> +int nand_check_erased_ecc_chunk(void *data, int datalen,
> +				void *ecc, int ecclen,
> +				void *extraoob, int extraooblen,
> +				int threshold);
>  #endif /* __LINUX_MTD_NAND_H */
rnd4@dave-tech.it July 31, 2015, 10:06 a.m. UTC | #2
Dear Boris,

thanks for pointing this out again.

I'm on the same topic too, using iMX6 (I'll try to test you patch on the 
next days, if I found some spare time, unfortunately I got a 3.10 
kernel, so I think the patch will not apply cleanly :-( ).

See my comment below (and on the next mail too)

Il 31/07/2015 09:10, Boris Brezillon ha scritto:
> On Thu, 30 Jul 2015 19:34:53 +0200
> Boris Brezillon <boris.brezillon@free-electrons.com> wrote:
>
>> Add two helper functions to help NAND controller drivers test whether a
>> specific NAND region is erased or not.
>>
>> Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
>> ---
>>   drivers/mtd/nand/nand_base.c | 104 +++++++++++++++++++++++++++++++++++++++++++
>>   include/linux/mtd/nand.h     |   8 ++++
>>   2 files changed, 112 insertions(+)
>>
>> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
>> index ceb68ca..1542ea7 100644
>> --- a/drivers/mtd/nand/nand_base.c
>> +++ b/drivers/mtd/nand/nand_base.c
>> @@ -1101,6 +1101,110 @@ out:
>>   EXPORT_SYMBOL(nand_lock);
>>
>>   /**
>> + * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data
>> + * @buf: buffer to test
>> + * @len: buffer length
>> + * @bitflips_threshold:maximum number of bitflips
>> + *
>> + * Check if a buffer contains only 0xff, which means the underlying region
>> + * has been erased and is ready to be programmed.
>> + * The bitflips_threshold specify the maximum number of bitflips before
>> + * considering the region is not erased.
>> + * Note: The logic of this function has been extracted from the memweight
>> + * implementation, except that nand_check_erased_buf function exit before
>> + * testing the whole buffer if the number of bitflips exceed the
>> + * bitflips_threshold value.
>> + *
>> + * Returns a positive number of bitflips or -ERROR_CODE.
>> + */
>> +int nand_check_erased_buf(void *buf, int len, int bitflips_threshold)
>> +{
>> +	const unsigned char *bitmap = buf;
>> +	int bitflips = 0;
>> +	int weight;
>> +	int longs;
>> +
>> +	for (; len && ((unsigned long)bitmap) % sizeof(long);
>> +	     len--, bitmap++) {
>> +		weight = hweight8(*bitmap);
>> +
>> +		bitflips += sizeof(u8) - weight;
>> +		if (bitflips > bitflips_threshold)
>> +			return -EINVAL;

I think it's better to do something like:

if (UNLIKELY(bitflips > bitflips_threshold))
	return -EINVAL;

isn't it? :-)
(the same for the other if)


>> +	}
>> +
>> +
>> +	for (longs = len / sizeof(long); longs;
>> +	     longs--, bitmap += sizeof(long)) {
>> +		BUG_ON(longs >= INT_MAX / BITS_PER_LONG);
>> +		weight = hweight_long(*((unsigned long *)bitmap));
>> +
>> +		bitflips += sizeof(long) - weight;
>> +		if (bitflips > bitflips_threshold)
>> +			return -EINVAL;
>> +	}
>> +
>> +	len %= sizeof(long);
>> +
>> +	for (; len > 0; len--, bitmap++) {
>> +		weight = hweight8(*bitmap);
>> +		bitflips += sizeof(u8) - weight;
>> +	}
>> +
>> +	return bitflips;
>> +}
>> +EXPORT_SYMBOL(nand_check_erased_buf);
>> +
>> +/**
>> + * nand_check_erased_ecc_chunk - check if an ECC chunk contains (almost) only
>> + *				 0xff data
>> + * @data: data buffer to test
>> + * @datalen: data length
>> + * @ecc: ECC buffer
>> + * @ecclen: ECC length
>> + * @extraoob: extra OOB buffer
>> + * @extraooblen: extra OOB length
>> + * @bitflips_threshold: maximum number of bitflips
>> + *
>> + * Check if a data buffer and its associated ECC and OOB data contains only
>> + * 0xff pattern, which means the underlying region has been erased and is
>> + * ready to be programmed.
>> + * The bitflips_threshold specify the maximum number of bitflips before
>> + * considering the region as not erased.
>> + *
>> + * Returns a positive number of bitflips or -ERROR_CODE.
>> + */
>> +int nand_check_erased_ecc_chunk(void *data, int datalen,
>> +				void *ecc, int ecclen,
>> +				void *extraoob, int extraooblen,
>> +				int bitflips_threshold)
>> +{
>> +	int bitflips = 0;
>> +	int ret;
>> +
>> +	ret = nand_check_erased_buf(data, datalen, bitflips_threshold);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	bitflips += ret;
>> +	bitflips_threshold -= ret;
>> +
>> +	ret = nand_check_erased_buf(ecc, ecclen, bitflips_threshold);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	bitflips += ret;
>> +	bitflips_threshold -= ret;
>> +
>> +	ret = nand_check_erased_buf(extraoob, extraooblen, bitflips_threshold);
>> +	if (ret < 0)
>> +		return ret;
>> +
>
> Forgot the memset operations here:
>
> 	memset(data, 0xff, datalen);
> 	memset(ecc, 0xff, ecclen);
> 	memset(extraoob, 0xff, extraooblen);

Yes, you're right.. I did the same mistake on my first implementation 
too ;-)

As additional optimization you may also check if the lower layer already 
did the check for you (e.g. if you have an iMXQP as we saw in latest 
days), but I think it's a minor one, because you'll face this situation 
very very unlikely.
Boris Brezillon July 31, 2015, 10:21 a.m. UTC | #3
Hi Andrea,

On Fri, 31 Jul 2015 12:06:32 +0200
Andrea Scian <rnd4@dave-tech.it> wrote:

> 
> Dear Boris,
> 
> thanks for pointing this out again.
> 
> I'm on the same topic too, using iMX6 (I'll try to test you patch on the 
> next days, if I found some spare time, unfortunately I got a 3.10 
> kernel, so I think the patch will not apply cleanly :-( ).
> 
> See my comment below (and on the next mail too)
> 
> Il 31/07/2015 09:10, Boris Brezillon ha scritto:
> > On Thu, 30 Jul 2015 19:34:53 +0200
> > Boris Brezillon <boris.brezillon@free-electrons.com> wrote:
> >
> >> Add two helper functions to help NAND controller drivers test whether a
> >> specific NAND region is erased or not.
> >>
> >> Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
> >> ---
> >>   drivers/mtd/nand/nand_base.c | 104 +++++++++++++++++++++++++++++++++++++++++++
> >>   include/linux/mtd/nand.h     |   8 ++++
> >>   2 files changed, 112 insertions(+)
> >>
> >> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
> >> index ceb68ca..1542ea7 100644
> >> --- a/drivers/mtd/nand/nand_base.c
> >> +++ b/drivers/mtd/nand/nand_base.c
> >> @@ -1101,6 +1101,110 @@ out:
> >>   EXPORT_SYMBOL(nand_lock);
> >>
> >>   /**
> >> + * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data
> >> + * @buf: buffer to test
> >> + * @len: buffer length
> >> + * @bitflips_threshold:maximum number of bitflips
> >> + *
> >> + * Check if a buffer contains only 0xff, which means the underlying region
> >> + * has been erased and is ready to be programmed.
> >> + * The bitflips_threshold specify the maximum number of bitflips before
> >> + * considering the region is not erased.
> >> + * Note: The logic of this function has been extracted from the memweight
> >> + * implementation, except that nand_check_erased_buf function exit before
> >> + * testing the whole buffer if the number of bitflips exceed the
> >> + * bitflips_threshold value.
> >> + *
> >> + * Returns a positive number of bitflips or -ERROR_CODE.
> >> + */
> >> +int nand_check_erased_buf(void *buf, int len, int bitflips_threshold)
> >> +{
> >> +	const unsigned char *bitmap = buf;
> >> +	int bitflips = 0;
> >> +	int weight;
> >> +	int longs;
> >> +
> >> +	for (; len && ((unsigned long)bitmap) % sizeof(long);
> >> +	     len--, bitmap++) {
> >> +		weight = hweight8(*bitmap);
> >> +
> >> +		bitflips += sizeof(u8) - weight;
> >> +		if (bitflips > bitflips_threshold)
> >> +			return -EINVAL;
> 
> I think it's better to do something like:
> 
> if (UNLIKELY(bitflips > bitflips_threshold))
> 	return -EINVAL;
> 
> isn't it? :-)
> (the same for the other if)

Maybe, or maybe not. It depends on whether you expect to have a lot of
corrupted pages or a lot of blank pages with bitflips ;-).
Anyway, I'm not opposed to this change.

> 
> 
> >> +	}
> >> +
> >> +
> >> +	for (longs = len / sizeof(long); longs;
> >> +	     longs--, bitmap += sizeof(long)) {
> >> +		BUG_ON(longs >= INT_MAX / BITS_PER_LONG);
> >> +		weight = hweight_long(*((unsigned long *)bitmap));
> >> +
> >> +		bitflips += sizeof(long) - weight;
> >> +		if (bitflips > bitflips_threshold)
> >> +			return -EINVAL;
> >> +	}
> >> +
> >> +	len %= sizeof(long);
> >> +
> >> +	for (; len > 0; len--, bitmap++) {
> >> +		weight = hweight8(*bitmap);
> >> +		bitflips += sizeof(u8) - weight;
> >> +	}
> >> +
> >> +	return bitflips;
> >> +}
> >> +EXPORT_SYMBOL(nand_check_erased_buf);
> >> +
> >> +/**
> >> + * nand_check_erased_ecc_chunk - check if an ECC chunk contains (almost) only
> >> + *				 0xff data
> >> + * @data: data buffer to test
> >> + * @datalen: data length
> >> + * @ecc: ECC buffer
> >> + * @ecclen: ECC length
> >> + * @extraoob: extra OOB buffer
> >> + * @extraooblen: extra OOB length
> >> + * @bitflips_threshold: maximum number of bitflips
> >> + *
> >> + * Check if a data buffer and its associated ECC and OOB data contains only
> >> + * 0xff pattern, which means the underlying region has been erased and is
> >> + * ready to be programmed.
> >> + * The bitflips_threshold specify the maximum number of bitflips before
> >> + * considering the region as not erased.
> >> + *
> >> + * Returns a positive number of bitflips or -ERROR_CODE.
> >> + */
> >> +int nand_check_erased_ecc_chunk(void *data, int datalen,
> >> +				void *ecc, int ecclen,
> >> +				void *extraoob, int extraooblen,
> >> +				int bitflips_threshold)
> >> +{
> >> +	int bitflips = 0;
> >> +	int ret;
> >> +
> >> +	ret = nand_check_erased_buf(data, datalen, bitflips_threshold);
> >> +	if (ret < 0)
> >> +		return ret;
> >> +
> >> +	bitflips += ret;
> >> +	bitflips_threshold -= ret;
> >> +
> >> +	ret = nand_check_erased_buf(ecc, ecclen, bitflips_threshold);
> >> +	if (ret < 0)
> >> +		return ret;
> >> +
> >> +	bitflips += ret;
> >> +	bitflips_threshold -= ret;
> >> +
> >> +	ret = nand_check_erased_buf(extraoob, extraooblen, bitflips_threshold);
> >> +	if (ret < 0)
> >> +		return ret;
> >> +
> >
> > Forgot the memset operations here:
> >
> > 	memset(data, 0xff, datalen);
> > 	memset(ecc, 0xff, ecclen);
> > 	memset(extraoob, 0xff, extraooblen);
> 
> Yes, you're right.. I did the same mistake on my first implementation 
> too ;-)

Hehe.

> 
> As additional optimization you may also check if the lower layer already 
> did the check for you (e.g. if you have an iMXQP as we saw in latest 
> days), but I think it's a minor one, because you'll face this situation 
> very very unlikely.

If the hardware is capable of doing such test (I mean counting the
number of bits to one and considering the page as erased under a given
limit of bitflips), there's a lot of chance it will implement its own
ecc_read_page function, and will never use this helper.

Best Regards,

Boris
rnd4@dave-tech.it July 31, 2015, 1:29 p.m. UTC | #4
Boris,

Il 31/07/2015 12:21, Boris Brezillon ha scritto:
> Hi Andrea,
>
> On Fri, 31 Jul 2015 12:06:32 +0200
> Andrea Scian <rnd4@dave-tech.it> wrote:
>
>>
>> Dear Boris,
>>
>> thanks for pointing this out again.
>>
>> I'm on the same topic too, using iMX6 (I'll try to test you patch on the
>> next days, if I found some spare time, unfortunately I got a 3.10
>> kernel, so I think the patch will not apply cleanly :-( ).
>>
>> See my comment below (and on the next mail too)
>>
>> Il 31/07/2015 09:10, Boris Brezillon ha scritto:
>>> On Thu, 30 Jul 2015 19:34:53 +0200
>>> Boris Brezillon <boris.brezillon@free-electrons.com> wrote:
>>>
>>>> Add two helper functions to help NAND controller drivers test whether a
>>>> specific NAND region is erased or not.
>>>>
>>>> Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
>>>> ---
>>>>    drivers/mtd/nand/nand_base.c | 104 +++++++++++++++++++++++++++++++++++++++++++
>>>>    include/linux/mtd/nand.h     |   8 ++++
>>>>    2 files changed, 112 insertions(+)
>>>>
>>>> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
>>>> index ceb68ca..1542ea7 100644
>>>> --- a/drivers/mtd/nand/nand_base.c
>>>> +++ b/drivers/mtd/nand/nand_base.c
>>>> @@ -1101,6 +1101,110 @@ out:
>>>>    EXPORT_SYMBOL(nand_lock);
>>>>
>>>>    /**
>>>> + * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data
>>>> + * @buf: buffer to test
>>>> + * @len: buffer length
>>>> + * @bitflips_threshold:maximum number of bitflips
>>>> + *
>>>> + * Check if a buffer contains only 0xff, which means the underlying region
>>>> + * has been erased and is ready to be programmed.
>>>> + * The bitflips_threshold specify the maximum number of bitflips before
>>>> + * considering the region is not erased.
>>>> + * Note: The logic of this function has been extracted from the memweight
>>>> + * implementation, except that nand_check_erased_buf function exit before
>>>> + * testing the whole buffer if the number of bitflips exceed the
>>>> + * bitflips_threshold value.
>>>> + *
>>>> + * Returns a positive number of bitflips or -ERROR_CODE.
>>>> + */
>>>> +int nand_check_erased_buf(void *buf, int len, int bitflips_threshold)
>>>> +{
>>>> +	const unsigned char *bitmap = buf;
>>>> +	int bitflips = 0;
>>>> +	int weight;
>>>> +	int longs;
>>>> +
>>>> +	for (; len && ((unsigned long)bitmap) % sizeof(long);
>>>> +	     len--, bitmap++) {
>>>> +		weight = hweight8(*bitmap);
>>>> +
>>>> +		bitflips += sizeof(u8) - weight;
>>>> +		if (bitflips > bitflips_threshold)
>>>> +			return -EINVAL;
>>
>> I think it's better to do something like:
>>
>> if (UNLIKELY(bitflips > bitflips_threshold))
>> 	return -EINVAL;
>>
>> isn't it? :-)
>> (the same for the other if)
>
> Maybe, or maybe not. It depends on whether you expect to have a lot of
> corrupted pages or a lot of blank pages with bitflips ;-).
> Anyway, I'm not opposed to this change.

I think that everything implemented inside the MTD stack 
(NAND/MTD/UBI/UBIFS) should lead to a "working" solid state device, that 
do not show any uncorrectable bitflips.
Uncorrectable pages, IMO, should happens, on stable systems, only in 
some rare case, because it means that you loss some data (or power 
during erase/write. Any other case?).

What is more frequent is that bitflips > mtd->bitflip_threshold (by 
default DIV_ROUND_UP(mtd->ecc_strength * 3, 4)), which should avoid 
bitflips > ecc_strength

>>
>>
>>>> +	}
>>>> +
>>>> +
>>>> +	for (longs = len / sizeof(long); longs;
>>>> +	     longs--, bitmap += sizeof(long)) {
>>>> +		BUG_ON(longs >= INT_MAX / BITS_PER_LONG);
>>>> +		weight = hweight_long(*((unsigned long *)bitmap));
>>>> +
>>>> +		bitflips += sizeof(long) - weight;
>>>> +		if (bitflips > bitflips_threshold)
>>>> +			return -EINVAL;
>>>> +	}
>>>> +
>>>> +	len %= sizeof(long);
>>>> +
>>>> +	for (; len > 0; len--, bitmap++) {
>>>> +		weight = hweight8(*bitmap);
>>>> +		bitflips += sizeof(u8) - weight;
>>>> +	}
>>>> +
>>>> +	return bitflips;
>>>> +}
>>>> +EXPORT_SYMBOL(nand_check_erased_buf);
>>>> +
>>>> +/**
>>>> + * nand_check_erased_ecc_chunk - check if an ECC chunk contains (almost) only
>>>> + *				 0xff data
>>>> + * @data: data buffer to test
>>>> + * @datalen: data length
>>>> + * @ecc: ECC buffer
>>>> + * @ecclen: ECC length
>>>> + * @extraoob: extra OOB buffer
>>>> + * @extraooblen: extra OOB length
>>>> + * @bitflips_threshold: maximum number of bitflips
>>>> + *
>>>> + * Check if a data buffer and its associated ECC and OOB data contains only
>>>> + * 0xff pattern, which means the underlying region has been erased and is
>>>> + * ready to be programmed.
>>>> + * The bitflips_threshold specify the maximum number of bitflips before
>>>> + * considering the region as not erased.
>>>> + *
>>>> + * Returns a positive number of bitflips or -ERROR_CODE.
>>>> + */
>>>> +int nand_check_erased_ecc_chunk(void *data, int datalen,
>>>> +				void *ecc, int ecclen,
>>>> +				void *extraoob, int extraooblen,
>>>> +				int bitflips_threshold)
>>>> +{
>>>> +	int bitflips = 0;
>>>> +	int ret;
>>>> +
>>>> +	ret = nand_check_erased_buf(data, datalen, bitflips_threshold);
>>>> +	if (ret < 0)
>>>> +		return ret;
>>>> +
>>>> +	bitflips += ret;
>>>> +	bitflips_threshold -= ret;
>>>> +
>>>> +	ret = nand_check_erased_buf(ecc, ecclen, bitflips_threshold);
>>>> +	if (ret < 0)
>>>> +		return ret;
>>>> +
>>>> +	bitflips += ret;
>>>> +	bitflips_threshold -= ret;
>>>> +
>>>> +	ret = nand_check_erased_buf(extraoob, extraooblen, bitflips_threshold);
>>>> +	if (ret < 0)
>>>> +		return ret;
>>>> +
>>>
>>> Forgot the memset operations here:
>>>
>>> 	memset(data, 0xff, datalen);
>>> 	memset(ecc, 0xff, ecclen);
>>> 	memset(extraoob, 0xff, extraooblen);
>>
>> Yes, you're right.. I did the same mistake on my first implementation
>> too ;-)
>
> Hehe.
>
>>
>> As additional optimization you may also check if the lower layer already
>> did the check for you (e.g. if you have an iMXQP as we saw in latest
>> days), but I think it's a minor one, because you'll face this situation
>> very very unlikely.
>
> If the hardware is capable of doing such test (I mean counting the
> number of bits to one and considering the page as erased under a given
> limit of bitflips), there's a lot of chance it will implement its own
> ecc_read_page function, and will never use this helper.
>

Ops.. I misunderstand your patch. I think it was something similar to 
what Brian already proposed some time ago [1].
IIUC Brial solution works, out of the box, even with the ones that 
override read_page callback, as I think most of current nand controller 
do (please correct me if I'm wrong).
If we want to add erased block check to omap2.c, atmel_nand.c, 
sh_flctl.c we have to modify them all.

I'm really not the right one to make such a decision ;-) but I think you 
already thought about it and can tell me the pros and cons of your patch 
vs the Brian's one.

What I understand up until now, is that Brian solution does not fit into 
all weird stuff that we find in single NAND controller implementation 
and this is where your solution come in handy. Am I wrong?

Kind Regards,
Boris Brezillon July 31, 2015, 1:58 p.m. UTC | #5
On Fri, 31 Jul 2015 15:29:21 +0200
Andrea Scian <rnd4@dave-tech.it> wrote:

> >>>> +int nand_check_erased_buf(void *buf, int len, int bitflips_threshold)
> >>>> +{
> >>>> +	const unsigned char *bitmap = buf;
> >>>> +	int bitflips = 0;
> >>>> +	int weight;
> >>>> +	int longs;
> >>>> +
> >>>> +	for (; len && ((unsigned long)bitmap) % sizeof(long);
> >>>> +	     len--, bitmap++) {
> >>>> +		weight = hweight8(*bitmap);
> >>>> +
> >>>> +		bitflips += sizeof(u8) - weight;
> >>>> +		if (bitflips > bitflips_threshold)
> >>>> +			return -EINVAL;
> >>
> >> I think it's better to do something like:
> >>
> >> if (UNLIKELY(bitflips > bitflips_threshold))
> >> 	return -EINVAL;
> >>
> >> isn't it? :-)
> >> (the same for the other if)
> >
> > Maybe, or maybe not. It depends on whether you expect to have a lot of
> > corrupted pages or a lot of blank pages with bitflips ;-).
> > Anyway, I'm not opposed to this change.
> 
> I think that everything implemented inside the MTD stack 
> (NAND/MTD/UBI/UBIFS) should lead to a "working" solid state device, that 
> do not show any uncorrectable bitflips.
> Uncorrectable pages, IMO, should happens, on stable systems, only in 
> some rare case, because it means that you loss some data (or power 
> during erase/write. Any other case?).
> 
> What is more frequent is that bitflips > mtd->bitflip_threshold (by 
> default DIV_ROUND_UP(mtd->ecc_strength * 3, 4)), which should avoid 
> bitflips > ecc_strength

Yes, you're probably right. I'll add the unlikely keyword in the next
version.


> >>
> >> As additional optimization you may also check if the lower layer already
> >> did the check for you (e.g. if you have an iMXQP as we saw in latest
> >> days), but I think it's a minor one, because you'll face this situation
> >> very very unlikely.
> >
> > If the hardware is capable of doing such test (I mean counting the
> > number of bits to one and considering the page as erased under a given
> > limit of bitflips), there's a lot of chance it will implement its own
> > ecc_read_page function, and will never use this helper.
> >
> 
> Ops.. I misunderstand your patch. I think it was something similar to 
> what Brian already proposed some time ago [1].
> IIUC Brial solution works, out of the box, even with the ones that 
> override read_page callback, as I think most of current nand controller 
> do (please correct me if I'm wrong).
> If we want to add erased block check to omap2.c, atmel_nand.c, 
> sh_flctl.c we have to modify them all.
> 
> I'm really not the right one to make such a decision ;-) but I think you 
> already thought about it and can tell me the pros and cons of your patch 
> vs the Brian's one.
> 
> What I understand up until now, is that Brian solution does not fit into 
> all weird stuff that we find in single NAND controller implementation 
> and this is where your solution come in handy. Am I wrong?

That's exactly why I'm proposing it as helper functions instead of
trying to apply the erased page check for everybody.
Here are a few missing things to make the test applicable to everybody:
- some controllers are not implementing the read_page_raw function and
  thus checking ECC bytes is impossible
- some of them are not able to retrieve OOB bytes (or are only able to
  retrieve a small portion of it)
- some controllers do not properly define the ECC layout, which makes
  it impossible to check which ECC byte is assigned to which ECC chunk.
- some ECC controllers (like the GPMI one) do not align ECC data on a
  byte, which renders the erased check even more complicated

The other reason to not enforce this test for everybody is that some
controllers generate 0xff ECC bytes for 0xff data (the sama5d4 NAND
controller does), which means they are fully capable of correcting
bitflips in erased pages by their own, and if they report a read
failure, there's no need to check for an empty page, this is a real
failure.

By only providing helper functions, we let the NAND controller
drivers decide how to check it, but we're still providing common
functions instead of duplicating the same code in all drivers.

Best Regards,

Boris
rnd4@dave-tech.it Aug. 4, 2015, 3:42 p.m. UTC | #6
Boris,

sorry for the later review.
I'm trying to run your patch on our systems. See below

Il 31/07/2015 09:10, Boris Brezillon ha scritto:
> On Thu, 30 Jul 2015 19:34:53 +0200
> Boris Brezillon <boris.brezillon@free-electrons.com> wrote:
>
>> Add two helper functions to help NAND controller drivers test whether a
>> specific NAND region is erased or not.
>>
>> Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
>> ---
>>   drivers/mtd/nand/nand_base.c | 104 +++++++++++++++++++++++++++++++++++++++++++
>>   include/linux/mtd/nand.h     |   8 ++++
>>   2 files changed, 112 insertions(+)
>>
>> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
>> index ceb68ca..1542ea7 100644
>> --- a/drivers/mtd/nand/nand_base.c
>> +++ b/drivers/mtd/nand/nand_base.c
>> @@ -1101,6 +1101,110 @@ out:
>>   EXPORT_SYMBOL(nand_lock);
>>
>>   /**
>> + * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data
>> + * @buf: buffer to test
>> + * @len: buffer length
>> + * @bitflips_threshold:maximum number of bitflips
>> + *
>> + * Check if a buffer contains only 0xff, which means the underlying region
>> + * has been erased and is ready to be programmed.
>> + * The bitflips_threshold specify the maximum number of bitflips before
>> + * considering the region is not erased.
>> + * Note: The logic of this function has been extracted from the memweight
>> + * implementation, except that nand_check_erased_buf function exit before
>> + * testing the whole buffer if the number of bitflips exceed the
>> + * bitflips_threshold value.
>> + *
>> + * Returns a positive number of bitflips or -ERROR_CODE.
>> + */
>> +int nand_check_erased_buf(void *buf, int len, int bitflips_threshold)
>> +{
>> +	const unsigned char *bitmap = buf;
>> +	int bitflips = 0;
>> +	int weight;
>> +	int longs;
>> +
>> +	for (; len && ((unsigned long)bitmap) % sizeof(long);
>> +	     len--, bitmap++) {
>> +		weight = hweight8(*bitmap);
>> +
>> +		bitflips += sizeof(u8) - weight;

I think the above like should be

bitflips += sizeof(u8)*BITS_PER_BYTE - weight;


>> +		if (bitflips > bitflips_threshold)
>> +			return -EINVAL;
>> +	}
>> +
>> +
>> +	for (longs = len / sizeof(long); longs;
>> +	     longs--, bitmap += sizeof(long)) {
>> +		BUG_ON(longs >= INT_MAX / BITS_PER_LONG);
>> +		weight = hweight_long(*((unsigned long *)bitmap));
>> +
>> +		bitflips += sizeof(long) - weight;

as above:

bitflips += sizeof(long)*BITS_PER_BYTE - weight;

>> +		if (bitflips > bitflips_threshold)
>> +			return -EINVAL;
>> +	}
>> +
>> +	len %= sizeof(long);
>> +
>> +	for (; len > 0; len--, bitmap++) {
>> +		weight = hweight8(*bitmap);
>> +		bitflips += sizeof(u8) - weight;


as above:

bitflips += sizeof(u8)*BITS_PER_BYTE - weight;


>> +	}
>> +
>> +	return bitflips;
>> +}
>> +EXPORT_SYMBOL(nand_check_erased_buf);
>> +
>> +/**
>> + * nand_check_erased_ecc_chunk - check if an ECC chunk contains (almost) only
>> + *				 0xff data
>> + * @data: data buffer to test
>> + * @datalen: data length
>> + * @ecc: ECC buffer
>> + * @ecclen: ECC length
>> + * @extraoob: extra OOB buffer
>> + * @extraooblen: extra OOB length
>> + * @bitflips_threshold: maximum number of bitflips
>> + *
>> + * Check if a data buffer and its associated ECC and OOB data contains only
>> + * 0xff pattern, which means the underlying region has been erased and is
>> + * ready to be programmed.
>> + * The bitflips_threshold specify the maximum number of bitflips before
>> + * considering the region as not erased.
>> + *
>> + * Returns a positive number of bitflips or -ERROR_CODE.
>> + */
>> +int nand_check_erased_ecc_chunk(void *data, int datalen,
>> +				void *ecc, int ecclen,
>> +				void *extraoob, int extraooblen,
>> +				int bitflips_threshold)
>> +{
>> +	int bitflips = 0;
>> +	int ret;
>> +
>> +	ret = nand_check_erased_buf(data, datalen, bitflips_threshold);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	bitflips += ret;
>> +	bitflips_threshold -= ret;
>> +
>> +	ret = nand_check_erased_buf(ecc, ecclen, bitflips_threshold);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	bitflips += ret;
>> +	bitflips_threshold -= ret;
>> +
>> +	ret = nand_check_erased_buf(extraoob, extraooblen, bitflips_threshold);
>> +	if (ret < 0)
>> +		return ret;
>> +
>
> Forgot the memset operations here:
>
> 	memset(data, 0xff, datalen);
> 	memset(ecc, 0xff, ecclen);
> 	memset(extraoob, 0xff, extraooblen);
>
>> +	return bitflips + ret;
>> +}
>> +EXPORT_SYMBOL(nand_check_erased_ecc_chunk);
>> +
>> +/**
>>    * nand_read_page_raw - [INTERN] read raw page data without ecc
>>    * @mtd: mtd info structure
>>    * @chip: nand chip info structure
>> diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
>> index 272f429..ae06a07 100644
>> --- a/include/linux/mtd/nand.h
>> +++ b/include/linux/mtd/nand.h
>> @@ -1030,4 +1030,12 @@ struct nand_sdr_timings {
>>
>>   /* get timing characteristics from ONFI timing mode. */
>>   const struct nand_sdr_timings *onfi_async_timing_mode_to_sdr_timings(int mode);
>> +
>> +int nand_check_erased_buf(void *data, int datalen,
>> +			  int threshold);
>> +
>> +int nand_check_erased_ecc_chunk(void *data, int datalen,
>> +				void *ecc, int ecclen,
>> +				void *extraoob, int extraooblen,
>> +				int threshold);
>>   #endif /* __LINUX_MTD_NAND_H */
>
>
>
Boris Brezillon Aug. 4, 2015, 4:27 p.m. UTC | #7
Hi Andrea

Le 4 août 2015 17:42:43 GMT+02:00, Andrea Scian <rnd4@dave-tech.it> a écrit :
>
>>> +
>>> +		bitflips += sizeof(long) - weight;
>
>as above:
>
>bitflips += sizeof(long)*BITS_PER_BYTE - weight;


Indeed, or we could just replace sizeof(u8) by BITS_PER_BYTE and sizeof(long) by BITS_PER_LONG.

I'll fix that in the next version.

Note that I didn't test the series on a real platform and won't be able to do so for the next two weeks, so you might find other inconsistencies/bugs.

Thanks for testing it.

Best regards,

Boris
diff mbox

Patch

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index ceb68ca..1542ea7 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -1101,6 +1101,110 @@  out:
 EXPORT_SYMBOL(nand_lock);
 
 /**
+ * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data
+ * @buf: buffer to test
+ * @len: buffer length
+ * @bitflips_threshold:maximum number of bitflips
+ *
+ * Check if a buffer contains only 0xff, which means the underlying region
+ * has been erased and is ready to be programmed.
+ * The bitflips_threshold specify the maximum number of bitflips before
+ * considering the region is not erased.
+ * Note: The logic of this function has been extracted from the memweight
+ * implementation, except that nand_check_erased_buf function exit before
+ * testing the whole buffer if the number of bitflips exceed the
+ * bitflips_threshold value.
+ *
+ * Returns a positive number of bitflips or -ERROR_CODE.
+ */
+int nand_check_erased_buf(void *buf, int len, int bitflips_threshold)
+{
+	const unsigned char *bitmap = buf;
+	int bitflips = 0;
+	int weight;
+	int longs;
+
+	for (; len && ((unsigned long)bitmap) % sizeof(long);
+	     len--, bitmap++) {
+		weight = hweight8(*bitmap);
+
+		bitflips += sizeof(u8) - weight;
+		if (bitflips > bitflips_threshold)
+			return -EINVAL;
+	}
+
+
+	for (longs = len / sizeof(long); longs;
+	     longs--, bitmap += sizeof(long)) {
+		BUG_ON(longs >= INT_MAX / BITS_PER_LONG);
+		weight = hweight_long(*((unsigned long *)bitmap));
+
+		bitflips += sizeof(long) - weight;
+		if (bitflips > bitflips_threshold)
+			return -EINVAL;
+	}
+
+	len %= sizeof(long);
+
+	for (; len > 0; len--, bitmap++) {
+		weight = hweight8(*bitmap);
+		bitflips += sizeof(u8) - weight;
+	}
+
+	return bitflips;
+}
+EXPORT_SYMBOL(nand_check_erased_buf);
+
+/**
+ * nand_check_erased_ecc_chunk - check if an ECC chunk contains (almost) only
+ *				 0xff data
+ * @data: data buffer to test
+ * @datalen: data length
+ * @ecc: ECC buffer
+ * @ecclen: ECC length
+ * @extraoob: extra OOB buffer
+ * @extraooblen: extra OOB length
+ * @bitflips_threshold: maximum number of bitflips
+ *
+ * Check if a data buffer and its associated ECC and OOB data contains only
+ * 0xff pattern, which means the underlying region has been erased and is
+ * ready to be programmed.
+ * The bitflips_threshold specify the maximum number of bitflips before
+ * considering the region as not erased.
+ *
+ * Returns a positive number of bitflips or -ERROR_CODE.
+ */
+int nand_check_erased_ecc_chunk(void *data, int datalen,
+				void *ecc, int ecclen,
+				void *extraoob, int extraooblen,
+				int bitflips_threshold)
+{
+	int bitflips = 0;
+	int ret;
+
+	ret = nand_check_erased_buf(data, datalen, bitflips_threshold);
+	if (ret < 0)
+		return ret;
+
+	bitflips += ret;
+	bitflips_threshold -= ret;
+
+	ret = nand_check_erased_buf(ecc, ecclen, bitflips_threshold);
+	if (ret < 0)
+		return ret;
+
+	bitflips += ret;
+	bitflips_threshold -= ret;
+
+	ret = nand_check_erased_buf(extraoob, extraooblen, bitflips_threshold);
+	if (ret < 0)
+		return ret;
+
+	return bitflips + ret;
+}
+EXPORT_SYMBOL(nand_check_erased_ecc_chunk);
+
+/**
  * nand_read_page_raw - [INTERN] read raw page data without ecc
  * @mtd: mtd info structure
  * @chip: nand chip info structure
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 272f429..ae06a07 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -1030,4 +1030,12 @@  struct nand_sdr_timings {
 
 /* get timing characteristics from ONFI timing mode. */
 const struct nand_sdr_timings *onfi_async_timing_mode_to_sdr_timings(int mode);
+
+int nand_check_erased_buf(void *data, int datalen,
+			  int threshold);
+
+int nand_check_erased_ecc_chunk(void *data, int datalen,
+				void *ecc, int ecclen,
+				void *extraoob, int extraooblen,
+				int threshold);
 #endif /* __LINUX_MTD_NAND_H */