diff mbox

[v4,1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled.

Message ID 1396308537-16013-2-git-send-email-davidm@egauge.net
State Rejected
Headers show

Commit Message

David Mosberger-Tang March 31, 2014, 11:28 p.m. UTC
Signed-off-by: David Mosberger <davidm@egauge.net>
---
 drivers/mtd/nand/nand_base.c |   27 ++++++++++++++++++++++++---
 include/linux/mtd/nand.h     |    4 ++++
 2 files changed, 28 insertions(+), 3 deletions(-)

Comments

Brian Norris April 1, 2014, 6:39 a.m. UTC | #1
On Mon, Mar 31, 2014 at 05:28:53PM -0600, David Mosberger wrote:
> Signed-off-by: David Mosberger <davidm@egauge.net>
> ---
>  drivers/mtd/nand/nand_base.c |   27 ++++++++++++++++++++++++---
>  include/linux/mtd/nand.h     |    4 ++++
>  2 files changed, 28 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
> index 09fe1b1..f145f00 100644
> --- a/drivers/mtd/nand/nand_base.c
> +++ b/drivers/mtd/nand/nand_base.c
> @@ -3054,11 +3054,30 @@ static int nand_setup_read_retry_micron(struct mtd_info *mtd, int retry_mode)
>  			feature);
>  }
>  
> +static void

Does this really need to be on its own line? It doesn't match the style
of anything else.

> +nand_onfi_detect_micron_on_die_ecc(struct mtd_info *mtd,
> +				   struct nand_chip *chip)

I'm really not sure why the inconsistent style throughout nand_base on
using both an 'mtd' and a 'chip' parameter (we often assume that
'mtd->priv == chip'). If you need the mtd parameter, let's just pass it
instead of 'chip'.

> +{
> +	u8 features[ONFI_SUBFEATURE_PARAM_LEN];
> +
> +	if (chip->onfi_get_features(mtd, chip, ONFI_FEATURE_ADDR_ARRAY_OP_MODE,
> +				    features) < 0)
> +		return;
> +
> +	if (features[0] & ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC) {
> +		/*
> +		 * If the chip has on-die ECC enabled, we kind of have
> +		 * to do the same...
> +		 */

That's not really true at all, is it? We can simply disable the on-die
ECC and use the provided spare area with a different ECC scheme, can't
we? (e.g., software BCH; or an SoC's HW ECC) In fact, I think disabling
the on-die ECC is a more reasonable default; nand_base is used by many
drivers, most of which provide their own implementations, and many of
which would not be compatible with the implementations you provide. A
system should have to "opt in" somehow to enable this, I think.

Additionally, you need a way to inform the hardware driver that you're
using on-die ECC, so they can make the appropriate choices (to disable
their HW ECC, for instance).

BTW, what driver/controller/SoC are you running this with?

> +		pr_info("Using on-die ECC\n");
> +	}
> +}
> +
>  /*
>   * Configure chip properties from Micron vendor-specific ONFI table
>   */
> -static void nand_onfi_detect_micron(struct nand_chip *chip,
> -		struct nand_onfi_params *p)
> +static void nand_onfi_detect_micron(struct mtd_info *mtd,
> +		struct nand_chip *chip, struct nand_onfi_params *p)

Ditto on mtd + chip.

>  {
>  	struct nand_onfi_vendor_micron *micron = (void *)p->vendor;
>  
> @@ -3067,6 +3086,8 @@ static void nand_onfi_detect_micron(struct nand_chip *chip,
>  
>  	chip->read_retries = micron->read_retry_options;
>  	chip->setup_read_retry = nand_setup_read_retry_micron;
> +
> +	nand_onfi_detect_micron_on_die_ecc(mtd, chip);
>  }
>  
>  /*
> @@ -3168,7 +3189,7 @@ static int nand_flash_detect_onfi(struct mtd_info *mtd, struct nand_chip *chip,
>  	}
>  
>  	if (p->jedec_id == NAND_MFR_MICRON)
> -		nand_onfi_detect_micron(chip, p);
> +		nand_onfi_detect_micron(mtd, chip, p);
>  
>  	return 1;
>  }
> diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
> index 2bd6e4e..780ab58 100644
> --- a/include/linux/mtd/nand.h
> +++ b/include/linux/mtd/nand.h
> @@ -225,6 +225,10 @@ struct nand_chip;
>  /* Vendor-specific feature address (Micron) */
>  #define ONFI_FEATURE_ADDR_READ_RETRY	0x89
>  
> +/* Vendor-specific array operation mode (Micron) */
> +#define ONFI_FEATURE_ADDR_ARRAY_OP_MODE	0x90
> +#define ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC		0x08
> +
>  /* ONFI subfeature parameters length */
>  #define ONFI_SUBFEATURE_PARAM_LEN	4
>  

Brian
David Mosberger-Tang April 1, 2014, 3:26 p.m. UTC | #2
Brian,

On Tue, Apr 1, 2014 at 12:39 AM, Brian Norris
<computersforpeace@gmail.com> wrote:
> On Mon, Mar 31, 2014 at 05:28:53PM -0600, David Mosberger wrote:
>> Signed-off-by: David Mosberger <davidm@egauge.net>
>> ---
>>  drivers/mtd/nand/nand_base.c |   27 ++++++++++++++++++++++++---
>>  include/linux/mtd/nand.h     |    4 ++++
>>  2 files changed, 28 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
>> index 09fe1b1..f145f00 100644
>> --- a/drivers/mtd/nand/nand_base.c
>> +++ b/drivers/mtd/nand/nand_base.c
>> @@ -3054,11 +3054,30 @@ static int nand_setup_read_retry_micron(struct mtd_info *mtd, int retry_mode)
>>                       feature);
>>  }
>>
>> +static void
>
> Does this really need to be on its own line? It doesn't match the style
> of anything else.

Sure, I changed that.  It's not something that's flagged by chkpatch
and I'm working on enough different code-bases that I'm pretty
much blind to such things.

>> +nand_onfi_detect_micron_on_die_ecc(struct mtd_info *mtd,
>> +                                struct nand_chip *chip)
>
> I'm really not sure why the inconsistent style throughout nand_base on
> using both an 'mtd' and a 'chip' parameter (we often assume that
> 'mtd->priv == chip'). If you need the mtd parameter, let's just pass it
> instead of 'chip'.

Ah, yes, that makes sense.  I updated my code.

>> +{
>> +     u8 features[ONFI_SUBFEATURE_PARAM_LEN];
>> +
>> +     if (chip->onfi_get_features(mtd, chip, ONFI_FEATURE_ADDR_ARRAY_OP_MODE,
>> +                                 features) < 0)
>> +             return;
>> +
>> +     if (features[0] & ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC) {
>> +             /*
>> +              * If the chip has on-die ECC enabled, we kind of have
>> +              * to do the same...
>> +              */
>
> That's not really true at all, is it? We can simply disable the on-die
> ECC and use the provided spare area with a different ECC scheme, can't
> we? (e.g., software BCH; or an SoC's HW ECC) In fact, I think disabling
> the on-die ECC is a more reasonable default; nand_base is used by many
> drivers, most of which provide their own implementations, and many of
> which would not be compatible with the implementations you provide. A
> system should have to "opt in" somehow to enable this, I think.
>
> Additionally, you need a way to inform the hardware driver that you're
> using on-die ECC, so they can make the appropriate choices (to disable
> their HW ECC, for instance).
>
> BTW, what driver/controller/SoC are you running this with?

No, if you booted with on-die ECC enabled, presumably you have the
on-die ECC layout and you can't switch (sure, you can turn off ECC
but all hell will break lose when you actually try to read data with
the wrong ECC layout).

There seems to be this misconception that somehow on-die ECC would
be turned on "accidentially".  It is not.  Either a boot loader has to turn
it on or you have to pay Micron extra money (several dollars/chip, AFAIR)
to have them send you custom chips with on-die ECC enabled by default.

This is not to say that you may want a way to override it, but the patch
as it stands is safe.

  --david
pekon gupta April 2, 2014, 7:27 a.m. UTC | #3
Hi David,

>From: David Mosberger [mailto:davidm@egauge.net]
>>On Tue, Apr 1, 2014 at 12:39 AM, Brian Norris ><computersforpeace@gmail.com> wrote:
>>> On Mon, Mar 31, 2014 at 05:28:53PM -0600, David Mosberger wrote:
[...]

>>> +{
>>> +     u8 features[ONFI_SUBFEATURE_PARAM_LEN];
>>> +
>>> +     if (chip->onfi_get_features(mtd, chip, ONFI_FEATURE_ADDR_ARRAY_OP_MODE,
>>> +                                 features) < 0)
>>> +             return;
>>> +
>>> +     if (features[0] & ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC) {
>>> +             /*
>>> +              * If the chip has on-die ECC enabled, we kind of have
>>> +              * to do the same...
>>> +              */
>>
>> That's not really true at all, is it? We can simply disable the on-die
>> ECC and use the provided spare area with a different ECC scheme, can't
>> we? (e.g., software BCH; or an SoC's HW ECC) In fact, I think disabling
>> the on-die ECC is a more reasonable default; nand_base is used by many
>> drivers, most of which provide their own implementations, and many of
>> which would not be compatible with the implementations you provide. A
>> system should have to "opt in" somehow to enable this, I think.
>>
>> Additionally, you need a way to inform the hardware driver that you're
>> using on-die ECC, so they can make the appropriate choices (to disable
>> their HW ECC, for instance).
>>
>> BTW, what driver/controller/SoC are you running this with?
>
>No, if you booted with on-die ECC enabled, presumably you have the
>on-die ECC layout and you can't switch (sure, you can turn off ECC
>but all hell will break lose when you actually try to read data with
>the wrong ECC layout).
>
You have to consider multiple scenarios here (as Brian also suggested).

(1) If a system does _not_ boot from NAND, instead it enables NAND
only in kernel (like for rootfs). Then you need a mechanism to
enable/disable on-die ECC in the kernel, the same way boot-loader does.

(2) If a boot-loader, has enabled on-die ECC, but kernel driver supports
even higher ECC scheme, then you need a hook in driver to allow user
to 'disable' this feature.

>There seems to be this misconception that somehow on-die ECC would
>be turned on "accidentially".  It is not.  Either a boot loader has to turn
>it on or you have to pay Micron extra money (several dollars/chip, AFAIR)
>to have them send you custom chips with on-die ECC enabled by default.
>
>This is not to say that you may want a way to override it, but the patch
>as it stands is safe.
>
>  --david
>--
>eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768


with regards, pekon
David Mosberger-Tang April 2, 2014, 3:07 p.m. UTC | #4
Pekon,

On Wed, Apr 2, 2014 at 1:27 AM, Gupta, Pekon <pekon@ti.com> wrote:

> You have to consider multiple scenarios here (as Brian also suggested).
>
> (1) If a system does _not_ boot from NAND, instead it enables NAND
> only in kernel (like for rootfs). Then you need a mechanism to
> enable/disable on-die ECC in the kernel, the same way boot-loader does.

Why?  Like I said before, on-die ECC does not get turned on "accidentally".
Furthermore, it is always *safe* to use on-die ECC,  unlike the ECC provided
by the platform-specific drivers.

> (2) If a boot-loader, has enabled on-die ECC, but kernel driver supports
> even higher ECC scheme, then you need a hook in driver to allow user
> to 'disable' this feature.

Does that really happen in practice?

I'm much more worried about someone using too weak an ECC than wanting
to use a stronger than required ECC.  In fact, that's what got us
into this mess: we ended up using SWECC on a chip that required multi-bit
ECC (yes, shame on us that we didn't catch that, but stuff happens; obviously
we were not the only ones caught off-guard by this change).

  --david
Gerhard Sittig April 2, 2014, 4:50 p.m. UTC | #5
On Wed, 2014-04-02 at 09:07 -0600, David Mosberger-Tang wrote:
> 
> Pekon,
> 
> On Wed, Apr 2, 2014 at 1:27 AM, Gupta, Pekon <pekon@ti.com> wrote:
> 
> > You have to consider multiple scenarios here (as Brian also suggested).
> >
> > (1) If a system does _not_ boot from NAND, instead it enables NAND
> > only in kernel (like for rootfs). Then you need a mechanism to
> > enable/disable on-die ECC in the kernel, the same way boot-loader does.
> 
> Why?  Like I said before, on-die ECC does not get turned on "accidentally".
> Furthermore, it is always *safe* to use on-die ECC,  unlike the ECC provided
> by the platform-specific drivers.

I feel that the motivation has been given in previous messages.

You appear to assume that the complete system runtime is using
the same ECC method.  This just need not be a necessity, and need
not be the best idea either.

Any component in the startup sequence may choose any arbitrary
method to deal with the chips.  It's perfectly fine for a ROM
loader to enable the on-die-ECC which was off after POR.  It's
fine for a bootloader to find the chip in this state and keep
using this method.  And it's fine for an operating system to find
the chip in this state yet picking a different ECC method, and
thus disabling the on-die-ECC within the chip.

All that is required for data integrity is that those who work
with data do so by the method which was used to create that data.
Still you can have boot files in areas that use on-die-ECC, and
have a filesystem in an area that uses software or hardware ECC
different from the on-die-ECC method, using the OOB as they
please.

Just provide tools, don't encode policy.  Leave it to the users
what they want to do.  Don't assume that everybody is in the very
same situation which you are in, or shares your priorities.

> > (2) If a boot-loader, has enabled on-die ECC, but kernel driver supports
> > even higher ECC scheme, then you need a hook in driver to allow user
> > to 'disable' this feature.
> 
> Does that really happen in practice?

Simple answer: Yes.  The example was given, when hardware or
software support methods which are stronger than the chip's ECC,
or evenly strong yet preferrable for some other reason (like
speed, or existing support and portability or general
availability).


Please take a step back, take a deep breath, and try to see why
people are asking questions.  It's not that you are being
rejected or need to "defend" something.  It's the opposite that
we are trying to create something that scratches more than one
personal itch, and serves other people's needs as well as yours.
And please accept that thorough review isn't free either, yet
spending the work upfront is cheaper than maintaining things that
don't fit, or have stuff rot and cause problems.  Getting ignored
is worse than receiving feedback and help. :)  Try to see the
benefit of it, please.  People spend their time on communicating
with you, nobody's fighting (it's easier to turn around and do
something else, if emotions get in the way and hinder progress --
we are all busy after all).


virtually yours
Gerhard Sittig
David Mosberger-Tang April 2, 2014, 5:02 p.m. UTC | #6
Gerhard,

On Wed, Apr 2, 2014 at 10:50 AM, Gerhard Sittig <gsi@denx.de> wrote:

> Just provide tools, don't encode policy.

Hey, I'm all for separating mechanism and policy.
It's just not what exists in the NAND driver now.
AFAIK, ECC selection is *per chip* (not per filesystem)
and is hard-coded by a combination of config-options
and platform-specific drivers.  Isn't that true?

> Please take a step back, take a deep breath, and try to see why
> people are asking questions.  It's not that you are being
> rejected or need to "defend" something.  It's the opposite that
> we are trying to create something that scratches more than one
> personal itch, and serves other people's needs as well as yours.

Yes, and I'm just trying to understand where you're coming from.

  --david
Gerhard Sittig April 3, 2014, 7:10 a.m. UTC | #7
On Wed, 2014-04-02 at 11:02 -0600, David Mosberger wrote:
> 
> AFAIK, ECC selection is *per chip* (not per filesystem)
> and is hard-coded by a combination of config-options
> and platform-specific drivers.  Isn't that true?

Let's look at the boot stages:

- ROM loader code reads bootloader code
- bootloader code reads bootloader configuration, and boot files
  (kernel image, device tree)
- OS kernels read and write the filesystem

So each of those phases run at different times and operate on
different areas of the chip.

- the bootloader need not write nor modify ROM loader data
- the OS kernel need not write nor modify bootloader data or boot
  files

Which means that you can perfectly operate those individual areas
with different ECC methods, depending on the capabilities of the
software that is running at this moment, and your preferences or
having access to different levels of hardware support, the will
or capability/incapability to sync among those components, or to
adjust and update these individual boot phases, etc.

I really don't see that enabling on-die-ECC in one stage requires
all other stages to follow.  It's one apparent approach, but need
not be the only one.  That's what previous messages tried to say.

Does this explanation help answer your question?


virtually yours
Gerhard Sittig
David Mosberger-Tang April 3, 2014, 3:26 p.m. UTC | #8
[Resend in plain text, sorry...]

On Thu, Apr 3, 2014 at 9:24 AM, David Mosberger <davidm@egauge.net> wrote:
>
> Gerhard,
>
> On Thu, Apr 3, 2014 at 1:10 AM, Gerhard Sittig <gsi@denx.de> wrote:
>>
>> I really don't see that enabling on-die-ECC in one stage requires
>> all other stages to follow.  It's one apparent approach, but need
>> not be the only one.  That's what previous messages tried to say.
>
>
>>
>> Does this explanation help answer your question?
>
>
> Yes, that helps, thanks for taking the time to answer!
>
> Leaving on-die ECC on is certainly not the only way to do it,
> but I still think it's a reasonable way.
>
> I doubt it's reasonable to modify each and every platform-driver
> to be able to decide whether to use on-die ECC or the best
> ECC mode it can supply.
>
> Maybe it's possible to use the new(ish) ecc_strength_ds/ecc_step_ds
> to chose whether or not to keep on-die ECC on?  Say, if the driver's
> ECC is stronger, turn on-die ECC off, otherwise leave it on?
>
> Again, I think the present patch is safe and reasonable, but I'm certainly
> open to better suggestions.
>
>   --david
> --
> eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768
diff mbox

Patch

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 09fe1b1..f145f00 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -3054,11 +3054,30 @@  static int nand_setup_read_retry_micron(struct mtd_info *mtd, int retry_mode)
 			feature);
 }
 
+static void
+nand_onfi_detect_micron_on_die_ecc(struct mtd_info *mtd,
+				   struct nand_chip *chip)
+{
+	u8 features[ONFI_SUBFEATURE_PARAM_LEN];
+
+	if (chip->onfi_get_features(mtd, chip, ONFI_FEATURE_ADDR_ARRAY_OP_MODE,
+				    features) < 0)
+		return;
+
+	if (features[0] & ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC) {
+		/*
+		 * If the chip has on-die ECC enabled, we kind of have
+		 * to do the same...
+		 */
+		pr_info("Using on-die ECC\n");
+	}
+}
+
 /*
  * Configure chip properties from Micron vendor-specific ONFI table
  */
-static void nand_onfi_detect_micron(struct nand_chip *chip,
-		struct nand_onfi_params *p)
+static void nand_onfi_detect_micron(struct mtd_info *mtd,
+		struct nand_chip *chip, struct nand_onfi_params *p)
 {
 	struct nand_onfi_vendor_micron *micron = (void *)p->vendor;
 
@@ -3067,6 +3086,8 @@  static void nand_onfi_detect_micron(struct nand_chip *chip,
 
 	chip->read_retries = micron->read_retry_options;
 	chip->setup_read_retry = nand_setup_read_retry_micron;
+
+	nand_onfi_detect_micron_on_die_ecc(mtd, chip);
 }
 
 /*
@@ -3168,7 +3189,7 @@  static int nand_flash_detect_onfi(struct mtd_info *mtd, struct nand_chip *chip,
 	}
 
 	if (p->jedec_id == NAND_MFR_MICRON)
-		nand_onfi_detect_micron(chip, p);
+		nand_onfi_detect_micron(mtd, chip, p);
 
 	return 1;
 }
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 2bd6e4e..780ab58 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -225,6 +225,10 @@  struct nand_chip;
 /* Vendor-specific feature address (Micron) */
 #define ONFI_FEATURE_ADDR_READ_RETRY	0x89
 
+/* Vendor-specific array operation mode (Micron) */
+#define ONFI_FEATURE_ADDR_ARRAY_OP_MODE	0x90
+#define ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC		0x08
+
 /* ONFI subfeature parameters length */
 #define ONFI_SUBFEATURE_PARAM_LEN	4