diff mbox

[RFC] Handling of errors for AMD NOR (cfi_cmdset_0002) chips

Message ID 4D797E23.6070409@emcraft.com
State RFC
Headers show

Commit Message

Ilya Yanok March 11, 2011, 1:42 a.m. UTC
Hello,

current cfi_cmdset_0002.c code does not implement handling of error 
statuses reported by the chip during write/erase and just waits for a 
software timeout instead.

We think that proper handling of these conditions could help us to debug 
the issue we have with the NOR flash and UBIFS so we'd like to implement 
such handling.

I've tried to do this, please see my initial patch below. For now errors 
are just reported via pr_debug().

I'd like to hear any comments. Am I doing things right or not? Maybe I'm 
missing something?

Here is the patch:

  /*
   * Return true if the chip is ready.
   *
@@ -520,12 +592,17 @@ static struct mtd_info *cfi_amdstd_setup(struct 
mtd_info *mtd)
   */
  static int __xipram chip_ready(struct map_info *map, unsigned long addr)
  {
-       map_word d, t;
+       struct cfi_private *cfi = map->fldrv_priv;
+       map_word r1, r2, r3;
+       int i, res = 0;

-       d = map_read(map, addr);
-       t = map_read(map, addr);
+       r1 = map_read(map, addr);
+       r2 = map_read(map, addr);
+       r3 = map_read(map, addr);

-       return map_word_equal(map, d, t);
+       for (i = 0; i < cfi_interleave(cfi); i++)
+               res += check_chip(map, i, r1, r2, r3);
+       return !res;
  }

  /*

Thanks!

Regards, Ilya.

Comments

Norbert van Bolhuis March 11, 2011, 8:25 a.m. UTC | #1
On 03/11/11 02:42, Ilya Yanok wrote:
> Hello,
>
> current cfi_cmdset_0002.c code does not implement handling of error statuses reported by the chip during write/erase and just waits for a software timeout instead.
>
> We think that proper handling of these conditions could help us to debug the issue we have with the NOR flash and UBIFS so we'd like to implement such handling.
>
> I've tried to do this, please see my initial patch below. For now errors are just reported via pr_debug().
>
> I'd like to hear any comments. Am I doing things right or not? Maybe I'm missing something?
>


What issue you have with NOR flash and UBIFS ?

---
N. van Bolhuis.
Markus Niebel March 15, 2011, 8:01 a.m. UTC | #2
Hello Ilya,

what type of chips are causing your problems?

AFAIK in cfi_cmdset_0002 the use of max timouts from cfi query struct is 
missing (support in the per chip struct is prepared). This should be 
done first (I think).

Regarding the handling of status bits you can use MERGESTATUS / 
cfi_merge_status (from linux/mtd/cfi.h)

Would it be useful to send rough patches for 2.6.34 for the max timout?

Markus

-

Am 11.03.2011 02:42, schrieb Ilya Yanok:
> Hello,
>
> current cfi_cmdset_0002.c code does not implement handling of error
> statuses reported by the chip during write/erase and just waits for a
> software timeout instead.
>
> We think that proper handling of these conditions could help us to debug
> the issue we have with the NOR flash and UBIFS so we'd like to implement
> such handling.
>
> I've tried to do this, please see my initial patch below. For now errors
> are just reported via pr_debug().
>
> I'd like to hear any comments. Am I doing things right or not? Maybe I'm
> missing something?
>
> Here is the patch:
>
> diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c
> b/drivers/mtd/chips/cfi_cmdset_0002.c
> index 156aaa6..804cfc9 100644
> --- a/drivers/mtd/chips/cfi_cmdset_0002.c
> +++ b/drivers/mtd/chips/cfi_cmdset_0002.c
> @@ -507,6 +507,78 @@ static struct mtd_info *cfi_amdstd_setup(struct
> mtd_info *mtd)
> return NULL;
> }
>
> +static u32 get_chip_status(struct map_info *map, map_word val, int n)
> +{
> + struct cfi_private *cfi = map->fldrv_priv;
> + int wordwidth, words_per_bus, chip_mode, chips_per_word;
> + unsigned long onestat;
> + u32 res;
> +
> + BUG_ON(n >= cfi_interleave(cfi));
> +
> + if (map_bankwidth_is_large(map)) {
> + wordwidth = sizeof(unsigned long);
> + words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. normally 1
> + } else {
> + wordwidth = map_bankwidth(map);
> + words_per_bus = 1;
> + }
> +
> + chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
> + chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map);
> +
> + onestat = val.x[n/chips_per_word];
> + onestat >>= chip_mode * 8 * (n % chips_per_word);
> +
> + res = (u32) onestat;
> +
> + /* Last, determine what the bit-pattern should be for a single
> + device, according to chip mode and endianness... */
> + switch (chip_mode) {
> + case 1:
> + break;
> + case 2:
> + res = cfi16_to_cpu(res);
> + break;
> + case 4:
> + res = cfi32_to_cpu(res);
> + break;
> + default: BUG();
> + }
> + return res;
> +}
> +
> +/*
> + * Chip statuses
> + */
> +#define CHIP_READY 0
> +#define CHIP_BUSY 1
> +#define CHIP_TIMEOUT 2
> +#define CHIP_ABORT 3
> +
> +static int check_chip(struct map_info *map, int n, map_word read_1,
> + map_word read_2, map_word read_3)
> +{
> + u32 r1 = get_chip_status(map, read_1, n);
> + u32 r2 = get_chip_status(map, read_2, n);
> + u32 r3 = get_chip_status(map, read_3, n);
> +
> + if ((r1 == r2) && (r2 == r3))
> + return CHIP_READY;
> +
> + if (r1 & (1 << 1)) {
> + pr_debug("Write abort on chip %d\n", n);
> + return CHIP_ABORT;
> + }
> +
> + if (r1 & (1 << 5)) {
> + pr_debug("Timeout on chip %d\n",n);
> + return CHIP_TIMEOUT;
> + }
> +
> + return CHIP_BUSY;
> +}
> +
> /*
> * Return true if the chip is ready.
> *
> @@ -520,12 +592,17 @@ static struct mtd_info *cfi_amdstd_setup(struct
> mtd_info *mtd)
> */
> static int __xipram chip_ready(struct map_info *map, unsigned long addr)
> {
> - map_word d, t;
> + struct cfi_private *cfi = map->fldrv_priv;
> + map_word r1, r2, r3;
> + int i, res = 0;
>
> - d = map_read(map, addr);
> - t = map_read(map, addr);
> + r1 = map_read(map, addr);
> + r2 = map_read(map, addr);
> + r3 = map_read(map, addr);
>
> - return map_word_equal(map, d, t);
> + for (i = 0; i < cfi_interleave(cfi); i++)
> + res += check_chip(map, i, r1, r2, r3);
> + return !res;
> }
>
> /*
>
> Thanks!
>
> Regards, Ilya.
>
> ______________________________________________________
> Linux MTD discussion mailing list
> http://lists.infradead.org/mailman/listinfo/linux-mtd/
Detlev Zundel March 23, 2011, 3:47 p.m. UTC | #3
Hi Norbert,

> What issue you have with NOR flash and UBIFS ?

We're still working on the issue initially reported by Anatolij here:

http://thread.gmane.org/gmane.linux.drivers.mtd/33832

Cheers
  Detlev

--
DENX Software Engineering GmbH,      MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich,  Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-40 Fax: (+49)-8142-66989-80 Email: dzu@denx.de
Detlev Zundel March 23, 2011, 4:16 p.m. UTC | #4
Hi Markus,

> what type of chips are causing your problems?

Two 16-bit Spansion S29GL512P interleaved into 32-bit bus width.

> AFAIK in cfi_cmdset_0002 the use of max timouts from cfi query struct
> is missing (support in the per chip struct is prepared). This should
> be done first (I think).

I don't think we have this problem.  As can be seen by the link provided
in my other mail, we are suffering from recovery problems after a reset,
everything else looks normal.

> Regarding the handling of status bits you can use MERGESTATUS /
> cfi_merge_status (from linux/mtd/cfi.h)
>
> Would it be useful to send rough patches for 2.6.34 for the max timout?

It is always uselful to post patches that solve problems for some of us,
so yes please! ;)

Best wishes
  Detlev Zundel

--
DENX Software Engineering GmbH,      MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich,  Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-40 Fax: (+49)-8142-66989-80 Email: dzu@denx.de
Artem Bityutskiy March 23, 2011, 5:33 p.m. UTC | #5
On Wed, 2011-03-23 at 16:47 +0100, Detlev Zundel wrote:
> Hi Norbert,
> 
> > What issue you have with NOR flash and UBIFS ?
> 
> We're still working on the issue initially reported by Anatolij here:
> 
> http://thread.gmane.org/gmane.linux.drivers.mtd/33832

Note, I've started implementing the power cut emulation in UBI to
emulate power cuts and unstable bits. It is work in progress, I keep it
in the 'devel' branch of ubi-2.6.git tree.

The plan is then to implement user-space test, probably improving the
integck test, and then start fixing the issues.
Ilya Yanok March 23, 2011, 6:27 p.m. UTC | #6
Hello Markus,

Markus Niebel <list-09_linux_mtd <at> tqsc.de> writes:
> what type of chips are causing your problems?

As Detlev has already wrote they are 16-bit Spansion S29GL512P.

> AFAIK in cfi_cmdset_0002 the use of max timouts from cfi query struct is 
> missing (support in the per chip struct is prepared). This should be 
> done first (I think).

Hm, I have to admit I'm not really a flash expert. Could you please give me some
pointer?

> Regarding the handling of status bits you can use MERGESTATUS / 
> cfi_merge_status (from linux/mtd/cfi.h)

Well, most of my code is actually stolen from cfi_merge_status implementation.
Still, I think it's incorrect to use MERGESTATUS directly: we need to filter
that have already finished (with no toggling bits), otherwise we can take the
data byte as a status.

> Would it be useful to send rough patches for 2.6.34 for the max timout?

Yes, please do post them. Any information would be useful.

Regards, Ilya.
Norbert van Bolhuis March 24, 2011, 7:35 a.m. UTC | #7
On 03/23/11 16:47, Detlev Zundel wrote:
> Hi Norbert,
>
>> What issue you have with NOR flash and UBIFS ?
>
> We're still working on the issue initially reported by Anatolij here:
>
> http://thread.gmane.org/gmane.linux.drivers.mtd/33832
>
> Cheers
>    Detlev
>


ok, thanks.

But the unstable bit problem is different from software timeout
related issues regarding UBI and NOR flash the OP mentioned.
It is anyway good to hear work is in progress here.

I asked about the software timeout issue because I've seen false
timeouts with powerpc + CONFIG_NO_HZ a while ago.
In my case the low-level do_write_buffer (cfi_cmdset_0002.c) timed
out too early. See
http://lkml.org/lkml/2009/9/3/84

---
N. van Bolhuis.
Alexander Stein March 24, 2011, 7:45 a.m. UTC | #8
Hello,

On Friday 11 March 2011, 02:42:59 Ilya Yanok wrote:
> current cfi_cmdset_0002.c code does not implement handling of error
> statuses reported by the chip during write/erase and just waits for a
> software timeout instead.
> 
> We think that proper handling of these conditions could help us to debug
> the issue we have with the NOR flash and UBIFS so we'd like to implement
> such handling.
> 
> I've tried to do this, please see my initial patch below. For now errors
> are just reported via pr_debug().
> 
> I'd like to hear any comments. Am I doing things right or not? Maybe I'm
> missing something?

I'm not that much into these cfi commands, but does this status check address 
immediate finished flash erases?
Let me explain: We have a NOR-Flash partition used for UBIFS. We noticed 
ubiformat takes about 11 minutes to format it (~100MiB). In u-boot ubi create 
takes about 2 _seconds_ iff the partition has been erased before.
If the flash hasn't been erased before it takes about the same time.
It seems u-boot can detect if the flash is already ready after invoking a 
erase command on an empty sector while linux doesn't.
I once checked the erase/format time for 1 sector in ubiformat:
non-empty sector: ~1s
empty-sector:    0.5s

Regards,
Alexander
Artem Bityutskiy March 24, 2011, 7:55 a.m. UTC | #9
On Thu, 2011-03-24 at 08:45 +0100, Alexander Stein wrote:
> Hello,
> 
> On Friday 11 March 2011, 02:42:59 Ilya Yanok wrote:
> > current cfi_cmdset_0002.c code does not implement handling of error
> > statuses reported by the chip during write/erase and just waits for a
> > software timeout instead.
> > 
> > We think that proper handling of these conditions could help us to debug
> > the issue we have with the NOR flash and UBIFS so we'd like to implement
> > such handling.
> > 
> > I've tried to do this, please see my initial patch below. For now errors
> > are just reported via pr_debug().
> > 
> > I'd like to hear any comments. Am I doing things right or not? Maybe I'm
> > missing something?
> 
> I'm not that much into these cfi commands, but does this status check address 
> immediate finished flash erases?
> Let me explain: We have a NOR-Flash partition used for UBIFS. We noticed 
> ubiformat takes about 11 minutes to format it (~100MiB). In u-boot ubi create 
> takes about 2 _seconds_ iff the partition has been erased before.
> If the flash hasn't been erased before it takes about the same time.
> It seems u-boot can detect if the flash is already ready after invoking a 
> erase command on an empty sector while linux doesn't.
> I once checked the erase/format time for 1 sector in ubiformat:
> non-empty sector: ~1s
> empty-sector:    0.5s

That's just question of improving ubiformat a bit.
Detlev Zundel March 24, 2011, 9:27 a.m. UTC | #10
Hi Norbert,

> On 03/23/11 16:47, Detlev Zundel wrote:
>> Hi Norbert,
>>
>>> What issue you have with NOR flash and UBIFS ?
>>
>> We're still working on the issue initially reported by Anatolij here:
>>
>> http://thread.gmane.org/gmane.linux.drivers.mtd/33832
>>
>> Cheers
>>    Detlev
>>
>
>
> ok, thanks.
>
> But the unstable bit problem is different from software timeout
> related issues regarding UBI and NOR flash the OP mentioned.
> It is anyway good to hear work is in progress here.

It is never good to ignore errors reported by the hardware if you are
looking for problems in higher software layers.  That's why we decided
to implement such error checks which can then be helpful in our further
testing.  We are hoping that we can better differentiate between
software and hardware problems this way.

> I asked about the software timeout issue because I've seen false
> timeouts with powerpc + CONFIG_NO_HZ a while ago.
> In my case the low-level do_write_buffer (cfi_cmdset_0002.c) timed
> out too early. See
> http://lkml.org/lkml/2009/9/3/84

Ok, I see, thanks for the pointer.  The problems you experienced then
were unneccessary software timeouts, but there was no data corruption,
correct?

Best wishes
  Detlev Zundel

--
DENX Software Engineering GmbH,      MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich,  Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-40 Fax: (+49)-8142-66989-80 Email: dzu@denx.de
Norbert van Bolhuis March 24, 2011, 10:48 a.m. UTC | #11
On 03/24/11 10:27, Detlev Zundel wrote:
...
>> But the unstable bit problem is different from software timeout
>> related issues regarding UBI and NOR flash the OP mentioned.
>> It is anyway good to hear work is in progress here.
>
> It is never good to ignore errors reported by the hardware if you are
> looking for problems in higher software layers.  That's why we decided
> to implement such error checks which can then be helpful in our further
> testing.  We are hoping that we can better differentiate between
> software and hardware problems this way.
>

ok, I see.

>> I asked about the software timeout issue because I've seen false
>> timeouts with powerpc + CONFIG_NO_HZ a while ago.
>> In my case the low-level do_write_buffer (cfi_cmdset_0002.c) timed
>> out too early. See
>> http://lkml.org/lkml/2009/9/3/84
>
> Ok, I see, thanks for the pointer.  The problems you experienced then
> were unneccessary software timeouts, but there was no data corruption,
> correct?
>

yes that is correct.
It happened only when system was 'stressed' a bit and it caused false
software timeouts which made UBIFS remount the fs in read-only mode.

---
N. van Bolhuis.
diff mbox

Patch

diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c 
b/drivers/mtd/chips/cfi_cmdset_0002.c
index 156aaa6..804cfc9 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -507,6 +507,78 @@  static struct mtd_info *cfi_amdstd_setup(struct 
mtd_info *mtd)
         return NULL;
  }

+static u32 get_chip_status(struct map_info *map, map_word val, int n)
+{
+       struct cfi_private *cfi = map->fldrv_priv;
+       int wordwidth, words_per_bus, chip_mode, chips_per_word;
+       unsigned long onestat;
+       u32 res;
+
+       BUG_ON(n >= cfi_interleave(cfi));
+
+       if (map_bankwidth_is_large(map)) {
+               wordwidth = sizeof(unsigned long);
+               words_per_bus = (map_bankwidth(map)) / wordwidth; // 
i.e. normally 1
+       } else {
+               wordwidth = map_bankwidth(map);
+               words_per_bus = 1;
+       }
+
+       chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
+       chips_per_word = wordwidth * cfi_interleave(cfi) / 
map_bankwidth(map);
+
+       onestat = val.x[n/chips_per_word];
+       onestat >>= chip_mode * 8 * (n % chips_per_word);
+
+       res = (u32) onestat;
+
+       /* Last, determine what the bit-pattern should be for a single
+          device, according to chip mode and endianness... */
+       switch (chip_mode) {
+       case 1:
+               break;
+       case 2:
+               res = cfi16_to_cpu(res);
+               break;
+       case 4:
+               res = cfi32_to_cpu(res);
+               break;
+       default: BUG();
+       }
+       return res;
+}
+
+/*
+ * Chip statuses
+ */
+#define CHIP_READY     0
+#define CHIP_BUSY      1
+#define CHIP_TIMEOUT   2
+#define CHIP_ABORT     3
+
+static int check_chip(struct map_info *map, int n, map_word read_1,
+               map_word read_2, map_word read_3)
+{
+       u32 r1 = get_chip_status(map, read_1, n);
+       u32 r2 = get_chip_status(map, read_2, n);
+       u32 r3 = get_chip_status(map, read_3, n);
+
+       if ((r1 == r2) && (r2 == r3))
+               return CHIP_READY;
+
+       if (r1 & (1 << 1)) {
+               pr_debug("Write abort on chip %d\n", n);
+               return CHIP_ABORT;
+       }
+
+       if (r1 & (1 << 5)) {
+               pr_debug("Timeout on chip %d\n",n);
+               return CHIP_TIMEOUT;
+       }
+
+       return CHIP_BUSY;
+}
+