Patchwork UBIFS failure & stable page writes

login
register
mail settings
Submitter Adrian Hunter
Date June 12, 2013, 7:58 a.m.
Message ID <51B82A1D.30009@intel.com>
Download mbox | patch
Permalink /patch/250684/
State New
Headers show

Comments

Adrian Hunter - June 12, 2013, 7:58 a.m.
This shows that the orphan area has recorded the orphaning of inode 0 (non-existent!) and inode 1 (root inode!!!).

A sticking plaster solution is to prevent orphans-processing from deleting the root inode e.g.



On 11/06/13 15:16, Prins Anton (ST-CO/ENG1.1) wrote:
> Hi Artem,
> 
> We got logging inside the recovery procedure to print the 'orphan LEB' information.
> I wonder if the LEB9 moves yes/no? So is it safe to use 'dd' with an offset of 9*LEB?
> (Difficulty is we have no file system, so this was the first option...)
> 
> You see flood of message's; but I suppose the first part is the information required!
> If not I have to make some solution to go for 'dd'.
> 
> Thanks in advance,
> 
> Anton
> 
> 
> 
> leb9dump-filtered.log
> 
> 
> UBIFS: parse sync
> UBIFS: recovery needed
> ubi_leb_read datafs 9
> 00000000: 31 18 10 06 1a c9 4a be 09 ca 5d 00 00 00 00 00 28 00 00 00 0b 00 00 00 60 05 00 00 00 00 00 80  1.....J...].....(.......`.......
> 00000020: ea 88 0c 00 00 00 00 00 31 18 10 06 74 7c 20 ed 00 00 00 00 00 00 00 00 1c 00 00 00 05 00 00 00  ........1...t| .................
> 00000040: bc 0f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................
> 00000060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................


struct ubifs_ch {
	__le32 magic;		06101831	06101831
	__le32 crc;		be4ac91a	ed207c74
	__le64 sqnum;		5dca09		0
	__le32 len;		28		1c
	__u8 node_type;		b=orph		5=pad
	__u8 group_type;	0		0
	__u8 padding[2];	0		0
} __packed;

struct ubifs_orph_node {
	struct ubifs_ch ch;
	__le64 cmt_no;		60 05 00 00 00 00 00 80
	__le64 inos[];		0c88ea
				
} __packed;

struct ubifs_pad_node {
	struct ubifs_ch ch;
	__le32 pad_len;		0fbc
} __packed;


<snip>

> 00001000: 31 18 10 06 33 f1 d9 f4 0c 80 6e 00 00 00 00 00 28 00 00 00 0b 00 00 00 73 05 00 00 00 00 00 80  1...3.....n.....(.......s.......
> 00001020: 9f ea 0e 00 00 00 00 00 31 18 10 06 74 7c 20 ed 00 00 00 00 00 00 00 00 1c 00 00 00 05 00 00 00  ........1...t| .................
> 00001040: bc 0f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................
> 00001060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................

	__le32 len;		28
	__le64 cmt_no;		73 05 00 00 00 00 00 80
	__le64 inos[];		0eea9f

<snip>

> 00002000: 31 18 10 06 c0 79 a0 c5 35 65 6f 00 00 00 00 00 28 00 00 00 0b 00 00 00 74 05 00 00 00 00 00 80  1....y..5eo.....(.......t.......
> 00002020: 4c 0b 0f 00 00 00 00 00 31 18 10 06 74 7c 20 ed 00 00 00 00 00 00 00 00 1c 00 00 00 05 00 00 00  L.......1...t| .................
> 00002040: bc 0f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................
> 00002060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................

	__le32 len;		28
	__le64 cmt_no;		74 05 00 00 00 00 00 80
	__le64 inos[];		0f0b4c

<snip>

> 00003000: 31 18 10 06 f8 40 ed 34 89 2f 71 00 00 00 00 00 28 00 00 00 0b 00 00 00 76 05 00 00 00 00 00 80  1....@.4./q.....(.......v.......
> 00003020: 00 00 00 00 00 00 00 00 31 18 10 06 74 7c 20 ed 00 00 00 00 00 00 00 00 1c 00 00 00 05 00 00 00  ........1...t| .................
> 00003040: bc 0f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................
> 00003060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................

	__le32 len;		28
	__le64 cmt_no;		76 05 00 00 00 00 00 80
	__le64 inos[];		0				<----- !!!!!

<snip>

> 00004000: 31 18 10 06 43 37 1d 7e 00 73 77 00 00 00 00 00 28 00 00 00 0b 00 00 00 7d 05 00 00 00 00 00 80  1...C7.~.sw.....(.......}.......
> 00004020: 01 00 00 00 00 00 00 00 31 18 10 06 74 7c 20 ed 00 00 00 00 00 00 00 00 1c 00 00 00 05 00 00 00  ........1...t| .................
> 00004040: bc 0f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................
> 00004060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................

	__le32 len;		28
	__le64 cmt_no;		7d 05 00 00 00 00 00 80
	__le64 inos[];		1				<----- !!!!!

<snip>

> 00005000: 31 18 10 06 86 67 56 c0 53 c8 7d 00 00 00 00 00 28 00 00 00 0b 00 00 00 84 05 00 00 00 00 00 80  1....gV.S.}.....(...............
> 00005020: 8a 18 11 00 00 00 00 00 31 18 10 06 74 7c 20 ed 00 00 00 00 00 00 00 00 1c 00 00 00 05 00 00 00  ........1...t| .................
> 00005040: bc 0f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................
> 00005060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................................

	__le32 len;		28
	__le64 cmt_no;		84 05 00 00 00 00 00 80
	__le64 inos[];		11188a

<snip>

> 00006000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff  ................................

<snip>

> UBIFS: recovery completed
> UBIFS: mounted UBI device 1, volume 1, name "datafs"
> UBIFS: file system size:   640475136 bytes (625464 KiB, 610 MiB, 1241 LEBs)
> UBIFS: journal size:       10452992 bytes (10208 KiB, 9 MiB, 21 LEBs)
> UBIFS: media format:       w4/r0 (latest is w4/r0)
> UBIFS: default compressor: lzo
> UBIFS: reserved for root:  0 bytes (0 KiB)
> UBIFS error (pid 1): ubifs_iget: failed to read inode 1, error -2
> ubifs_fill_super couldn't read UBIFS_ROOT_INO
>
Prins Anton (ST-CO/ENG1.1) - June 12, 2013, 11:13 a.m.
Does it make sense to use chk_orphans?

I suppose I have to do: $ echo "1" > chk_orphans

Or am I wrong?

Met vriendelijke groeten | Best Regards, 
Anton Prins
Prins Anton (ST-CO/ENG1.1) - June 12, 2013, 11:57 a.m.
Ok clear!

Regarding mount debug: I suppose I only will see this on the 'first' boot after the problem is in the persistence storage?
(Up to now I'm not able to signal the point-of-failure... I'm only looking to an (possibly) after recovery situation).

Or makes this sense to do this mount debug on the existing failing device?

FYI: 
- with 100 devices power cycling every 5 minutes for a weekend we DON'T see a problem.
- with 100 devices powered on for a weekend and after that give a single power-cycle shows maybe 1 or 2 failing devices (but sometimes '0').

So a main problem in analyzing is the 'moment of failure' and 'reproduction (rate)'.

Met vriendelijke groeten | Best Regards, 
Anton Prins

Met vriendelijke groeten | Best Regards, 
Anton Prins


-----Original Message-----
From: Adrian Hunter [mailto:adrian.hunter@intel.com] 
Sent: woensdag 12 juni 2013 14:00
To: Prins Anton (ST-CO/ENG1.1)
Cc: dedekind1@gmail.com; linux-mtd@lists.infradead.org
Subject: Re: UBIFS failure & stable page writes

On 12/06/13 14:13, Prins Anton (ST-CO/ENG1.1) wrote:
> Does it make sense to use chk_orphans?

That checks for missing orphans but the problem is the opposite: the
presence of orphans that should not be there.

>
> I suppose I have to do: $ echo "1" > chk_orphans
>
> Or am I wrong?
>
> Met vriendelijke groeten | Best Regards, 
> Anton Prins
>
Adrian Hunter - June 12, 2013, noon
On 12/06/13 14:13, Prins Anton (ST-CO/ENG1.1) wrote:
> Does it make sense to use chk_orphans?

That checks for missing orphans but the problem is the opposite: the
presence of orphans that should not be there.

>
> I suppose I have to do: $ echo "1" > chk_orphans
>
> Or am I wrong?
>
> Met vriendelijke groeten | Best Regards, 
> Anton Prins
>
Adrian Hunter - June 12, 2013, 1:09 p.m.
Can you tar and send fs/ubifs directory?

On 12/06/13 14:57, Prins Anton (ST-CO/ENG1.1) wrote:
> Ok clear!
>
> Regarding mount debug: I suppose I only will see this on the 'first' boot after the problem is in the persistence storage?
> (Up to now I'm not able to signal the point-of-failure... I'm only looking to an (possibly) after recovery situation).
>
> Or makes this sense to do this mount debug on the existing failing device?
>
> FYI: 
> - with 100 devices power cycling every 5 minutes for a weekend we DON'T see a problem.
> - with 100 devices powered on for a weekend and after that give a single power-cycle shows maybe 1 or 2 failing devices (but sometimes '0').
>
> So a main problem in analyzing is the 'moment of failure' and 'reproduction (rate)'.
>
> Met vriendelijke groeten | Best Regards, 
> Anton Prins
>
> Met vriendelijke groeten | Best Regards, 
> Anton Prins
>
>
> -----Original Message-----
> From: Adrian Hunter [mailto:adrian.hunter@intel.com] 
> Sent: woensdag 12 juni 2013 14:00
> To: Prins Anton (ST-CO/ENG1.1)
> Cc: dedekind1@gmail.com; linux-mtd@lists.infradead.org
> Subject: Re: UBIFS failure & stable page writes
>
> On 12/06/13 14:13, Prins Anton (ST-CO/ENG1.1) wrote:
>> Does it make sense to use chk_orphans?
> That checks for missing orphans but the problem is the opposite: the
> presence of orphans that should not be there.
>
>> I suppose I have to do: $ echo "1" > chk_orphans
>>
>> Or am I wrong?
>>
>> Met vriendelijke groeten | Best Regards, 
>> Anton Prins
>>
Prins Anton (ST-CO/ENG1.1) - June 12, 2013, 1:57 p.m.
Of course

Met vriendelijke groeten | Best Regards, 
Anton Prins

Bosch Security Systems BV, 
Conference Systems (ST-CO/ENG1.1) 
Torenallee 49
5617 BA  Eindhoven
The Netherlands 
www.boschsecurity.com 
T. +31 (0)40 2577077
anton.prins@nl.bosch.com


-----Original Message-----
From: Adrian Hunter [mailto:adrian.hunter@intel.com] 

Sent: woensdag 12 juni 2013 15:10
To: Prins Anton (ST-CO/ENG1.1)
Cc: dedekind1@gmail.com; linux-mtd@lists.infradead.org
Subject: Re: UBIFS failure & stable page writes

Can you tar and send fs/ubifs directory?

On 12/06/13 14:57, Prins Anton (ST-CO/ENG1.1) wrote:
> Ok clear!

>

> Regarding mount debug: I suppose I only will see this on the 'first' boot after the problem is in the persistence storage?

> (Up to now I'm not able to signal the point-of-failure... I'm only looking to an (possibly) after recovery situation).

>

> Or makes this sense to do this mount debug on the existing failing device?

>

> FYI: 

> - with 100 devices power cycling every 5 minutes for a weekend we DON'T see a problem.

> - with 100 devices powered on for a weekend and after that give a single power-cycle shows maybe 1 or 2 failing devices (but sometimes '0').

>

> So a main problem in analyzing is the 'moment of failure' and 'reproduction (rate)'.

>

> Met vriendelijke groeten | Best Regards, 

> Anton Prins

>

> Met vriendelijke groeten | Best Regards, 

> Anton Prins

>

>

> -----Original Message-----

> From: Adrian Hunter [mailto:adrian.hunter@intel.com] 

> Sent: woensdag 12 juni 2013 14:00

> To: Prins Anton (ST-CO/ENG1.1)

> Cc: dedekind1@gmail.com; linux-mtd@lists.infradead.org

> Subject: Re: UBIFS failure & stable page writes

>

> On 12/06/13 14:13, Prins Anton (ST-CO/ENG1.1) wrote:

>> Does it make sense to use chk_orphans?

> That checks for missing orphans but the problem is the opposite: the

> presence of orphans that should not be there.

>

>> I suppose I have to do: $ echo "1" > chk_orphans

>>

>> Or am I wrong?

>>

>> Met vriendelijke groeten | Best Regards, 

>> Anton Prins

>>
Mats Kärrman - June 13, 2013, 10:54 a.m.
Hi!

Joining in from thread "UBIFS Orphaned inode 1" since it's the same problem.
This is an orphans block from one of my crashed units:

=== Orphan nodes (LEB 9) ===

$ dd if=/dev/ubi0_0 bs=1 count=130944 skip=1178496 |hd
00000000  31 18 10 06 a9 ec d2 ed  c8 f6 6a 01 00 00 00 00  |1.........j.....|
00000010  30 00 00 00 0b 00 00 00  a4 06 00 00 00 00 00 80  |0...............|
00000020  14 72 12 00 00 00 00 00  01 00 00 00 00 00 00 00  |.r..............|
00000030  ff ff ff ff ff ff ff ff  ff ff ff ff ff ff ff ff  |................|
*
0001ff80

Decoded:
31 18 10 06 - UBIFS_NODE_MAGIC
a9 ec d2 ed - CRC
c8 f6 6a 01 00 00 00 00 - Sequence number
30 00 00 00 - node length = 48 bytes
0b - node type = Orphan node
00 - group type = Not part of group
00 00 - padding
a4 06 00 00 00 00 00 80 - commit number = 0x6a4, last node flag = set
14 72 12 00 00 00 00 00 - inode num = 0x127214
01 00 00 00 00 00 00 00 - inode num = 1 !!!

I have v2.6.35 with a lot of patches from Artem's maintenance tree and some more from the later developments.
I diffed my fs/ubifs to the files from Antion Prins and they are almost identical, mostly debug print changes and some small things that I think are unrelated.

I have not been able to reproduce the error but haven't got the resources to set up 100 test units. So far a hand-full of units has crashed in the field.

BR // Mats
Prins Anton (ST-CO/ENG1.1) - June 13, 2013, 7:24 p.m.
Hi Mats,

Some questions to get more feeling with our problem:

1. Does your application add/remove files (delete/unlink?)
2. Does your kernel run on a single or multi-core machine?
3. Do you experience it with an ungraceful shutdown or also graceful?

The statistics we see is that rebooting every 5 minutes will not give higher occurrence.
It seems that device turned on for a longer time have more chance to fail; but as said not predictable.
Mats Kärrman - June 15, 2013, 10:45 p.m.
Hi Anton,

1. Yes, there are files added, removed and renamed.
2. Single core, 32-bit PowerPC e300c4 in BE mode, CONFIG_PREEMPT=y.
3. Never tried a graceful one ;) Normal procedure is shutting down application + sync + power off to be as quick as possible. However, as other people are responsible for those units that experienced this problem I can't be sure that no-one used the shutdown command but it's unlikely.

BR // Mats
Prins Anton (ST-CO/ENG1.1) - June 17, 2013, 11:20 a.m.
Thanks Mats for your information; hopefully the 'double free' patches will solve something!

Over here we did run a test over the weekend with 70 devices and the logging as mentioned in previous messages.
(without the 'double free' patches mentioned earlier, this to be able to point to a root-cause). 
Next to the logging we did binary analyze LEB9.

The results is that the 70 devices in total had 88 writes to the orphan area of node 0.. (ranging from 0 to 5 node 0 writes on a single device).
We also analyzed the 70 LEB9s which gave the same total of 88 node 0's inside them.

These writes of node 0 did not turn in a reboot-failure or any other problem; 
So I expect a second scenario where node 1 is written, but not yet seen in any logging up to now!

Adrian, when will the orphan area be 'cleaned'?
May I analyze it after reboot/recovery as well?

Now we know about our logging of 'node 0'; we will apply the patches and check the log-results in this case.
A subset of our devices; I will keep in the old-scenario to test for node-1 logging (and reboot failures)

Adrian, could there be a way to trigger the problem that the 'double free' patch solves, more often?

Thanks in Advance,

Anton
Prins Anton (ST-CO/ENG1.1) - June 18, 2013, 6:31 a.m.
Last night I did do additional tests with create/remove of files in a while loop on a synchronous mounted UBIFS.
I did NOT get any node 0 or 1 written over this night, but oddly enough I saw a strange node id in the orphan area: 0xdead4ead

0xdead4ead is known to me as SPINLOCK_MAGIC, but I have no clue why it is in the orphan area as an inode number...
Is something known about 0xdead4ead?
Adrian Hunter - June 18, 2013, 7:01 a.m.
On 18/06/13 09:31, Prins Anton (ST-CO/ENG1.1) wrote:
> Last night I did do additional tests with create/remove of files in a
> while loop on a synchronous mounted UBIFS. I did NOT get any node 0 or 1
> written over this night, but obvious enough I saw some strange node id in
> the orphan area: 0xdead4ead
> 
> 0xdead4ead Is known to me as SPINLOCK_MAGIC; but no glue why It is in the
> orphan area if node number... Is something known about 0xdead4ead?

I am afraid I have not had time to analyze the effects of the double-free
but it is reasonable to assume that UBIFS may be writing an orphan from a
structure that has been freed and therefore re-used by, for example in this
case, a spinlock.
Prins Anton (ST-CO/ENG1.1) - June 18, 2013, 7:17 a.m.
Ok thanks, meanwhile we have applied the 'double free' patches in our nightly build and coming week I hope to have more information on this.

-----Original Message-----
From: Adrian Hunter [mailto:adrian.hunter@intel.com] 
Sent: dinsdag 18 juni 2013 9:01
To: Prins Anton (ST-CO/ENG1.1)
Cc: Mats Kärrman; linux-mtd@lists.infradead.org; dedekind1@gmail.com
Subject: Re: UBIFS failure & stable page writes

On 18/06/13 09:31, Prins Anton (ST-CO/ENG1.1) wrote:
> Last night I did do additional tests with create/remove of files in a
> while loop on a synchronous mounted UBIFS. I did NOT get any node 0 or 1
> written over this night, but obvious enough I saw some strange node id in
> the orphan area: 0xdead4ead
> 
> 0xdead4ead Is known to me as SPINLOCK_MAGIC; but no glue why It is in the
> orphan area if node number... Is something known about 0xdead4ead?

I am afraid I have not had time to analyze the effects of the double-free
but it is reasonable to assume that UBIFS may be writing an orphan from a
structure that has been freed and therefore re-used by, for example in this
case, a spinlock.
Prins Anton (ST-CO/ENG1.1) - June 25, 2013, 8:21 a.m.
Another weekend of tests, including the patches showed no more logging of writing orphan node 0.
Also the 0xdead4ead did not appear; so the patches certainly do their job!

We do not have hard evidence the missing root node is caused by this 'writing of freed memory'.
But at least it seems a good suspect! Because the occurrence is very low we will continue testing...
(And go back mounting synchronous to get some higher occurrence...)

I'll keep you up-to-date.

Anton


-----Original Message-----
From: Adrian Hunter [mailto:adrian.hunter@intel.com] 
Sent: dinsdag 18 juni 2013 9:01
To: Prins Anton (ST-CO/ENG1.1)
Cc: Mats Kärrman; linux-mtd@lists.infradead.org; dedekind1@gmail.com
Subject: Re: UBIFS failure & stable page writes

On 18/06/13 09:31, Prins Anton (ST-CO/ENG1.1) wrote:
> Last night I did do additional tests with create/remove of files in a
> while loop on a synchronous mounted UBIFS. I did NOT get any node 0 or 1
> written over this night, but obvious enough I saw some strange node id in
> the orphan area: 0xdead4ead
> 
> 0xdead4ead Is known to me as SPINLOCK_MAGIC; but no glue why It is in the
> orphan area if node number... Is something known about 0xdead4ead?

I am afraid I have not had time to analyze the effects of the double-free
but it is reasonable to assume that UBIFS may be writing an orphan from a
structure that has been freed and therefore re-used by, for example in this
case, a spinlock.
Mats Kärrman - June 28, 2013, 9:05 a.m.
Hi Anton,

Thanks for your efforts!
The tests I have run has shown no regressions, but some stability improvements in the integck test.
(I still have not been able to reproduce the orphan node 1 problem myself though.)

A question; do you use file extended attributes? I have seen problems in this area while running
integck and also tested patches that seem to help.
ref: http://thread.gmane.org/gmane.linux.drivers.mtd/47021

BR // Mats

> From: Prins Anton (ST-CO/ENG1.1) [Anton.Prins@nl.bosch.com]
> Sent: Tuesday, June 25, 2013 10:21 AM
> To: Adrian Hunter
> Cc: Mats Kärrman; linux-mtd@lists.infradead.org; dedekind1@gmail.com
> Subject: RE: UBIFS failure & stable page writes
> 
> Another weekend of tests, including the patches showed no more logging of writing orphan node 0.
> Also the 0xdead4ead did not appear; so the patches certainly do their job!
> 
> We do not have hard evidence the missing root node is caused by this 'writing of free-ed memory'.
> But at least it seems a good suspect! Because the occurrence is very low we will continue testing...
> (And go back mounting synchronous to get some higher occurrence...)
> 
> I'll keep you up-to-date.
> 
> Anton
Prins Anton (ST-CO/ENG1.1) - June 28, 2013, 9:27 a.m.
Hi Mats,

We are not using extended attributes, so I expect no relation with our findings.

Thanks,

Anton

-----Original Message-----
From: Mats Kärrman [mailto:Mats.Karrman@tritech.se] 
Sent: vrijdag 28 juni 2013 11:05
To: Prins Anton (ST-CO/ENG1.1); Adrian Hunter
Cc: linux-mtd@lists.infradead.org; dedekind1@gmail.com
Subject: RE: UBIFS failure & stable page writes

Hi Anton,

Thanks for your efforts!
The tests I have run has shown no regressions, but some stability improvements in the integck test.
(I still have not been able to reproduce the orphan node 1 problem myself though.)

A question; do you use file extended attributes? I have seen problems in this area while running
integck and also tested patches that seem to help.
ref: http://thread.gmane.org/gmane.linux.drivers.mtd/47021

BR // Mats

> From: Prins Anton (ST-CO/ENG1.1) [Anton.Prins@nl.bosch.com]
> Sent: Tuesday, June 25, 2013 10:21 AM
> To: Adrian Hunter
> Cc: Mats Kärrman; linux-mtd@lists.infradead.org; dedekind1@gmail.com
> Subject: RE: UBIFS failure & stable page writes
> 
> Another weekend of tests, including the patches showed no more logging of writing orphan node 0.
> Also the 0xdead4ead did not appear; so the patches certainly do their job!
> 
> We do not have hard evidence the missing root node is caused by this 'writing of free-ed memory'.
> But at least it seems a good suspect! Because the occurrence is very low we will continue testing...
> (And go back mounting synchronous to get some higher occurrence...)
> 
> I'll keep you up-to-date.
> 
> Anton
Prins Anton (ST-CO/ENG1.1) - July 25, 2013, 1:18 p.m.
After two weeks of testing not seen any node 1 problem/orphan 0 logging any more; so gives good feeling!

Regarding extended attributes, we do not use them for now.

-----Original Message-----
From: Mats Kärrman [mailto:Mats.Karrman@tritech.se] 
Sent: vrijdag 28 juni 2013 11:05
To: Prins Anton (ST-CO/ENG1.1); Adrian Hunter
Cc: linux-mtd@lists.infradead.org; dedekind1@gmail.com
Subject: RE: UBIFS failure & stable page writes

Hi Anton,

Thanks for your efforts!
The tests I have run has shown no regressions, but some stability improvements in the integck test.
(I still have not been able to reproduce the orphan node 1 problem myself though.)

A question; do you use file extended attributes? I have seen problems in this area while running
integck and also tested patches that seem to help.
ref: http://thread.gmane.org/gmane.linux.drivers.mtd/47021

BR // Mats

> From: Prins Anton (ST-CO/ENG1.1) [Anton.Prins@nl.bosch.com]
> Sent: Tuesday, June 25, 2013 10:21 AM
> To: Adrian Hunter
> Cc: Mats Kärrman; linux-mtd@lists.infradead.org; dedekind1@gmail.com
> Subject: RE: UBIFS failure & stable page writes
> 
> Another weekend of tests, including the patches showed no more logging of writing orphan node 0.
> Also the 0xdead4ead did not appear; so the patches certainly do their job!
> 
> We do not have hard evidence the missing root node is caused by this 'writing of free-ed memory'.
> But at least it seems a good suspect! Because the occurrence is very low we will continue testing...
> (And go back mounting synchronous to get some higher occurrence...)
> 
> I'll keep you up-to-date.
> 
> Anton

Patch

diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index ba32da3..9400b5b 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -613,6 +613,11 @@  static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
                n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
                for (i = 0; i < n; i++) {
                        inum = le64_to_cpu(orph->inos[i]);
+                       if (inum < UBIFS_FIRST_INO) {
+                               ubifs_err("*not* deleting orphaned inode %lu",
+                                         (unsigned long)inum);
+                               continue;
+                       }
                        dbg_rcvry("deleting orphaned inode %lu",
                                  (unsigned long)inum);
                        err = ubifs_tnc_remove_ino(c, inum);