diff mbox

[1/7,RFC] UBI: Add checkpoint on-chip layout

Message ID 1336585125-127220-2-git-send-email-richard@nod.at
State RFC
Headers show

Commit Message

Richard Weinberger May 9, 2012, 5:38 p.m. UTC
Specify the on-chip checkpoint layout.
The checkpoint consists of two major parts.
A super block (identified via UBI_CP_SB_VOLUME_ID) and
zero or more data blocks (identified via UBI_CP_DATA_VOLUME_ID).
Data blocks are only used if whole checkpoint information does not fit
into the super block.

All three checkpointing pools have the same size for now, this my also change.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/ubi/ubi-media.h |  135 +++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 135 insertions(+), 0 deletions(-)

Comments

Artem Bityutskiy May 11, 2012, 11:17 a.m. UTC | #1
I'd like to a git tree for this stuff at some point, when you feel you
are ready, and then do all further changes incrementally on top of that.
Then would also be able to participate - at least do minor things myself
and send patches to you.

On Wed, 2012-05-09 at 19:38 +0200, Richard Weinberger wrote:
> +#ifdef CONFIG_MTD_UBI_CHECKPOINT
> +#define UBI_CP_SB_VOLUME_ID	(UBI_LAYOUT_VOLUME_ID + 1)
> +#define UBI_CP_DATA_VOLUME_ID	(UBI_CP_SB_VOLUME_ID + 1)

#define UBI_CP_DATA_VOLUME_ID	(UBI_LAYOUT_VOLUME_ID + 2)

is more readable I think.

> +#define UBI_CP_MAX_BLOCKS	32

This really needs a comment on top of it telling how big flash with say,
128KiB PEB size this would support.

> +
> +/**
> + * struct ubi_cp_sb - UBI checkpoint super block
> + * @magic: checkpoint super block magic number (%UBI_CP_SB_MAGIC)
> + * @version: format version of this checkpoint
> + * @data_crc: CRC over the checkpoint data
> + * @nblocks: number of PEBs used by this checkpoint
> + * @block_loc: an array containing the location of all PEBs of the checkpoint
> + * @block_ec: the erase counter of each used PEB
> + * @sqnum: highest sequence number value at the time while taking the checkpoint
> + *
> + * The checkpoint
> + */
> +struct ubi_cp_sb {
> +	__be32 magic;
> +	__u8 version;
> +	__be32 data_crc;
> +	__be32 nblocks;
> +	__be32 block_loc[UBI_CP_MAX_BLOCKS];
> +	__be32 block_ec[UBI_CP_MAX_BLOCKS];
> +	__be64 sqnum;
> +} __packed;

Please, unless it is size-critical, always leave some unused space in
on-flash data structure for possible future extensions, and initialize
them to 0.

BTW, side-note, please, check that you follow the convention of UBI and
make all on-flash data structures 64-bit aligned (of course unless it is
size-critical).

> +/**
> + * struct ubi_cp_long_pool - Checkpoint pool with long term used PEBs
> + * @magic: long pool magic numer (%UBI_CP_LPOOL_MAGIC)
> + * @size: current pool size
> + * @pebs: an array containing the location of all PEBs in this pool
> + */
> +struct ubi_cp_long_pool {
> +	__be32 magic;
> +	__be32 size;
> +	__be32 pebs[UBI_CP_MAX_POOL_SIZE];
> +} __packed;

What's the perpose of having these pools - once you read all the
information from the fastmap and the wl subsystem inserts it to the
RB-trees - you already know this data. Why you need to store this on the
flash? This whole pool think look redundant and unneeded.

> +/**
> + * struct ubi_cp_ec - stores the erase counter of a PEB
> + * @pnum: PEB number
> + * @ec: ec of this PEB
> + */
> +struct ubi_cp_ec {
> +	__be32 pnum;
> +	__be32 ec;
> +} __packed;

It is weird that you do not have an array of ECs instead for _every_
PEB. Why wasting the flash and time writing/reading this data?

> +/**
> + * struct ubi_cp_eba - denotes an association beween a PEB and LEB
> + * @lnum: LEB number
> + * @pnum: PEB number
> + */
> +struct ubi_cp_eba {
> +	__be32 lnum;
> +	__be32 pnum;
> +} __packed;

Same here - I'd expect a simple array for every PEB in the system.
Richard Weinberger May 11, 2012, 12:02 p.m. UTC | #2
Am 11.05.2012 13:17, schrieb Artem Bityutskiy:
> I'd like to a git tree for this stuff at some point, when you feel you
> are ready, and then do all further changes incrementally on top of that.
> Then would also be able to participate - at least do minor things myself
> and send patches to you.
> 
> On Wed, 2012-05-09 at 19:38 +0200, Richard Weinberger wrote:
>> +#ifdef CONFIG_MTD_UBI_CHECKPOINT
>> +#define UBI_CP_SB_VOLUME_ID	(UBI_LAYOUT_VOLUME_ID + 1)
>> +#define UBI_CP_DATA_VOLUME_ID	(UBI_CP_SB_VOLUME_ID + 1)
> 
> #define UBI_CP_DATA_VOLUME_ID	(UBI_LAYOUT_VOLUME_ID + 2)
> 
> is more readable I think.

Okay.

>> +#define UBI_CP_MAX_BLOCKS	32
> 
> This really needs a comment on top of it telling how big flash with say,
> 128KiB PEB size this would support.

This is a default value which was randomly chosen.
The last patch makes this value configurable via Kconfig.

It is one of these parameters where we have to find sane a default value.

>> +
>> +/**
>> + * struct ubi_cp_sb - UBI checkpoint super block
>> + * @magic: checkpoint super block magic number (%UBI_CP_SB_MAGIC)
>> + * @version: format version of this checkpoint
>> + * @data_crc: CRC over the checkpoint data
>> + * @nblocks: number of PEBs used by this checkpoint
>> + * @block_loc: an array containing the location of all PEBs of the checkpoint
>> + * @block_ec: the erase counter of each used PEB
>> + * @sqnum: highest sequence number value at the time while taking the checkpoint
>> + *
>> + * The checkpoint
>> + */
>> +struct ubi_cp_sb {
>> +	__be32 magic;
>> +	__u8 version;
>> +	__be32 data_crc;
>> +	__be32 nblocks;
>> +	__be32 block_loc[UBI_CP_MAX_BLOCKS];
>> +	__be32 block_ec[UBI_CP_MAX_BLOCKS];
>> +	__be64 sqnum;
>> +} __packed;
> 
> Please, unless it is size-critical, always leave some unused space in
> on-flash data structure for possible future extensions, and initialize
> them to 0.

If the fastmap on-flash layout changes, I'll increment the "version" field.
But I can leave some space.

> BTW, side-note, please, check that you follow the convention of UBI and
> make all on-flash data structures 64-bit aligned (of course unless it is
> size-critical).

Will check.

>> +/**
>> + * struct ubi_cp_long_pool - Checkpoint pool with long term used PEBs
>> + * @magic: long pool magic numer (%UBI_CP_LPOOL_MAGIC)
>> + * @size: current pool size
>> + * @pebs: an array containing the location of all PEBs in this pool
>> + */
>> +struct ubi_cp_long_pool {
>> +	__be32 magic;
>> +	__be32 size;
>> +	__be32 pebs[UBI_CP_MAX_POOL_SIZE];
>> +} __packed;
> 
> What's the perpose of having these pools - once you read all the
> information from the fastmap and the wl subsystem inserts it to the
> RB-trees - you already know this data. Why you need to store this on the
> flash? This whole pool think look redundant and unneeded.

We need this pool to find all PEBs that have changed since we wrote the last
checkpoint (or fastmap).
BTW: That's why we named it "UBI Checkpointing".
If all PEBs in the pool are used (IOW no longer empty) we fill the pool with empty PEBs
and write a new checkpoint.
While reading the checkpoint we know that only PEBs within the pool my have changed...

Without this pool we'd have to write the checkpoint every time the EBA changes
or we'd have to scan the whole list of free PEBs while attaching.

>> +/**
>> + * struct ubi_cp_ec - stores the erase counter of a PEB
>> + * @pnum: PEB number
>> + * @ec: ec of this PEB
>> + */
>> +struct ubi_cp_ec {
>> +	__be32 pnum;
>> +	__be32 ec;
>> +} __packed;
> 
> It is weird that you do not have an array of ECs instead for _every_
> PEB. Why wasting the flash and time writing/reading this data?

By array of ECs you mean that all ec values are written to the flash
and pnum is the index?
Sounds sane.

>> +/**
>> + * struct ubi_cp_eba - denotes an association beween a PEB and LEB
>> + * @lnum: LEB number
>> + * @pnum: PEB number
>> + */
>> +struct ubi_cp_eba {
>> +	__be32 lnum;
>> +	__be32 pnum;
>> +} __packed;
> 
> Same here - I'd expect a simple array for every PEB in the system.
> 

Sounds sane too. :)

Thanks,
//richard
Artem Bityutskiy May 11, 2012, 12:21 p.m. UTC | #3
On Fri, 2012-05-11 at 14:02 +0200, Richard Weinberger wrote:
> > Please, unless it is size-critical, always leave some unused space in
> > on-flash data structure for possible future extensions, and initialize
> > them to 0.
> 
> If the fastmap on-flash layout changes, I'll increment the "version" field.
> But I can leave some space.

But if you have extra space you _often_ have chances to add new features
in a backward-compatible way, without incrementing the version, which
would make new images backward-incompatible. It really may make a big
difference.

This is not _always_ possible, of course, but sometimes it is. So if it
does not hurt, leave extra space in all on flash data structures (but
probably not in those you have large arrays of, though). This is just a
good practice I think. For some reason I have a feeling that we
discussed this already :-) I think I sent you an e-mail with various
questions which you did not answer long time ago, but it was long time
ago, so does not matter anymore.

> > What's the perpose of having these pools - once you read all the
> > information from the fastmap and the wl subsystem inserts it to the
> > RB-trees - you already know this data. Why you need to store this on the
> > flash? This whole pool think look redundant and unneeded.
> 
> We need this pool to find all PEBs that have changed since we wrote the last
> checkpoint (or fastmap).

OK, I see, thanks.

> BTW: That's why we named it "UBI Checkpointing".
> If all PEBs in the pool are used (IOW no longer empty) we fill the pool with empty PEBs
> and write a new checkpoint.
> While reading the checkpoint we know that only PEBs within the pool my have changed...
> 
> Without this pool we'd have to write the checkpoint every time the EBA changes
> or we'd have to scan the whole list of free PEBs while attaching.

I just had an impression that you write the fastmap only on unmount or
at some kind of "sync" points, and power cuts always would lead to
re-scan.

> > It is weird that you do not have an array of ECs instead for _every_
> > PEB. Why wasting the flash and time writing/reading this data?
> 
> By array of ECs you mean that all ec values are written to the flash
> and pnum is the index?
> Sounds sane.

Yes, to me it sounds like the only sane way, unless there is a strong
reason to have redundant "pnum" fields. :-)
Richard Weinberger May 11, 2012, 5:15 p.m. UTC | #4
Am 11.05.2012 14:21, schrieb Artem Bityutskiy:
>>> It is weird that you do not have an array of ECs instead for _every_
>>> PEB. Why wasting the flash and time writing/reading this data?
>>
>> By array of ECs you mean that all ec values are written to the flash
>> and pnum is the index?
>> Sounds sane.
> 
> Yes, to me it sounds like the only sane way, unless there is a strong
> reason to have redundant "pnum" fields. :-)

While looking at my own code a bit closer I found out why I haven't used the
array approach. B-)
Currently only ec values for PEBs within the free and used list are stored.
Therefore, the array can have gaps. E.g. If PEB X is in the erroneous list.

Thanks,
//richard
Artem Bityutskiy May 11, 2012, 6:56 p.m. UTC | #5
On Fri, 2012-05-11 at 19:15 +0200, Richard Weinberger wrote:
> Am 11.05.2012 14:21, schrieb Artem Bityutskiy:
> >>> It is weird that you do not have an array of ECs instead for _every_
> >>> PEB. Why wasting the flash and time writing/reading this data?
> >>
> >> By array of ECs you mean that all ec values are written to the flash
> >> and pnum is the index?
> >> Sounds sane.
> > 
> > Yes, to me it sounds like the only sane way, unless there is a strong
> > reason to have redundant "pnum" fields. :-)
> 
> While looking at my own code a bit closer I found out why I haven't used the
> array approach. B-)
> Currently only ec values for PEBs within the free and used list are stored.
> Therefore, the array can have gaps. E.g. If PEB X is in the erroneous list.

I think this is not a good enough justification. I think we may use
0xFFFFFFFF and other high EC values to indicate that the block was bad
or erroneous or whatever.

BTW, did you think about scenario of moving dumping UBI2 on on one
device with one bad PEBs distribution and then flashing it to a
different device with a different bad PEB distribution? What would
happen when we have fastmap enabled? Also, what if I write it to a
larger flash with otherwise the same geometry?

I guess we could detect these things and fall-back to scanning?
Richard Weinberger May 11, 2012, 7:15 p.m. UTC | #6
Am 11.05.2012 20:56, schrieb Artem Bityutskiy:
> I think this is not a good enough justification. I think we may use
> 0xFFFFFFFF and other high EC values to indicate that the block was bad
> or erroneous or whatever.

Okay, then we have to store all PEB ec values. (used, free, erroneous and scrub)
This is not a big deal.
As I said, currently only used and free PEBs are stored.

I think we need also a better solution for the protection queue.
My current solution (ubi_flush_prot_queue) is not the right thing.
Today I've observed a data corruption issue an I'm sure it happened
because fastmap did the wrong thing with the protection queue.
The problem is that a PEB in the protection queue is not visible to fastmap.
(Because it writes only used and free PEBs on the flash).

> BTW, did you think about scenario of moving dumping UBI2 on on one
> device with one bad PEBs distribution and then flashing it to a
> different device with a different bad PEB distribution? What would
> happen when we have fastmap enabled? Also, what if I write it to a
> larger flash with otherwise the same geometry?
> 
> I guess we could detect these things and fall-back to scanning?

Falling back to scanning is easy.
But how can we detect such a change?

Thanks,
//richard
diff mbox

Patch

diff --git a/drivers/mtd/ubi/ubi-media.h b/drivers/mtd/ubi/ubi-media.h
index 6fb8ec2..7223b02 100644
--- a/drivers/mtd/ubi/ubi-media.h
+++ b/drivers/mtd/ubi/ubi-media.h
@@ -375,4 +375,139 @@  struct ubi_vtbl_record {
 	__be32  crc;
 } __packed;
 
+#ifdef CONFIG_MTD_UBI_CHECKPOINT
+#define UBI_CP_SB_VOLUME_ID	(UBI_LAYOUT_VOLUME_ID + 1)
+#define UBI_CP_DATA_VOLUME_ID	(UBI_CP_SB_VOLUME_ID + 1)
+
+/* Checkoint format version */
+#define UBI_CP_FMT_VERSION	1
+
+#define UBI_CP_MAX_START	64
+#define UBI_CP_MAX_BLOCKS	32
+#define UBI_CP_MAX_POOL_SIZE	128
+#define UBI_CP_SB_MAGIC		0x7B11D69F
+#define UBI_CP_HDR_MAGIC	0xD4B82EF7
+#define UBI_CP_VHDR_MAGIC	0xFA370ED1
+#define UBI_CP_LPOOL_MAGIC	0x67AF4D08
+#define UBI_CP_SPOOL_MAGIC	0x67AF4D09
+#define UBI_CP_UPOOL_MAGIC	0x67AF4D0A
+
+/**
+ * struct ubi_cp_sb - UBI checkpoint super block
+ * @magic: checkpoint super block magic number (%UBI_CP_SB_MAGIC)
+ * @version: format version of this checkpoint
+ * @data_crc: CRC over the checkpoint data
+ * @nblocks: number of PEBs used by this checkpoint
+ * @block_loc: an array containing the location of all PEBs of the checkpoint
+ * @block_ec: the erase counter of each used PEB
+ * @sqnum: highest sequence number value at the time while taking the checkpoint
+ *
+ * The checkpoint
+ */
+struct ubi_cp_sb {
+	__be32 magic;
+	__u8 version;
+	__be32 data_crc;
+	__be32 nblocks;
+	__be32 block_loc[UBI_CP_MAX_BLOCKS];
+	__be32 block_ec[UBI_CP_MAX_BLOCKS];
+	__be64 sqnum;
+} __packed;
+
+/**
+ * struct ubi_cp_hdr - header of the checkpoint data set
+ * @magic: checkpoint header magic number (%UBI_CP_HDR_MAGIC)
+ * @nfree: number of free PEBs known by this checkpoint
+ * @nused: number of used PEBs known by this checkpoint
+ * @nvol: number of UBI volumes known by this checkpoint
+ */
+struct ubi_cp_hdr {
+	__be32 magic;
+	__be32 nfree;
+	__be32 nused;
+	__be32 nvol;
+} __packed;
+
+/* struct ubi_cp_hdr is followed by exactly three struct ub_cp_pool_* records
+ * long, short and unknown pool */
+
+/**
+ * struct ubi_cp_long_pool - Checkpoint pool with long term used PEBs
+ * @magic: long pool magic numer (%UBI_CP_LPOOL_MAGIC)
+ * @size: current pool size
+ * @pebs: an array containing the location of all PEBs in this pool
+ */
+struct ubi_cp_long_pool {
+	__be32 magic;
+	__be32 size;
+	__be32 pebs[UBI_CP_MAX_POOL_SIZE];
+} __packed;
+
+/**
+ * struct ubi_cp_short_pool - Checkpoint pool with short term used PEBs
+ * @magic: long pool magic numer (%UBI_CP_SPOOL_MAGIC)
+ * @size: current pool size
+ * @pebs: an array containing the location of all PEBs in this pool
+ */
+struct ubi_cp_short_pool {
+	__be32 magic;
+	__be32 size;
+	__be32 pebs[UBI_CP_MAX_POOL_SIZE];
+} __packed;
+
+/**
+ * struct ubi_cp_unk_pool - Checkpoint pool with all other PEBs
+ * @magic: long pool magic numer (%UBI_CP_UPOOL_MAGIC)
+ * @size: current pool size
+ * @pebs: an array containing the location of all PEBs in this pool
+ */
+struct ubi_cp_unk_pool {
+	__be32 magic;
+	__be32 size;
+	__be32 pebs[UBI_CP_MAX_POOL_SIZE];
+} __packed;
+
+/* struct ubi_cp_unk_pool is followed by nfree+nused struct ubi_cp_ec records */
+
+/**
+ * struct ubi_cp_ec - stores the erase counter of a PEB
+ * @pnum: PEB number
+ * @ec: ec of this PEB
+ */
+struct ubi_cp_ec {
+	__be32 pnum;
+	__be32 ec;
+} __packed;
+
+/**
+ * struct ubi_cp_volhdr - checkpoint volume header
+ * it identifies the start of an eba table
+ * @magic: checkpoint volume header magic number (%UBI_CP_VHDR_MAGIC)
+ * @vol_id: volume id of the checkpointed volume
+ * @vol_type: type of the checkpointed volume
+ * @data_pad: data_pad value of the checkpointed volume
+ * @used_ebs: number of used LEBs within this volume
+ * @last_eb_bytes: number of bytes used in the last LEB
+ */
+struct ubi_cp_volhdr {
+	__be32 magic;
+	__be32 vol_id;
+	__u8 vol_type;
+	__be32 data_pad;
+	__be32 used_ebs;
+	__be32 last_eb_bytes;
+} __packed;
+
+/* struct ubi_cp_volhdr is followed by nused struct ubi_cp_eba records */
+
+/**
+ * struct ubi_cp_eba - denotes an association beween a PEB and LEB
+ * @lnum: LEB number
+ * @pnum: PEB number
+ */
+struct ubi_cp_eba {
+	__be32 lnum;
+	__be32 pnum;
+} __packed;
+#endif /* CONFIG_MTD_UBI_CHECKPOINT */
 #endif /* !__UBI_MEDIA_H__ */