diff mbox series

spi-nor: Verify written data in paranoid mode

Message ID 20250415180434.513405-1-csokas.bence@prolan.hu
State New
Headers show
Series spi-nor: Verify written data in paranoid mode | expand

Commit Message

Csókás Bence April 15, 2025, 6:04 p.m. UTC
From: Csókás, Bence <csokas.bence@prolan.hu>

Add MTD_SPI_NOR_PARANOID config option for verifying all written data to
prevent silent bit errors to be undetected, at the cost of halving SPI
bandwidth.

Co-developed-by: Szentendrei, Tamás <szentendrei.tamas@prolan.hu>
Signed-off-by: Szentendrei, Tamás <szentendrei.tamas@prolan.hu>
Signed-off-by: Csókás, Bence <csokas.bence@prolan.hu>
---
 drivers/mtd/spi-nor/Kconfig | 10 ++++++++++
 drivers/mtd/spi-nor/core.c  | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)


base-commit: 834a4a689699090a406d1662b03affa8b155d025

Comments

Michael Walle April 16, 2025, 11:59 a.m. UTC | #1
Hi,

> Add MTD_SPI_NOR_PARANOID config option for verifying all written data to
> prevent silent bit errors to be undetected, at the cost of halving SPI
> bandwidth.

What is the use case for this? Why is it specific to SPI-NOR
flashes? Or should it rather be an MTD "feature". I'm not sure
whether this is the right way to do it, thus I'd love to hear more
about the background story to this.

> Co-developed-by: Szentendrei, Tamás <szentendrei.tamas@prolan.hu>
> Signed-off-by: Szentendrei, Tamás <szentendrei.tamas@prolan.hu>
> Signed-off-by: Csókás, Bence <csokas.bence@prolan.hu>
> ---
>  drivers/mtd/spi-nor/Kconfig | 10 ++++++++++
>  drivers/mtd/spi-nor/core.c  | 33 +++++++++++++++++++++++++++++++++
>  2 files changed, 43 insertions(+)
>
> diff --git a/drivers/mtd/spi-nor/Kconfig b/drivers/mtd/spi-nor/Kconfig
> index 24cd25de2b8b..425ea9a22424 100644
> --- a/drivers/mtd/spi-nor/Kconfig
> +++ b/drivers/mtd/spi-nor/Kconfig
> @@ -68,6 +68,16 @@ config MTD_SPI_NOR_SWP_KEEP
>  
>  endchoice
>  
> +config MTD_SPI_NOR_PARANOID
> +	bool "Read back written data (paranoid mode)"

No kernel configs please. This doesn't scale. What if you have two
flashes and one should have this and one does not?

-michael

> +	help
> +	  This option makes the SPI NOR core read back all data on a write
> +	  and report an error if it doesn't match the written data. This can
> +	  safeguard against silent bit errors resulting from a faulty Flash,
> +	  controller oddities, bus noise etc.
> +
> +	  If you are unsure, select 'n'.
> +
>  source "drivers/mtd/spi-nor/controllers/Kconfig"
>  
>  endif # MTD_SPI_NOR
> diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
> index ac4b960101cc..ca05a6ec8afe 100644
> --- a/drivers/mtd/spi-nor/core.c
> +++ b/drivers/mtd/spi-nor/core.c
> @@ -2063,6 +2063,7 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
>  	size_t *retlen, const u_char *buf)
>  {
>  	struct spi_nor *nor = mtd_to_spi_nor(mtd);
> +	u_char *verify_buf = NULL;
>  	size_t i;
>  	ssize_t ret;
>  	u32 page_size = nor->params->page_size;
> @@ -2073,6 +2074,14 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
>  	if (ret)
>  		return ret;
>  
> +#if IS_ENABLED(CONFIG_MTD_SPI_NOR_PARANOID)
> +	verify_buf = devm_kmalloc(nor->dev, page_size, GFP_KERNEL);
> +	if (!verify_buf) {
> +		ret = -ENOMEM;
> +		goto write_err;
> +	}
> +#endif
> +
>  	for (i = 0; i < len; ) {
>  		ssize_t written;
>  		loff_t addr = to + i;
> @@ -2099,11 +2108,35 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
>  		ret = spi_nor_wait_till_ready(nor);
>  		if (ret)
>  			goto write_err;
> +
> +#if IS_ENABLED(CONFIG_MTD_SPI_NOR_PARANOID)
> +		/* read back to make sure it's correct */
> +		ret = spi_nor_read_data(nor, addr, written, verify_buf);
> +		if (ret < 0)
> +			goto write_err;
> +		if (ret != written) {
> +			/* We shouldn't see short reads */
> +			dev_err(nor->dev, "Verify failed, written %zd but only read %zd",
> +				written, ret);
> +			ret = -EIO;
> +			goto write_err;
> +		}
> +
> +		if (memcmp(verify_buf, buf + i, written)) {
> +			dev_err(nor->dev, "Verify failed, compare mismatch!");
> +			ret = -EIO;
> +			goto write_err;
> +		}
> +#endif
> +
> +		ret = 0;
> +
>  		*retlen += written;
>  		i += written;
>  	}
>  
>  write_err:
> +	devm_kfree(nor->dev, verify_buf);
>  	spi_nor_unlock_and_unprep_pe(nor, to, len);
>  
>  	return ret;
>
> base-commit: 834a4a689699090a406d1662b03affa8b155d025
Csókás Bence April 16, 2025, 12:38 p.m. UTC | #2
Hi,

On 2025. 04. 16. 13:59, Michael Walle wrote:
> Hi,
> 
>> Add MTD_SPI_NOR_PARANOID config option for verifying all written data to
>> prevent silent bit errors to be undetected, at the cost of halving SPI
>> bandwidth.
> 
> What is the use case for this? Why is it specific to SPI-NOR
> flashes? Or should it rather be an MTD "feature". I'm not sure
> whether this is the right way to do it, thus I'd love to hear more
> about the background story to this.

Well, our case is quite specific, but we wanted to provide a general 
solution for upstream. In our case we have a component in the data path 
that can cause a burst bit error, on average after about a hundred 
megabytes written.

We _could_ make it MTD-wide, in our case we only have a NOR Flash 
onboard so this is where we added it. If it were in the MTD core, where 
would it make sense?

* mtd_write()
* mtd_write_oob()
* mtd_write_oob_std()
* or somewhere else entirely?

>> Co-developed-by: Szentendrei, Tamás <szentendrei.tamas@prolan.hu>
>> Signed-off-by: Szentendrei, Tamás <szentendrei.tamas@prolan.hu>
>> Signed-off-by: Csókás, Bence <csokas.bence@prolan.hu>
>> ---
>>   drivers/mtd/spi-nor/Kconfig | 10 ++++++++++
>>   drivers/mtd/spi-nor/core.c  | 33 +++++++++++++++++++++++++++++++++
>>   2 files changed, 43 insertions(+)
>>
>> diff --git a/drivers/mtd/spi-nor/Kconfig b/drivers/mtd/spi-nor/Kconfig
>> index 24cd25de2b8b..425ea9a22424 100644
>> --- a/drivers/mtd/spi-nor/Kconfig
>> +++ b/drivers/mtd/spi-nor/Kconfig
>> @@ -68,6 +68,16 @@ config MTD_SPI_NOR_SWP_KEEP
>>   
>>   endchoice
>>   
>> +config MTD_SPI_NOR_PARANOID
>> +	bool "Read back written data (paranoid mode)"
> 
> No kernel configs please. This doesn't scale. What if you have two
> flashes and one should have this and one does not?

Yes, we have thought about this, but concluded that "paranoid mode" is 
not device-specific, you either have a requirement to be extra sure 
about data integrity, or you can be fairly sure your system is sane, in 
both cases it is a judgement about the entire system. I also thought 
that maybe some devices could be exempt from this, contingent on a 
`no-paranoia` Device Tree property, to selectively sacrifice integrity 
for performance even in paranoid mode, but we only have one Flash 
anyway, so I didn't implement it.

> -michael

Bence
Richard Weinberger April 16, 2025, 1:09 p.m. UTC | #3
----- Ursprüngliche Mail -----
> Von: "Csókás Bence" <csokas.bence@prolan.hu>
>>> Add MTD_SPI_NOR_PARANOID config option for verifying all written data to
>>> prevent silent bit errors to be undetected, at the cost of halving SPI
>>> bandwidth.
>> 
>> What is the use case for this? Why is it specific to SPI-NOR
>> flashes? Or should it rather be an MTD "feature". I'm not sure
>> whether this is the right way to do it, thus I'd love to hear more
>> about the background story to this.
> 
> Well, our case is quite specific, but we wanted to provide a general
> solution for upstream. In our case we have a component in the data path
> that can cause a burst bit error, on average after about a hundred
> megabytes written.

Hmm. So, there is a serve hardware issue you're working around.

> We _could_ make it MTD-wide, in our case we only have a NOR Flash
> onboard so this is where we added it. If it were in the MTD core, where
> would it make sense?

I'm not so sure whether it makes sense at all.
In it's current form, there is no recovery. So anything non-trivial
on top of the MTD will just see an -EIO and has to give up.
E.g. a filesystem will remount read-only.

Thanks,
//richard
Csókás Bence April 16, 2025, 2:44 p.m. UTC | #4
Hi,

On 2025. 04. 16. 15:09, Richard Weinberger wrote:
> ----- Ursprüngliche Mail -----
>> Von: "Csókás Bence" <csokas.bence@prolan.hu>
>>>> Add MTD_SPI_NOR_PARANOID config option for verifying all written data to
>>>> prevent silent bit errors to be undetected, at the cost of halving SPI
>>>> bandwidth.
>>>
>>> What is the use case for this? Why is it specific to SPI-NOR
>>> flashes? Or should it rather be an MTD "feature". I'm not sure
>>> whether this is the right way to do it, thus I'd love to hear more
>>> about the background story to this.
>>
>> Well, our case is quite specific, but we wanted to provide a general
>> solution for upstream. In our case we have a component in the data path
>> that can cause a burst bit error, on average after about a hundred
>> megabytes written.
> 
> Hmm. So, there is a serve hardware issue you're working around.
> 
>> We _could_ make it MTD-wide, in our case we only have a NOR Flash
>> onboard so this is where we added it. If it were in the MTD core, where
>> would it make sense?
> 
> I'm not so sure whether it makes sense at all.
> In it's current form, there is no recovery. So anything non-trivial
> on top of the MTD will just see an -EIO and has to give up.
> E.g. a filesystem will remount read-only.

In our case, we use UBIFS on top of UBI, which in this case chooses 
another eraseblock to hold the data instead, then re-tests (erase+write 
cycles) the one which gave -EIO. Since the bus error is only transient, 
it goes away by this time, and thus UBIFS will recover from this cleanly.

So yes, it is up to the FS/upper layers to handle the error. If it can't 
recover from this, then yes, it will give up and enter some 'safe mode' 
(e.g. remount ro). But at least it *does* get notified that there is 
something up, and has a chance to react. Before it just thought 
everything was written with no errors, and then there would be data 
corruption *on the next read*.

> Thanks,
> //richard

Bence
Richard Weinberger April 16, 2025, 6:41 p.m. UTC | #5
----- Ursprüngliche Mail -----
> Von: "Csókás Bence" <csokas.bence@prolan.hu>
>> I'm not so sure whether it makes sense at all.
>> In it's current form, there is no recovery. So anything non-trivial
>> on top of the MTD will just see an -EIO and has to give up.
>> E.g. a filesystem will remount read-only.
> 
> In our case, we use UBIFS on top of UBI, which in this case chooses
> another eraseblock to hold the data instead, then re-tests (erase+write
> cycles) the one which gave -EIO. Since the bus error is only transient,
> it goes away by this time, and thus UBIFS will recover from this cleanly.

Are you sure about that?

I'd expect UBI to go into RO mode via a call path like:
ubi_eba_write_leb() -> ubi_io_write() -> mtd_write()
If mtd_write() returns an EIO, UBI will go into RO mode immediately.

(I'm assuming, your SPI-NOR has no bad block support, so ubi->bad_allowed
is false).

Thanks,
//richard
Csókás Bence May 8, 2025, 9:42 a.m. UTC | #6
Hi Richard,

On 2025. 04. 16. 20:41, Richard Weinberger wrote:
> ----- Ursprüngliche Mail -----
>> Von: "Csókás Bence" <csokas.bence@prolan.hu>
>>> I'm not so sure whether it makes sense at all.
>>> In it's current form, there is no recovery. So anything non-trivial
>>> on top of the MTD will just see an -EIO and has to give up.
>>> E.g. a filesystem will remount read-only.
>>
>> In our case, we use UBIFS on top of UBI, which in this case chooses
>> another eraseblock to hold the data instead, then re-tests (erase+write
>> cycles) the one which gave -EIO. Since the bus error is only transient,
>> it goes away by this time, and thus UBIFS will recover from this cleanly.
> 
> Are you sure about that?
> 
> I'd expect UBI to go into RO mode via a call path like:
> ubi_eba_write_leb() -> ubi_io_write() -> mtd_write()
> If mtd_write() returns an EIO, UBI will go into RO mode immediately.
> 
> (I'm assuming, your SPI-NOR has no bad block support, so ubi->bad_allowed
> is false).

You are right, in our case we had to patch bad_allowed to be true. But 
the point is, that UBIFS _does_ get notified, and it _does_ go into RO 
mode, instead of getting success from mtd_write(), even though the 
written data was corrupted.

On 2025. 04. 16. 14:38, Csókás Bence wrote:
 > We _could_ make it MTD-wide, in our case we only have a NOR Flash
 > onboard so this is where we added it. If it were in the MTD core, where
 > would it make sense?
 >
 > * mtd_write()
 > * mtd_write_oob()
 > * mtd_write_oob_std()
 > * or somewhere else entirely?

I'm now starting to think mtd_write_oob() would be the right place for 
it. Thoughts?

Bence
diff mbox series

Patch

diff --git a/drivers/mtd/spi-nor/Kconfig b/drivers/mtd/spi-nor/Kconfig
index 24cd25de2b8b..425ea9a22424 100644
--- a/drivers/mtd/spi-nor/Kconfig
+++ b/drivers/mtd/spi-nor/Kconfig
@@ -68,6 +68,16 @@  config MTD_SPI_NOR_SWP_KEEP
 
 endchoice
 
+config MTD_SPI_NOR_PARANOID
+	bool "Read back written data (paranoid mode)"
+	help
+	  This option makes the SPI NOR core read back all data on a write
+	  and report an error if it doesn't match the written data. This can
+	  safeguard against silent bit errors resulting from a faulty Flash,
+	  controller oddities, bus noise etc.
+
+	  If you are unsure, select 'n'.
+
 source "drivers/mtd/spi-nor/controllers/Kconfig"
 
 endif # MTD_SPI_NOR
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index ac4b960101cc..ca05a6ec8afe 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2063,6 +2063,7 @@  static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
 	size_t *retlen, const u_char *buf)
 {
 	struct spi_nor *nor = mtd_to_spi_nor(mtd);
+	u_char *verify_buf = NULL;
 	size_t i;
 	ssize_t ret;
 	u32 page_size = nor->params->page_size;
@@ -2073,6 +2074,14 @@  static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
 	if (ret)
 		return ret;
 
+#if IS_ENABLED(CONFIG_MTD_SPI_NOR_PARANOID)
+	verify_buf = devm_kmalloc(nor->dev, page_size, GFP_KERNEL);
+	if (!verify_buf) {
+		ret = -ENOMEM;
+		goto write_err;
+	}
+#endif
+
 	for (i = 0; i < len; ) {
 		ssize_t written;
 		loff_t addr = to + i;
@@ -2099,11 +2108,35 @@  static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
 		ret = spi_nor_wait_till_ready(nor);
 		if (ret)
 			goto write_err;
+
+#if IS_ENABLED(CONFIG_MTD_SPI_NOR_PARANOID)
+		/* read back to make sure it's correct */
+		ret = spi_nor_read_data(nor, addr, written, verify_buf);
+		if (ret < 0)
+			goto write_err;
+		if (ret != written) {
+			/* We shouldn't see short reads */
+			dev_err(nor->dev, "Verify failed, written %zd but only read %zd",
+				written, ret);
+			ret = -EIO;
+			goto write_err;
+		}
+
+		if (memcmp(verify_buf, buf + i, written)) {
+			dev_err(nor->dev, "Verify failed, compare mismatch!");
+			ret = -EIO;
+			goto write_err;
+		}
+#endif
+
+		ret = 0;
+
 		*retlen += written;
 		i += written;
 	}
 
 write_err:
+	devm_kfree(nor->dev, verify_buf);
 	spi_nor_unlock_and_unprep_pe(nor, to, len);
 
 	return ret;