diff mbox series

[04/20] mtd: nand: ecc-bch: Stop exporting the private structure

Message ID 20200929230124.31491-5-miquel.raynal@bootlin.com
State Accepted
Delegated to: Miquel Raynal
Headers show
Series Create generic software ECC engines | expand

Commit Message

Miquel Raynal Sept. 29, 2020, 11:01 p.m. UTC
The NAND BCH control structure has nothing to do outside of this
driver, all users of the nand_bch_init/free() functions just save it
to chip->ecc.priv so do it in this driver directly and return a
regular error code instead.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/ecc-sw-bch.c       | 36 ++++++++++++++++-------------
 drivers/mtd/nand/raw/fsmc_nand.c    |  2 +-
 drivers/mtd/nand/raw/nand_base.c    | 12 ++++++----
 drivers/mtd/nand/raw/omap2.c        | 16 ++++++-------
 include/linux/mtd/nand-ecc-sw-bch.h | 11 ++++-----
 5 files changed, 41 insertions(+), 36 deletions(-)

Comments

Adam Ford Jan. 9, 2021, 2:46 p.m. UTC | #1
On Tue, Sep 29, 2020 at 6:09 PM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
>
> The NAND BCH control structure has nothing to do outside of this
> driver, all users of the nand_bch_init/free() functions just save it
> to chip->ecc.priv so do it in this driver directly and return a
> regular error code instead.
>
> Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> ---

Starting with this commit:  3c0fe36abebe, the kernel either doesn't
build or returns errors on some omap2plus devices with the following
error:

    nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
    nand: Micron MT29F4G16ABBDA3W
    nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
    nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
    Invalid ECC layout
    omap2-nand 30000000.nand: unable to use BCH library
    omap2-nand: probe of 30000000.nand failed with error -22
    8<--- cut here ---

There are few commits using git bisect that have build errors, so it
wasn't possible for me to determine the exact commit that broke the
ECC.  If the build failed, I marked it as 'bad' to git bisect.

Newer commits have remedied the build issue, but the Invalid ECC
layout error still exists as of 5.11-RC2.

Do you have any suggestions on what I can do to remedy this?  I am
willing to try and test.



>  drivers/mtd/nand/ecc-sw-bch.c       | 36 ++++++++++++++++-------------
>  drivers/mtd/nand/raw/fsmc_nand.c    |  2 +-
>  drivers/mtd/nand/raw/nand_base.c    | 12 ++++++----
>  drivers/mtd/nand/raw/omap2.c        | 16 ++++++-------
>  include/linux/mtd/nand-ecc-sw-bch.h | 11 ++++-----
>  5 files changed, 41 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/mtd/nand/ecc-sw-bch.c b/drivers/mtd/nand/ecc-sw-bch.c
> index fe080a0837d8..97221ec3876e 100644
> --- a/drivers/mtd/nand/ecc-sw-bch.c
> +++ b/drivers/mtd/nand/ecc-sw-bch.c
> @@ -90,7 +90,7 @@ EXPORT_SYMBOL(nand_bch_correct_data);
>
>  /**
>   * nand_bch_init - Initialize software BCH ECC engine
> - * @mtd: MTD device
> + * @chip: NAND chip object
>   *
>   * Returns: a pointer to a new NAND BCH control structure, or NULL upon failure
>   *
> @@ -105,24 +105,24 @@ EXPORT_SYMBOL(nand_bch_correct_data);
>   * @eccsize = 512 (thus, m = 13 is the smallest integer such that 2^m - 1 > 512 * 8)
>   * @eccbytes = 7 (7 bytes are required to store m * t = 13 * 4 = 52 bits)
>   */
> -struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
> +int nand_bch_init(struct nand_chip *chip)
>  {
> -       struct nand_chip *nand = mtd_to_nand(mtd);
> +       struct mtd_info *mtd = nand_to_mtd(chip);
>         unsigned int m, t, eccsteps, i;
>         struct nand_bch_control *nbc = NULL;
>         unsigned char *erased_page;
> -       unsigned int eccsize = nand->ecc.size;
> -       unsigned int eccbytes = nand->ecc.bytes;
> -       unsigned int eccstrength = nand->ecc.strength;
> +       unsigned int eccsize = chip->ecc.size;
> +       unsigned int eccbytes = chip->ecc.bytes;
> +       unsigned int eccstrength = chip->ecc.strength;
>
>         if (!eccbytes && eccstrength) {
>                 eccbytes = DIV_ROUND_UP(eccstrength * fls(8 * eccsize), 8);
> -               nand->ecc.bytes = eccbytes;
> +               chip->ecc.bytes = eccbytes;
>         }
>
>         if (!eccsize || !eccbytes) {
>                 pr_warn("ecc parameters not supplied\n");
> -               goto fail;
> +               return -EINVAL;
>         }
>
>         m = fls(1+8*eccsize);
> @@ -130,7 +130,9 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
>
>         nbc = kzalloc(sizeof(*nbc), GFP_KERNEL);
>         if (!nbc)
> -               goto fail;
> +               return -ENOMEM;
> +
> +       chip->ecc.priv = nbc;
>
>         nbc->bch = bch_init(m, t, 0, false);
>         if (!nbc->bch)
> @@ -165,8 +167,8 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
>          * FIXME: we should probably rework the sequencing in nand_scan_tail()
>          * to avoid setting those fields twice.
>          */
> -       nand->ecc.steps = eccsteps;
> -       nand->ecc.total = eccsteps * eccbytes;
> +       chip->ecc.steps = eccsteps;
> +       chip->ecc.total = eccsteps * eccbytes;
>         if (mtd_ooblayout_count_eccbytes(mtd) != (eccsteps*eccbytes)) {
>                 pr_warn("invalid ecc layout\n");
>                 goto fail;
> @@ -192,12 +194,12 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
>                 nbc->eccmask[i] ^= 0xff;
>
>         if (!eccstrength)
> -               nand->ecc.strength = (eccbytes * 8) / fls(8 * eccsize);
> +               chip->ecc.strength = (eccbytes * 8) / fls(8 * eccsize);
>
> -       return nbc;
> +       return 0;
>  fail:
> -       nand_bch_free(nbc);
> -       return NULL;
> +       nand_bch_free(chip);
> +       return -EINVAL;
>  }
>  EXPORT_SYMBOL(nand_bch_init);
>
> @@ -205,8 +207,10 @@ EXPORT_SYMBOL(nand_bch_init);
>   * nand_bch_free - Release NAND BCH ECC resources
>   * @nbc: NAND BCH control structure
>   */
> -void nand_bch_free(struct nand_bch_control *nbc)
> +void nand_bch_free(struct nand_chip *chip)
>  {
> +       struct nand_bch_control *nbc = chip->ecc.priv;
> +
>         if (nbc) {
>                 bch_free(nbc->bch);
>                 kfree(nbc->errloc);
> diff --git a/drivers/mtd/nand/raw/fsmc_nand.c b/drivers/mtd/nand/raw/fsmc_nand.c
> index 4191831df182..1bc2462efeab 100644
> --- a/drivers/mtd/nand/raw/fsmc_nand.c
> +++ b/drivers/mtd/nand/raw/fsmc_nand.c
> @@ -927,7 +927,7 @@ static int fsmc_nand_attach_chip(struct nand_chip *nand)
>
>         /*
>          * Don't set layout for BCH4 SW ECC. This will be
> -        * generated later in nand_bch_init() later.
> +        * generated later during BCH initialization.
>          */
>         if (nand->ecc.engine_type == NAND_ECC_ENGINE_TYPE_ON_HOST) {
>                 switch (mtd->oobsize) {
> diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
> index 189d3a301a38..c47441ddc5cf 100644
> --- a/drivers/mtd/nand/raw/nand_base.c
> +++ b/drivers/mtd/nand/raw/nand_base.c
> @@ -5208,6 +5208,7 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip)
>         struct mtd_info *mtd = nand_to_mtd(chip);
>         struct nand_device *nanddev = mtd_to_nanddev(mtd);
>         struct nand_ecc_ctrl *ecc = &chip->ecc;
> +       int ret;
>
>         if (WARN_ON(ecc->engine_type != NAND_ECC_ENGINE_TYPE_SOFT))
>                 return -EINVAL;
> @@ -5294,13 +5295,14 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip)
>                         ecc->strength = bytes * 8 / fls(8 * ecc->size);
>                 }
>
> -               /* See nand_bch_init() for details. */
> +               /* See the software BCH ECC initialization for details */
>                 ecc->bytes = 0;
> -               ecc->priv = nand_bch_init(mtd);
> -               if (!ecc->priv) {
> +               ret = nand_bch_init(chip);
> +               if (ret) {
>                         WARN(1, "BCH ECC initialization failed!\n");
> -                       return -EINVAL;
> +                       return ret;
>                 }
> +
>                 return 0;
>         default:
>                 WARN(1, "Unsupported ECC algorithm!\n");
> @@ -5960,7 +5962,7 @@ void nand_cleanup(struct nand_chip *chip)
>  {
>         if (chip->ecc.engine_type == NAND_ECC_ENGINE_TYPE_SOFT &&
>             chip->ecc.algo == NAND_ECC_ALGO_BCH)
> -               nand_bch_free((struct nand_bch_control *)chip->ecc.priv);
> +               nand_bch_free(chip);
>
>         nanddev_cleanup(&chip->base);
>
> diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c
> index 0ef209e1cd87..6aab57336690 100644
> --- a/drivers/mtd/nand/raw/omap2.c
> +++ b/drivers/mtd/nand/raw/omap2.c
> @@ -2047,10 +2047,10 @@ static int omap_nand_attach_chip(struct nand_chip *chip)
>                 /* Reserve one byte for the OMAP marker */
>                 oobbytes_per_step       = chip->ecc.bytes + 1;
>                 /* Software BCH library is used for locating errors */
> -               chip->ecc.priv          = nand_bch_init(mtd);
> -               if (!chip->ecc.priv) {
> +               err = nand_bch_init(chip);
> +               if (err) {
>                         dev_err(dev, "Unable to use BCH library\n");
> -                       return -EINVAL;
> +                       return err;
>                 }
>                 break;
>
> @@ -2089,10 +2089,10 @@ static int omap_nand_attach_chip(struct nand_chip *chip)
>                 /* Reserve one byte for the OMAP marker */
>                 oobbytes_per_step       = chip->ecc.bytes + 1;
>                 /* Software BCH library is used for locating errors */
> -               chip->ecc.priv          = nand_bch_init(mtd);
> -               if (!chip->ecc.priv) {
> +               err = nand_bch_init(chip);
> +               if (err) {
>                         dev_err(dev, "unable to use BCH library\n");
> -                       return -EINVAL;
> +                       return err;
>                 }
>                 break;
>
> @@ -2272,7 +2272,7 @@ static int omap_nand_probe(struct platform_device *pdev)
>         if (!IS_ERR_OR_NULL(info->dma))
>                 dma_release_channel(info->dma);
>         if (nand_chip->ecc.priv) {
> -               nand_bch_free(nand_chip->ecc.priv);
> +               nand_bch_free(nand_chip);
>                 nand_chip->ecc.priv = NULL;
>         }
>         return err;
> @@ -2286,7 +2286,7 @@ static int omap_nand_remove(struct platform_device *pdev)
>         int ret;
>
>         if (nand_chip->ecc.priv) {
> -               nand_bch_free(nand_chip->ecc.priv);
> +               nand_bch_free(nand_chip);
>                 nand_chip->ecc.priv = NULL;
>         }
>         if (info->dma)
> diff --git a/include/linux/mtd/nand-ecc-sw-bch.h b/include/linux/mtd/nand-ecc-sw-bch.h
> index 1e1ee3af82b1..b62b8bd4669f 100644
> --- a/include/linux/mtd/nand-ecc-sw-bch.h
> +++ b/include/linux/mtd/nand-ecc-sw-bch.h
> @@ -10,7 +10,6 @@
>
>  struct mtd_info;
>  struct nand_chip;
> -struct nand_bch_control;
>
>  #if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH)
>
> @@ -30,11 +29,11 @@ int nand_bch_correct_data(struct nand_chip *chip, u_char *dat,
>  /*
>   * Initialize BCH encoder/decoder
>   */
> -struct nand_bch_control *nand_bch_init(struct mtd_info *mtd);
> +int nand_bch_init(struct nand_chip *chip);
>  /*
>   * Release BCH encoder/decoder resources
>   */
> -void nand_bch_free(struct nand_bch_control *nbc);
> +void nand_bch_free(struct nand_chip *chip);
>
>  #else /* !CONFIG_MTD_NAND_ECC_SW_BCH */
>
> @@ -54,12 +53,12 @@ nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf,
>         return -ENOTSUPP;
>  }
>
> -static inline struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
> +static inline int nand_bch_init(struct nand_chip *chip)
>  {
> -       return NULL;
> +       return -ENOTSUPP;
>  }
>
> -static inline void nand_bch_free(struct nand_bch_control *nbc) {}
> +static inline void nand_bch_free(struct nand_chip *chip) {}
>
>  #endif /* CONFIG_MTD_NAND_ECC_SW_BCH */
>
> --
> 2.20.1
>
>
> ______________________________________________________
> Linux MTD discussion mailing list
> http://lists.infradead.org/mailman/listinfo/linux-mtd/
Miquel Raynal Jan. 11, 2021, 10:20 a.m. UTC | #2
Hi Adam,

Adam Ford <aford173@gmail.com> wrote on Sat, 9 Jan 2021 08:46:44 -0600:

> On Tue, Sep 29, 2020 at 6:09 PM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> >
> > The NAND BCH control structure has nothing to do outside of this
> > driver, all users of the nand_bch_init/free() functions just save it
> > to chip->ecc.priv so do it in this driver directly and return a
> > regular error code instead.
> >
> > Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > ---  
> 
> Starting with this commit:  3c0fe36abebe, the kernel either doesn't
> build or returns errors on some omap2plus devices with the following
> error:
> 
>     nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
>     nand: Micron MT29F4G16ABBDA3W
>     nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
>     nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
>     Invalid ECC layout
>     omap2-nand 30000000.nand: unable to use BCH library
>     omap2-nand: probe of 30000000.nand failed with error -22
>     8<--- cut here ---
> 
> There are few commits using git bisect that have build errors, so it
> wasn't possible for me to determine the exact commit that broke the
> ECC.  If the build failed, I marked it as 'bad' to git bisect.

I am sorry to hear that, I regularly rebase with a make run between each
pick and push my branches to a 0-day repository to have robots check
for such errors, but sometimes I fail.

> Newer commits have remedied the build issue, but the Invalid ECC
> layout error still exists as of 5.11-RC2.

Ok so let's focus on these.

> Do you have any suggestions on what I can do to remedy this?  I am
> willing to try and test.

Glad to hear that.

Can you share the NAND controller DT node you are using?

Also, can you please add a few printk's like below and give me the
output?

---8<---

diff --git a/drivers/mtd/nand/ecc-sw-bch.c b/drivers/mtd/nand/ecc-sw-bch.c
index 0a0ac11d5725..0d3e948d02e9 100644
--- a/drivers/mtd/nand/ecc-sw-bch.c
+++ b/drivers/mtd/nand/ecc-sw-bch.c
@@ -205,6 +205,7 @@ int nand_ecc_sw_bch_init_ctx(struct nand_device *nand)
        }
 
        nsteps = mtd->writesize / conf->step_size;
+       printk("writesize %d, step_size %d, nsteps %d\n", mtd->writesize, conf->step_size, nsteps);
 
        /* Maximize */
        if (nand->ecc.user_conf.flags & NAND_ECC_MAXIMIZE_STRENGTH) {
@@ -213,11 +214,14 @@ int nand_ecc_sw_bch_init_ctx(struct nand_device *nand)
                /* Reserve 2 bytes for the BBM */
                code_size = (mtd->oobsize - 2) / nsteps;
                conf->strength = code_size * 8 / fls(8 * conf->step_size);
+               printk("Maximize => nsteps %d, code_size %d\n", nsteps, code_size);
        }
 
-       if (!code_size)
+       if (!code_size) {
                code_size = DIV_ROUND_UP(conf->strength *
                                         fls(8 * conf->step_size), 8);
+               printk("strength %d, step size %d, code_size %d\n", conf->strength, conf->step_size, code_size);
+       }
 
        if (!conf->strength)
                conf->strength = (code_size * 8) / fls(8 * conf->step_size);
@@ -252,6 +256,7 @@ int nand_ecc_sw_bch_init_ctx(struct nand_device *nand)
                goto free_bufs;
 
        /* Verify the layout validity */
+       printk("count eccbytes %d\n", mtd_ooblayout_count_eccbytes(mtd));
        if (mtd_ooblayout_count_eccbytes(mtd) !=
            engine_conf->nsteps * engine_conf->code_size) {
                pr_err("Invalid ECC layout\n");

--->8---

Thanks,
Miquèl
Miquel Raynal Jan. 12, 2021, 2:35 p.m. UTC | #3
Hi Adam,

Miquel Raynal <miquel.raynal@bootlin.com> wrote on Mon, 11 Jan 2021
11:20:27 +0100:

> Hi Adam,
> 
> Adam Ford <aford173@gmail.com> wrote on Sat, 9 Jan 2021 08:46:44 -0600:
> 
> > On Tue, Sep 29, 2020 at 6:09 PM Miquel Raynal <miquel.raynal@bootlin.com> wrote:  
> > >
> > > The NAND BCH control structure has nothing to do outside of this
> > > driver, all users of the nand_bch_init/free() functions just save it
> > > to chip->ecc.priv so do it in this driver directly and return a
> > > regular error code instead.
> > >
> > > Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > > ---    
> > 
> > Starting with this commit:  3c0fe36abebe, the kernel either doesn't
> > build or returns errors on some omap2plus devices with the following
> > error:
> > 
> >     nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> >     nand: Micron MT29F4G16ABBDA3W
> >     nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> >     nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> >     Invalid ECC layout
> >     omap2-nand 30000000.nand: unable to use BCH library
> >     omap2-nand: probe of 30000000.nand failed with error -22
> >     8<--- cut here ---
> > 
> > There are few commits using git bisect that have build errors, so it
> > wasn't possible for me to determine the exact commit that broke the
> > ECC.  If the build failed, I marked it as 'bad' to git bisect.  
> 
> I am sorry to hear that, I regularly rebase with a make run between each
> pick and push my branches to a 0-day repository to have robots check
> for such errors, but sometimes I fail.
> 
> > Newer commits have remedied the build issue, but the Invalid ECC
> > layout error still exists as of 5.11-RC2.  
> 
> Ok so let's focus on these.
> 
> > Do you have any suggestions on what I can do to remedy this?  I am
> > willing to try and test.  
> 
> Glad to hear that.
> 
> Can you share the NAND controller DT node you are using?
> 
> Also, can you please add a few printk's like below and give me the
> output?

Will you have the time to check these soon? I am ready to help and
would like to fix it asap.

Cheers,
Miquèl
Adam Ford Jan. 12, 2021, 4:01 p.m. UTC | #4
On Tue, Jan 12, 2021 at 8:35 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
>
> Hi Adam,
>
> Miquel Raynal <miquel.raynal@bootlin.com> wrote on Mon, 11 Jan 2021
> 11:20:27 +0100:
>
> > Hi Adam,
> >
> > Adam Ford <aford173@gmail.com> wrote on Sat, 9 Jan 2021 08:46:44 -0600:
> >
> > > On Tue, Sep 29, 2020 at 6:09 PM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> > > >
> > > > The NAND BCH control structure has nothing to do outside of this
> > > > driver, all users of the nand_bch_init/free() functions just save it
> > > > to chip->ecc.priv so do it in this driver directly and return a
> > > > regular error code instead.
> > > >
> > > > Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > > > ---
> > >
> > > Starting with this commit:  3c0fe36abebe, the kernel either doesn't
> > > build or returns errors on some omap2plus devices with the following
> > > error:
> > >
> > >     nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > >     nand: Micron MT29F4G16ABBDA3W
> > >     nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> > >     nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > >     Invalid ECC layout
> > >     omap2-nand 30000000.nand: unable to use BCH library
> > >     omap2-nand: probe of 30000000.nand failed with error -22
> > >     8<--- cut here ---
> > >
> > > There are few commits using git bisect that have build errors, so it
> > > wasn't possible for me to determine the exact commit that broke the
> > > ECC.  If the build failed, I marked it as 'bad' to git bisect.
> >
> > I am sorry to hear that, I regularly rebase with a make run between each
> > pick and push my branches to a 0-day repository to have robots check
> > for such errors, but sometimes I fail.
> >
> > > Newer commits have remedied the build issue, but the Invalid ECC
> > > layout error still exists as of 5.11-RC2.
> >
> > Ok so let's focus on these.
> >
> > > Do you have any suggestions on what I can do to remedy this?  I am
> > > willing to try and test.
> >
> > Glad to hear that.
> >
> > Can you share the NAND controller DT node you are using?
> >
> > Also, can you please add a few printk's like below and give me the
> > output?
>
> Will you have the time to check these soon? I am ready to help and
> would like to fix it asap.

Sorry for the delay, I have to split my time with 3 different
projects.  I am hoping to get you data later today.

adam
>
> Cheers,
> Miquèl
Adam Ford Jan. 12, 2021, 5:20 p.m. UTC | #5
On Tue, Jan 12, 2021 at 10:01 AM Adam Ford <aford173@gmail.com> wrote:
>
> On Tue, Jan 12, 2021 at 8:35 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> >
> > Hi Adam,
> >
> > Miquel Raynal <miquel.raynal@bootlin.com> wrote on Mon, 11 Jan 2021
> > 11:20:27 +0100:
> >
> > > Hi Adam,
> > >
> > > Adam Ford <aford173@gmail.com> wrote on Sat, 9 Jan 2021 08:46:44 -0600:
> > >
> > > > On Tue, Sep 29, 2020 at 6:09 PM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> > > > >
> > > > > The NAND BCH control structure has nothing to do outside of this
> > > > > driver, all users of the nand_bch_init/free() functions just save it
> > > > > to chip->ecc.priv so do it in this driver directly and return a
> > > > > regular error code instead.
> > > > >
> > > > > Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > > > > ---
> > > >
> > > > Starting with this commit:  3c0fe36abebe, the kernel either doesn't
> > > > build or returns errors on some omap2plus devices with the following
> > > > error:
> > > >
> > > >     nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > >     nand: Micron MT29F4G16ABBDA3W
> > > >     nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> > > >     nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > >     Invalid ECC layout
> > > >     omap2-nand 30000000.nand: unable to use BCH library
> > > >     omap2-nand: probe of 30000000.nand failed with error -22
> > > >     8<--- cut here ---
> > > >
> > > > There are few commits using git bisect that have build errors, so it
> > > > wasn't possible for me to determine the exact commit that broke the
> > > > ECC.  If the build failed, I marked it as 'bad' to git bisect.
> > >
> > > I am sorry to hear that, I regularly rebase with a make run between each
> > > pick and push my branches to a 0-day repository to have robots check
> > > for such errors, but sometimes I fail.
> > >
> > > > Newer commits have remedied the build issue, but the Invalid ECC
> > > > layout error still exists as of 5.11-RC2.
> > >
> > > Ok so let's focus on these.
> > >
> > > > Do you have any suggestions on what I can do to remedy this?  I am
> > > > willing to try and test.
> > >
> > > Glad to hear that.
> > >
> > > Can you share the NAND controller DT node you are using?
> > >
> > > Also, can you please add a few printk's like below and give me the
> > > output?
> >
> > Will you have the time to check these soon? I am ready to help and
> > would like to fix it asap.
>
> Sorry for the delay, I have to split my time with 3 different
> projects.  I am hoping to get you data later today.
>
Miquel,

Here is the dump from my boot sequence:

[    2.629089] omap2-nand 30000000.nand: GPIO lookup for consumer rb
[    2.635253] omap2-nand 30000000.nand: using device tree for GPIO lookup
[    2.642150] of_get_named_gpiod_flags: parsed 'rb-gpios' property of node '/o)
[    2.653900] gpio gpiochip6: Persistence not supported for GPIO 0
[    2.660339] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
[    2.666900] nand: Micron MT29F4G16ABBDA3W
[    2.670959] nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB si4
[    2.678710] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
[    2.684234] writesize 2048, step_size 512, nsteps 4
[    2.689300] strength 8, step size 512, code_size 13
[    2.696807] count eccbytes 0
[    2.699707] Invalid ECC layout
[    2.702789] omap2-nand 30000000.nand: unable to use BCH library
[    2.709014] omap2-nand: probe of 30000000.nand failed with error -22
[    2.722656] 8<--- cut here ---

Let me know if/what else you want me to try.

adam

> adam
> >
> > Cheers,
> > Miquèl
Miquel Raynal Jan. 14, 2021, 3:42 p.m. UTC | #6
Hi Adam,

Adam Ford <aford173@gmail.com> wrote on Tue, 12 Jan 2021 11:20:24 -0600:

> On Tue, Jan 12, 2021 at 10:01 AM Adam Ford <aford173@gmail.com> wrote:
> >
> > On Tue, Jan 12, 2021 at 8:35 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:  
> > >
> > > Hi Adam,
> > >
> > > Miquel Raynal <miquel.raynal@bootlin.com> wrote on Mon, 11 Jan 2021
> > > 11:20:27 +0100:
> > >  
> > > > Hi Adam,
> > > >
> > > > Adam Ford <aford173@gmail.com> wrote on Sat, 9 Jan 2021 08:46:44 -0600:
> > > >  
> > > > > On Tue, Sep 29, 2020 at 6:09 PM Miquel Raynal <miquel.raynal@bootlin.com> wrote:  
> > > > > >
> > > > > > The NAND BCH control structure has nothing to do outside of this
> > > > > > driver, all users of the nand_bch_init/free() functions just save it
> > > > > > to chip->ecc.priv so do it in this driver directly and return a
> > > > > > regular error code instead.
> > > > > >
> > > > > > Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > > > > > ---  
> > > > >
> > > > > Starting with this commit:  3c0fe36abebe, the kernel either doesn't
> > > > > build or returns errors on some omap2plus devices with the following
> > > > > error:
> > > > >
> > > > >     nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > > >     nand: Micron MT29F4G16ABBDA3W
> > > > >     nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> > > > >     nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > > >     Invalid ECC layout
> > > > >     omap2-nand 30000000.nand: unable to use BCH library
> > > > >     omap2-nand: probe of 30000000.nand failed with error -22
> > > > >     8<--- cut here ---
> > > > >
> > > > > There are few commits using git bisect that have build errors, so it
> > > > > wasn't possible for me to determine the exact commit that broke the
> > > > > ECC.  If the build failed, I marked it as 'bad' to git bisect.  
> > > >
> > > > I am sorry to hear that, I regularly rebase with a make run between each
> > > > pick and push my branches to a 0-day repository to have robots check
> > > > for such errors, but sometimes I fail.
> > > >  
> > > > > Newer commits have remedied the build issue, but the Invalid ECC
> > > > > layout error still exists as of 5.11-RC2.  
> > > >
> > > > Ok so let's focus on these.
> > > >  
> > > > > Do you have any suggestions on what I can do to remedy this?  I am
> > > > > willing to try and test.  
> > > >
> > > > Glad to hear that.
> > > >
> > > > Can you share the NAND controller DT node you are using?
> > > >
> > > > Also, can you please add a few printk's like below and give me the
> > > > output?  
> > >
> > > Will you have the time to check these soon? I am ready to help and
> > > would like to fix it asap.  
> >
> > Sorry for the delay, I have to split my time with 3 different
> > projects.  I am hoping to get you data later today.
> >  
> Miquel,
> 
> Here is the dump from my boot sequence:
> 
> [    2.629089] omap2-nand 30000000.nand: GPIO lookup for consumer rb
> [    2.635253] omap2-nand 30000000.nand: using device tree for GPIO lookup
> [    2.642150] of_get_named_gpiod_flags: parsed 'rb-gpios' property of node '/o)
> [    2.653900] gpio gpiochip6: Persistence not supported for GPIO 0
> [    2.660339] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> [    2.666900] nand: Micron MT29F4G16ABBDA3W
> [    2.670959] nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB si4
> [    2.678710] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> [    2.684234] writesize 2048, step_size 512, nsteps 4
> [    2.689300] strength 8, step size 512, code_size 13

Until here, everything looks fine.

> [    2.696807] count eccbytes 0

This is the cause of the error, the MTD OOB layout reports not ECC byte.

Can you please check that we effectively call the large page helpers
(in particular nand_ooblayout_ecc_lp()) . I bet this function returns
-ERANGE on its first call, which reduces the eccbytes variable above to
zero.

What is strange is that, the only reason this would happen (to my eyes)
is nand->ecc.ctx.total being 0. Can you please check its effective
value?

I do not see the immediate reason because nand->ecc.ctx.total is set to
nsteps (4) * code_size (13) right before calling
mtd_ooblayout_count_eccbytes().

Can you please verify my sayings and perhaps tackle the root cause of
this issue? Please do not hesitate to ask questions, I'll do my best to
help because this is a critical section that is not only breaking
OMAP boards, unfortunately.

Thanks,
Miquèl
Adam Ford Jan. 15, 2021, 12:23 p.m. UTC | #7
On Thu, Jan 14, 2021 at 9:42 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
>
> Hi Adam,
>
> Adam Ford <aford173@gmail.com> wrote on Tue, 12 Jan 2021 11:20:24 -0600:
>
> > On Tue, Jan 12, 2021 at 10:01 AM Adam Ford <aford173@gmail.com> wrote:
> > >
> > > On Tue, Jan 12, 2021 at 8:35 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> > > >
> > > > Hi Adam,
> > > >
> > > > Miquel Raynal <miquel.raynal@bootlin.com> wrote on Mon, 11 Jan 2021
> > > > 11:20:27 +0100:
> > > >
> > > > > Hi Adam,
> > > > >
> > > > > Adam Ford <aford173@gmail.com> wrote on Sat, 9 Jan 2021 08:46:44 -0600:
> > > > >
> > > > > > On Tue, Sep 29, 2020 at 6:09 PM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> > > > > > >
> > > > > > > The NAND BCH control structure has nothing to do outside of this
> > > > > > > driver, all users of the nand_bch_init/free() functions just save it
> > > > > > > to chip->ecc.priv so do it in this driver directly and return a
> > > > > > > regular error code instead.
> > > > > > >
> > > > > > > Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > > > > > > ---
> > > > > >
> > > > > > Starting with this commit:  3c0fe36abebe, the kernel either doesn't
> > > > > > build or returns errors on some omap2plus devices with the following
> > > > > > error:
> > > > > >
> > > > > >     nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > > > >     nand: Micron MT29F4G16ABBDA3W
> > > > > >     nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> > > > > >     nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > > > >     Invalid ECC layout
> > > > > >     omap2-nand 30000000.nand: unable to use BCH library
> > > > > >     omap2-nand: probe of 30000000.nand failed with error -22
> > > > > >     8<--- cut here ---
> > > > > >
> > > > > > There are few commits using git bisect that have build errors, so it
> > > > > > wasn't possible for me to determine the exact commit that broke the
> > > > > > ECC.  If the build failed, I marked it as 'bad' to git bisect.
> > > > >
> > > > > I am sorry to hear that, I regularly rebase with a make run between each
> > > > > pick and push my branches to a 0-day repository to have robots check
> > > > > for such errors, but sometimes I fail.
> > > > >
> > > > > > Newer commits have remedied the build issue, but the Invalid ECC
> > > > > > layout error still exists as of 5.11-RC2.
> > > > >
> > > > > Ok so let's focus on these.
> > > > >
> > > > > > Do you have any suggestions on what I can do to remedy this?  I am
> > > > > > willing to try and test.
> > > > >
> > > > > Glad to hear that.
> > > > >
> > > > > Can you share the NAND controller DT node you are using?
> > > > >
> > > > > Also, can you please add a few printk's like below and give me the
> > > > > output?
> > > >
> > > > Will you have the time to check these soon? I am ready to help and
> > > > would like to fix it asap.
> > >
> > > Sorry for the delay, I have to split my time with 3 different
> > > projects.  I am hoping to get you data later today.
> > >
> > Miquel,
> >
> > Here is the dump from my boot sequence:
> >
> > [    2.629089] omap2-nand 30000000.nand: GPIO lookup for consumer rb
> > [    2.635253] omap2-nand 30000000.nand: using device tree for GPIO lookup
> > [    2.642150] of_get_named_gpiod_flags: parsed 'rb-gpios' property of node '/o)
> > [    2.653900] gpio gpiochip6: Persistence not supported for GPIO 0
> > [    2.660339] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > [    2.666900] nand: Micron MT29F4G16ABBDA3W
> > [    2.670959] nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB si4
> > [    2.678710] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > [    2.684234] writesize 2048, step_size 512, nsteps 4
> > [    2.689300] strength 8, step size 512, code_size 13
>
> Until here, everything looks fine.
>
> > [    2.696807] count eccbytes 0
>
> This is the cause of the error, the MTD OOB layout reports not ECC byte.
>
> Can you please check that we effectively call the large page helpers
> (in particular nand_ooblayout_ecc_lp()) . I bet this function returns
> -ERANGE on its first call, which reduces the eccbytes variable above to
> zero.

I will do what I can, but I am out of my element with this mtd and nand stuff.

I added a printk to the beginning of nand_ooblayout_ecc_lp() and it's
not appearing, so I wonder if nand_ooblayout_ecc_lp() is not getting
called.

I also added some printk's to the drivers/mtd/nand/raw/omap2.c to see
what's being defined for section and ecc.steps.

[    2.621978] mtdoops: mtd device (mtddev=name/number) must be supplied
[    2.629699] omap2-nand 30000000.nand: GPIO lookup for consumer rb
[    2.635864] omap2-nand 30000000.nand: using device tree for GPIO lookup
[    2.642761] of_get_named_gpiod_flags: parsed 'rb-gpios' property of
node '/ocp@68000000/gpmc@6e000000/nand@0,0[0]' - status (0)
[    2.654510] gpio gpiochip6: Persistence not supported for GPIO 0
[    2.660949] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
[    2.667510] nand: Micron MT29F4G16ABBDA3W
[    2.671569] nand: 512 MiB, SLC, erase size: 128 KiB, page size:
2048, OOB size: 64
[    2.679321] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
[    2.684844] writesize 2048, step_size 512, nsteps 4
[    2.689910] strength 8, step size 512, code_size 13
[    2.694824] nand->ecc.ctx.total = 52
[    2.700988] omap_sw_ooblayout_ecc section 0, chip->ecc.steps 0

omap_sw_ooblayout_ecc() returns -ERANGE if section => chip->ecc.steps
which appears to be the case here.

Is it safe to assume that ecc.steps should be 4 if nsteps is 4?

[    2.707031] count eccbytes 0
[    2.709930] omap_sw_ooblayout_ecc section 0, chip->ecc.steps 0
[    2.715820] Invalid ECC layout
[    2.719055] omap2-nand 30000000.nand: unable to use BCH library
[    2.725067] omap2-nand: probe of 30000000.nand failed with error -22
[    2.738983] 8<--- cut here ---

>
> What is strange is that, the only reason this would happen (to my eyes)
> is nand->ecc.ctx.total being 0. Can you please check its effective
> value?

[    2.694824] nand->ecc.ctx.total = 52

>
> I do not see the immediate reason because nand->ecc.ctx.total is set to
> nsteps (4) * code_size (13) right before calling
> mtd_ooblayout_count_eccbytes().
>
> Can you please verify my sayings and perhaps tackle the root cause of
> this issue? Please do not hesitate to ask questions, I'll do my best to
> help because this is a critical section that is not only breaking
> OMAP boards, unfortunately.
>
> Thanks,
> Miquèl
Adam Ford Jan. 15, 2021, 4:06 p.m. UTC | #8
On Fri, Jan 15, 2021 at 6:23 AM Adam Ford <aford173@gmail.com> wrote:
>
> On Thu, Jan 14, 2021 at 9:42 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> >
> > Hi Adam,
> >
> > Adam Ford <aford173@gmail.com> wrote on Tue, 12 Jan 2021 11:20:24 -0600:
> >
> > > On Tue, Jan 12, 2021 at 10:01 AM Adam Ford <aford173@gmail.com> wrote:
> > > >
> > > > On Tue, Jan 12, 2021 at 8:35 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> > > > >
> > > > > Hi Adam,
> > > > >
> > > > > Miquel Raynal <miquel.raynal@bootlin.com> wrote on Mon, 11 Jan 2021
> > > > > 11:20:27 +0100:
> > > > >
> > > > > > Hi Adam,
> > > > > >
> > > > > > Adam Ford <aford173@gmail.com> wrote on Sat, 9 Jan 2021 08:46:44 -0600:
> > > > > >
> > > > > > > On Tue, Sep 29, 2020 at 6:09 PM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> > > > > > > >
> > > > > > > > The NAND BCH control structure has nothing to do outside of this
> > > > > > > > driver, all users of the nand_bch_init/free() functions just save it
> > > > > > > > to chip->ecc.priv so do it in this driver directly and return a
> > > > > > > > regular error code instead.
> > > > > > > >
> > > > > > > > Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > > > > > > > ---
> > > > > > >
> > > > > > > Starting with this commit:  3c0fe36abebe, the kernel either doesn't
> > > > > > > build or returns errors on some omap2plus devices with the following
> > > > > > > error:
> > > > > > >
> > > > > > >     nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > > > > >     nand: Micron MT29F4G16ABBDA3W
> > > > > > >     nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> > > > > > >     nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > > > > >     Invalid ECC layout
> > > > > > >     omap2-nand 30000000.nand: unable to use BCH library
> > > > > > >     omap2-nand: probe of 30000000.nand failed with error -22
> > > > > > >     8<--- cut here ---
> > > > > > >
> > > > > > > There are few commits using git bisect that have build errors, so it
> > > > > > > wasn't possible for me to determine the exact commit that broke the
> > > > > > > ECC.  If the build failed, I marked it as 'bad' to git bisect.
> > > > > >
> > > > > > I am sorry to hear that, I regularly rebase with a make run between each
> > > > > > pick and push my branches to a 0-day repository to have robots check
> > > > > > for such errors, but sometimes I fail.
> > > > > >
> > > > > > > Newer commits have remedied the build issue, but the Invalid ECC
> > > > > > > layout error still exists as of 5.11-RC2.
> > > > > >
> > > > > > Ok so let's focus on these.
> > > > > >
> > > > > > > Do you have any suggestions on what I can do to remedy this?  I am
> > > > > > > willing to try and test.
> > > > > >
> > > > > > Glad to hear that.
> > > > > >
> > > > > > Can you share the NAND controller DT node you are using?
> > > > > >
> > > > > > Also, can you please add a few printk's like below and give me the
> > > > > > output?
> > > > >
> > > > > Will you have the time to check these soon? I am ready to help and
> > > > > would like to fix it asap.
> > > >
> > > > Sorry for the delay, I have to split my time with 3 different
> > > > projects.  I am hoping to get you data later today.
> > > >
> > > Miquel,
> > >
> > > Here is the dump from my boot sequence:
> > >
> > > [    2.629089] omap2-nand 30000000.nand: GPIO lookup for consumer rb
> > > [    2.635253] omap2-nand 30000000.nand: using device tree for GPIO lookup
> > > [    2.642150] of_get_named_gpiod_flags: parsed 'rb-gpios' property of node '/o)
> > > [    2.653900] gpio gpiochip6: Persistence not supported for GPIO 0
> > > [    2.660339] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > [    2.666900] nand: Micron MT29F4G16ABBDA3W
> > > [    2.670959] nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB si4
> > > [    2.678710] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > [    2.684234] writesize 2048, step_size 512, nsteps 4
> > > [    2.689300] strength 8, step size 512, code_size 13
> >
> > Until here, everything looks fine.
> >
> > > [    2.696807] count eccbytes 0
> >
> > This is the cause of the error, the MTD OOB layout reports not ECC byte.
> >
> > Can you please check that we effectively call the large page helpers
> > (in particular nand_ooblayout_ecc_lp()) . I bet this function returns
> > -ERANGE on its first call, which reduces the eccbytes variable above to
> > zero.
>
> I will do what I can, but I am out of my element with this mtd and nand stuff.
>
> I added a printk to the beginning of nand_ooblayout_ecc_lp() and it's
> not appearing, so I wonder if nand_ooblayout_ecc_lp() is not getting
> called.
>
> I also added some printk's to the drivers/mtd/nand/raw/omap2.c to see
> what's being defined for section and ecc.steps.
>
> [    2.621978] mtdoops: mtd device (mtddev=name/number) must be supplied
> [    2.629699] omap2-nand 30000000.nand: GPIO lookup for consumer rb
> [    2.635864] omap2-nand 30000000.nand: using device tree for GPIO lookup
> [    2.642761] of_get_named_gpiod_flags: parsed 'rb-gpios' property of
> node '/ocp@68000000/gpmc@6e000000/nand@0,0[0]' - status (0)
> [    2.654510] gpio gpiochip6: Persistence not supported for GPIO 0
> [    2.660949] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> [    2.667510] nand: Micron MT29F4G16ABBDA3W
> [    2.671569] nand: 512 MiB, SLC, erase size: 128 KiB, page size:
> 2048, OOB size: 64
> [    2.679321] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> [    2.684844] writesize 2048, step_size 512, nsteps 4
> [    2.689910] strength 8, step size 512, code_size 13
> [    2.694824] nand->ecc.ctx.total = 52
> [    2.700988] omap_sw_ooblayout_ecc section 0, chip->ecc.steps 0
>
> omap_sw_ooblayout_ecc() returns -ERANGE if section => chip->ecc.steps
> which appears to be the case here.
>
> Is it safe to assume that ecc.steps should be 4 if nsteps is 4?
>
> [    2.707031] count eccbytes 0
> [    2.709930] omap_sw_ooblayout_ecc section 0, chip->ecc.steps 0
> [    2.715820] Invalid ECC layout
> [    2.719055] omap2-nand 30000000.nand: unable to use BCH library
> [    2.725067] omap2-nand: probe of 30000000.nand failed with error -22
> [    2.738983] 8<--- cut here ---
>
> >
> > What is strange is that, the only reason this would happen (to my eyes)
> > is nand->ecc.ctx.total being 0. Can you please check its effective
> > value?
>
> [    2.694824] nand->ecc.ctx.total = 52
>
> >
> > I do not see the immediate reason because nand->ecc.ctx.total is set to
> > nsteps (4) * code_size (13) right before calling
> > mtd_ooblayout_count_eccbytes().
> >
> > Can you please verify my sayings and perhaps tackle the root cause of
> > this issue? Please do not hesitate to ask questions, I'll do my best to
> > help because this is a critical section that is not only breaking
> > OMAP boards, unfortunately.

I appear to have the NAND flash working with the following patch:

@@ -247,11 +253,21 @@ int nand_ecc_sw_bch_init_ctx(struct nand_device *nand)
        nand->ecc.ctx.priv = engine_conf;
        nand->ecc.ctx.total = nsteps * code_size;

+       struct nand_chip *chip = mtd_to_nand(mtd);
+       chip->ecc.steps = nsteps;
+       chip->ecc.size =  conf->step_size;

I am guessing it's not exactly what you want, but appears that the
ecc.steps and ecc.size wasn't getting propagated to the mtd layer, so
when omap_sw_ooblayout_ecc() was called, those fields were blank.

If you have a suggestion on how you want this implemented, I can push a patch.

adam



> >
> > Thanks,
> > Miquèl
Miquel Raynal Jan. 15, 2021, 4:17 p.m. UTC | #9
Hi Adam,

Adam Ford <aford173@gmail.com> wrote on Fri, 15 Jan 2021 10:06:14 -0600:

> On Fri, Jan 15, 2021 at 6:23 AM Adam Ford <aford173@gmail.com> wrote:
> >
> > On Thu, Jan 14, 2021 at 9:42 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:  
> > >
> > > Hi Adam,
> > >
> > > Adam Ford <aford173@gmail.com> wrote on Tue, 12 Jan 2021 11:20:24 -0600:
> > >  
> > > > On Tue, Jan 12, 2021 at 10:01 AM Adam Ford <aford173@gmail.com> wrote:  
> > > > >
> > > > > On Tue, Jan 12, 2021 at 8:35 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:  
> > > > > >
> > > > > > Hi Adam,
> > > > > >
> > > > > > Miquel Raynal <miquel.raynal@bootlin.com> wrote on Mon, 11 Jan 2021
> > > > > > 11:20:27 +0100:
> > > > > >  
> > > > > > > Hi Adam,
> > > > > > >
> > > > > > > Adam Ford <aford173@gmail.com> wrote on Sat, 9 Jan 2021 08:46:44 -0600:
> > > > > > >  
> > > > > > > > On Tue, Sep 29, 2020 at 6:09 PM Miquel Raynal <miquel.raynal@bootlin.com> wrote:  
> > > > > > > > >
> > > > > > > > > The NAND BCH control structure has nothing to do outside of this
> > > > > > > > > driver, all users of the nand_bch_init/free() functions just save it
> > > > > > > > > to chip->ecc.priv so do it in this driver directly and return a
> > > > > > > > > regular error code instead.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > > > > > > > > ---  
> > > > > > > >
> > > > > > > > Starting with this commit:  3c0fe36abebe, the kernel either doesn't
> > > > > > > > build or returns errors on some omap2plus devices with the following
> > > > > > > > error:
> > > > > > > >
> > > > > > > >     nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > > > > > >     nand: Micron MT29F4G16ABBDA3W
> > > > > > > >     nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> > > > > > > >     nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > > > > > >     Invalid ECC layout
> > > > > > > >     omap2-nand 30000000.nand: unable to use BCH library
> > > > > > > >     omap2-nand: probe of 30000000.nand failed with error -22
> > > > > > > >     8<--- cut here ---
> > > > > > > >
> > > > > > > > There are few commits using git bisect that have build errors, so it
> > > > > > > > wasn't possible for me to determine the exact commit that broke the
> > > > > > > > ECC.  If the build failed, I marked it as 'bad' to git bisect.  
> > > > > > >
> > > > > > > I am sorry to hear that, I regularly rebase with a make run between each
> > > > > > > pick and push my branches to a 0-day repository to have robots check
> > > > > > > for such errors, but sometimes I fail.
> > > > > > >  
> > > > > > > > Newer commits have remedied the build issue, but the Invalid ECC
> > > > > > > > layout error still exists as of 5.11-RC2.  
> > > > > > >
> > > > > > > Ok so let's focus on these.
> > > > > > >  
> > > > > > > > Do you have any suggestions on what I can do to remedy this?  I am
> > > > > > > > willing to try and test.  
> > > > > > >
> > > > > > > Glad to hear that.
> > > > > > >
> > > > > > > Can you share the NAND controller DT node you are using?
> > > > > > >
> > > > > > > Also, can you please add a few printk's like below and give me the
> > > > > > > output?  
> > > > > >
> > > > > > Will you have the time to check these soon? I am ready to help and
> > > > > > would like to fix it asap.  
> > > > >
> > > > > Sorry for the delay, I have to split my time with 3 different
> > > > > projects.  I am hoping to get you data later today.
> > > > >  
> > > > Miquel,
> > > >
> > > > Here is the dump from my boot sequence:
> > > >
> > > > [    2.629089] omap2-nand 30000000.nand: GPIO lookup for consumer rb
> > > > [    2.635253] omap2-nand 30000000.nand: using device tree for GPIO lookup
> > > > [    2.642150] of_get_named_gpiod_flags: parsed 'rb-gpios' property of node '/o)
> > > > [    2.653900] gpio gpiochip6: Persistence not supported for GPIO 0
> > > > [    2.660339] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > > [    2.666900] nand: Micron MT29F4G16ABBDA3W
> > > > [    2.670959] nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB si4
> > > > [    2.678710] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > > [    2.684234] writesize 2048, step_size 512, nsteps 4
> > > > [    2.689300] strength 8, step size 512, code_size 13  
> > >
> > > Until here, everything looks fine.
> > >  
> > > > [    2.696807] count eccbytes 0  
> > >
> > > This is the cause of the error, the MTD OOB layout reports not ECC byte.
> > >
> > > Can you please check that we effectively call the large page helpers
> > > (in particular nand_ooblayout_ecc_lp()) . I bet this function returns
> > > -ERANGE on its first call, which reduces the eccbytes variable above to
> > > zero.  
> >
> > I will do what I can, but I am out of my element with this mtd and nand stuff.
> >
> > I added a printk to the beginning of nand_ooblayout_ecc_lp() and it's
> > not appearing, so I wonder if nand_ooblayout_ecc_lp() is not getting
> > called.
> >
> > I also added some printk's to the drivers/mtd/nand/raw/omap2.c to see
> > what's being defined for section and ecc.steps.
> >
> > [    2.621978] mtdoops: mtd device (mtddev=name/number) must be supplied
> > [    2.629699] omap2-nand 30000000.nand: GPIO lookup for consumer rb
> > [    2.635864] omap2-nand 30000000.nand: using device tree for GPIO lookup
> > [    2.642761] of_get_named_gpiod_flags: parsed 'rb-gpios' property of
> > node '/ocp@68000000/gpmc@6e000000/nand@0,0[0]' - status (0)
> > [    2.654510] gpio gpiochip6: Persistence not supported for GPIO 0
> > [    2.660949] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > [    2.667510] nand: Micron MT29F4G16ABBDA3W
> > [    2.671569] nand: 512 MiB, SLC, erase size: 128 KiB, page size:
> > 2048, OOB size: 64
> > [    2.679321] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > [    2.684844] writesize 2048, step_size 512, nsteps 4
> > [    2.689910] strength 8, step size 512, code_size 13
> > [    2.694824] nand->ecc.ctx.total = 52
> > [    2.700988] omap_sw_ooblayout_ecc section 0, chip->ecc.steps 0
> >
> > omap_sw_ooblayout_ecc() returns -ERANGE if section => chip->ecc.steps
> > which appears to be the case here.
> >
> > Is it safe to assume that ecc.steps should be 4 if nsteps is 4?
> >
> > [    2.707031] count eccbytes 0
> > [    2.709930] omap_sw_ooblayout_ecc section 0, chip->ecc.steps 0
> > [    2.715820] Invalid ECC layout
> > [    2.719055] omap2-nand 30000000.nand: unable to use BCH library
> > [    2.725067] omap2-nand: probe of 30000000.nand failed with error -22
> > [    2.738983] 8<--- cut here ---
> >  
> > >
> > > What is strange is that, the only reason this would happen (to my eyes)
> > > is nand->ecc.ctx.total being 0. Can you please check its effective
> > > value?  
> >
> > [    2.694824] nand->ecc.ctx.total = 52
> >  
> > >
> > > I do not see the immediate reason because nand->ecc.ctx.total is set to
> > > nsteps (4) * code_size (13) right before calling
> > > mtd_ooblayout_count_eccbytes().
> > >
> > > Can you please verify my sayings and perhaps tackle the root cause of
> > > this issue? Please do not hesitate to ask questions, I'll do my best to
> > > help because this is a critical section that is not only breaking
> > > OMAP boards, unfortunately.  
> 
> I appear to have the NAND flash working with the following patch:
> 
> @@ -247,11 +253,21 @@ int nand_ecc_sw_bch_init_ctx(struct nand_device *nand)
>         nand->ecc.ctx.priv = engine_conf;
>         nand->ecc.ctx.total = nsteps * code_size;
> 
> +       struct nand_chip *chip = mtd_to_nand(mtd);
> +       chip->ecc.steps = nsteps;
> +       chip->ecc.size =  conf->step_size;
> 
> I am guessing it's not exactly what you want, but appears that the
> ecc.steps and ecc.size wasn't getting propagated to the mtd layer, so
> when omap_sw_ooblayout_ecc() was called, those fields were blank.
> 
> If you have a suggestion on how you want this implemented, I can push a patch.

Great, I'll check more deeply why is this happening and get back to you
early next week.

Thanks,
Miquèl
Adam Ford Jan. 15, 2021, 4:28 p.m. UTC | #10
On Fri, Jan 15, 2021 at 10:17 AM Miquel Raynal
<miquel.raynal@bootlin.com> wrote:
>
> Hi Adam,
>
> Adam Ford <aford173@gmail.com> wrote on Fri, 15 Jan 2021 10:06:14 -0600:
>
> > On Fri, Jan 15, 2021 at 6:23 AM Adam Ford <aford173@gmail.com> wrote:
> > >
> > > On Thu, Jan 14, 2021 at 9:42 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> > > >
> > > > Hi Adam,
> > > >
> > > > Adam Ford <aford173@gmail.com> wrote on Tue, 12 Jan 2021 11:20:24 -0600:
> > > >
> > > > > On Tue, Jan 12, 2021 at 10:01 AM Adam Ford <aford173@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, Jan 12, 2021 at 8:35 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> > > > > > >
> > > > > > > Hi Adam,
> > > > > > >
> > > > > > > Miquel Raynal <miquel.raynal@bootlin.com> wrote on Mon, 11 Jan 2021
> > > > > > > 11:20:27 +0100:
> > > > > > >
> > > > > > > > Hi Adam,
> > > > > > > >
> > > > > > > > Adam Ford <aford173@gmail.com> wrote on Sat, 9 Jan 2021 08:46:44 -0600:
> > > > > > > >
> > > > > > > > > On Tue, Sep 29, 2020 at 6:09 PM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> > > > > > > > > >
> > > > > > > > > > The NAND BCH control structure has nothing to do outside of this
> > > > > > > > > > driver, all users of the nand_bch_init/free() functions just save it
> > > > > > > > > > to chip->ecc.priv so do it in this driver directly and return a
> > > > > > > > > > regular error code instead.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > > > > > > > > > ---
> > > > > > > > >
> > > > > > > > > Starting with this commit:  3c0fe36abebe, the kernel either doesn't
> > > > > > > > > build or returns errors on some omap2plus devices with the following
> > > > > > > > > error:
> > > > > > > > >
> > > > > > > > >     nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > > > > > > >     nand: Micron MT29F4G16ABBDA3W
> > > > > > > > >     nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> > > > > > > > >     nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > > > > > > >     Invalid ECC layout
> > > > > > > > >     omap2-nand 30000000.nand: unable to use BCH library
> > > > > > > > >     omap2-nand: probe of 30000000.nand failed with error -22
> > > > > > > > >     8<--- cut here ---
> > > > > > > > >
> > > > > > > > > There are few commits using git bisect that have build errors, so it
> > > > > > > > > wasn't possible for me to determine the exact commit that broke the
> > > > > > > > > ECC.  If the build failed, I marked it as 'bad' to git bisect.
> > > > > > > >
> > > > > > > > I am sorry to hear that, I regularly rebase with a make run between each
> > > > > > > > pick and push my branches to a 0-day repository to have robots check
> > > > > > > > for such errors, but sometimes I fail.
> > > > > > > >
> > > > > > > > > Newer commits have remedied the build issue, but the Invalid ECC
> > > > > > > > > layout error still exists as of 5.11-RC2.
> > > > > > > >
> > > > > > > > Ok so let's focus on these.
> > > > > > > >
> > > > > > > > > Do you have any suggestions on what I can do to remedy this?  I am
> > > > > > > > > willing to try and test.
> > > > > > > >
> > > > > > > > Glad to hear that.
> > > > > > > >
> > > > > > > > Can you share the NAND controller DT node you are using?
> > > > > > > >
> > > > > > > > Also, can you please add a few printk's like below and give me the
> > > > > > > > output?
> > > > > > >
> > > > > > > Will you have the time to check these soon? I am ready to help and
> > > > > > > would like to fix it asap.
> > > > > >
> > > > > > Sorry for the delay, I have to split my time with 3 different
> > > > > > projects.  I am hoping to get you data later today.
> > > > > >
> > > > > Miquel,
> > > > >
> > > > > Here is the dump from my boot sequence:
> > > > >
> > > > > [    2.629089] omap2-nand 30000000.nand: GPIO lookup for consumer rb
> > > > > [    2.635253] omap2-nand 30000000.nand: using device tree for GPIO lookup
> > > > > [    2.642150] of_get_named_gpiod_flags: parsed 'rb-gpios' property of node '/o)
> > > > > [    2.653900] gpio gpiochip6: Persistence not supported for GPIO 0
> > > > > [    2.660339] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > > > [    2.666900] nand: Micron MT29F4G16ABBDA3W
> > > > > [    2.670959] nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB si4
> > > > > [    2.678710] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > > > [    2.684234] writesize 2048, step_size 512, nsteps 4
> > > > > [    2.689300] strength 8, step size 512, code_size 13
> > > >
> > > > Until here, everything looks fine.
> > > >
> > > > > [    2.696807] count eccbytes 0
> > > >
> > > > This is the cause of the error, the MTD OOB layout reports not ECC byte.
> > > >
> > > > Can you please check that we effectively call the large page helpers
> > > > (in particular nand_ooblayout_ecc_lp()) . I bet this function returns
> > > > -ERANGE on its first call, which reduces the eccbytes variable above to
> > > > zero.
> > >
> > > I will do what I can, but I am out of my element with this mtd and nand stuff.
> > >
> > > I added a printk to the beginning of nand_ooblayout_ecc_lp() and it's
> > > not appearing, so I wonder if nand_ooblayout_ecc_lp() is not getting
> > > called.
> > >
> > > I also added some printk's to the drivers/mtd/nand/raw/omap2.c to see
> > > what's being defined for section and ecc.steps.
> > >
> > > [    2.621978] mtdoops: mtd device (mtddev=name/number) must be supplied
> > > [    2.629699] omap2-nand 30000000.nand: GPIO lookup for consumer rb
> > > [    2.635864] omap2-nand 30000000.nand: using device tree for GPIO lookup
> > > [    2.642761] of_get_named_gpiod_flags: parsed 'rb-gpios' property of
> > > node '/ocp@68000000/gpmc@6e000000/nand@0,0[0]' - status (0)
> > > [    2.654510] gpio gpiochip6: Persistence not supported for GPIO 0
> > > [    2.660949] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > [    2.667510] nand: Micron MT29F4G16ABBDA3W
> > > [    2.671569] nand: 512 MiB, SLC, erase size: 128 KiB, page size:
> > > 2048, OOB size: 64
> > > [    2.679321] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > [    2.684844] writesize 2048, step_size 512, nsteps 4
> > > [    2.689910] strength 8, step size 512, code_size 13
> > > [    2.694824] nand->ecc.ctx.total = 52
> > > [    2.700988] omap_sw_ooblayout_ecc section 0, chip->ecc.steps 0
> > >
> > > omap_sw_ooblayout_ecc() returns -ERANGE if section => chip->ecc.steps
> > > which appears to be the case here.
> > >
> > > Is it safe to assume that ecc.steps should be 4 if nsteps is 4?
> > >
> > > [    2.707031] count eccbytes 0
> > > [    2.709930] omap_sw_ooblayout_ecc section 0, chip->ecc.steps 0
> > > [    2.715820] Invalid ECC layout
> > > [    2.719055] omap2-nand 30000000.nand: unable to use BCH library
> > > [    2.725067] omap2-nand: probe of 30000000.nand failed with error -22
> > > [    2.738983] 8<--- cut here ---
> > >
> > > >
> > > > What is strange is that, the only reason this would happen (to my eyes)
> > > > is nand->ecc.ctx.total being 0. Can you please check its effective
> > > > value?
> > >
> > > [    2.694824] nand->ecc.ctx.total = 52
> > >
> > > >
> > > > I do not see the immediate reason because nand->ecc.ctx.total is set to
> > > > nsteps (4) * code_size (13) right before calling
> > > > mtd_ooblayout_count_eccbytes().
> > > >
> > > > Can you please verify my sayings and perhaps tackle the root cause of
> > > > this issue? Please do not hesitate to ask questions, I'll do my best to
> > > > help because this is a critical section that is not only breaking
> > > > OMAP boards, unfortunately.
> >
> > I appear to have the NAND flash working with the following patch:
> >
> > @@ -247,11 +253,21 @@ int nand_ecc_sw_bch_init_ctx(struct nand_device *nand)
> >         nand->ecc.ctx.priv = engine_conf;
> >         nand->ecc.ctx.total = nsteps * code_size;
> >
> > +       struct nand_chip *chip = mtd_to_nand(mtd);
> > +       chip->ecc.steps = nsteps;
> > +       chip->ecc.size =  conf->step_size;
> >
> > I am guessing it's not exactly what you want, but appears that the
> > ecc.steps and ecc.size wasn't getting propagated to the mtd layer, so
> > when omap_sw_ooblayout_ecc() was called, those fields were blank.
> >
> > If you have a suggestion on how you want this implemented, I can push a patch.
>
> Great, I'll check more deeply why is this happening and get back to you
> early next week.

Sounds great.

FYI,  with my hack, this is my boot log:

[    2.622100] mtdoops: mtd device (mtddev=name/number) must be supplied
[    2.629821] omap2-nand 30000000.nand: GPIO lookup for consumer rb
[    2.635986] omap2-nand 30000000.nand: using device tree for GPIO lookup
[    2.642883] of_get_named_gpiod_flags: parsed 'rb-gpios' property of
node '/ocp@68000000/gpmc@6e000000/nand@0,0[0]' - status (0)
[    2.654632] gpio gpiochip6: Persistence not supported for GPIO 0
[    2.661071] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
[    2.667633] nand: Micron MT29F4G16ABBDA3W
[    2.671691] nand: 512 MiB, SLC, erase size: 128 KiB, page size:
2048, OOB size: 64
[    2.679443] nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
[    2.684967] writesize 2048, step_size 512, nsteps 4
[    2.690032] strength 8, step size 512, code_size 13
[    2.700775] nand->ecc.ctx.total = 52
[    2.712646] count eccbytes 52
[    2.715698] mtd->ecc_step_size = 512
[    2.719482] nand_base: steps = 4, ecc->bytes = 13, ecc->total = 52
[    2.725769] 6 cmdlinepart partitions found on MTD device omap2-nand.0
[    2.732391] Creating 6 MTD partitions on "omap2-nand.0":
...

I have not verified being able to read/write and or check data
integrity, but with my hack, it no longer causes a panic on boot.

adam

>
> Thanks,
> Miquèl
Miquel Raynal Jan. 19, 2021, 11:56 a.m. UTC | #11
Hi Adam,

Thank you very much for troubleshooting this, here is my proposal.

> > > I appear to have the NAND flash working with the following patch:
> > >
> > > @@ -247,11 +253,21 @@ int nand_ecc_sw_bch_init_ctx(struct nand_device *nand)
> > >         nand->ecc.ctx.priv = engine_conf;
> > >         nand->ecc.ctx.total = nsteps * code_size;
> > >
> > > +       struct nand_chip *chip = mtd_to_nand(mtd);
> > > +       chip->ecc.steps = nsteps;
> > > +       chip->ecc.size =  conf->step_size;

I was fearing that many boards would be affected by this issue but it
appears that the problem will only show up here because the OMAP driver
makes a strange use of the BCH library: it initializes it itself
because it only needs it for a single operation while usually, the core
is in charge of doing that. During the initialization, the OOB layout
is verified. Usually, the BCH driver is used with one of the generic OOB
layouts, while here the OMAP driver uses its own, which reads raw NAND
chip entries.

I recently moved the BCH driver to only use "generic" NAND bits, which
produced the bug because the entries derived by the layout helpers
have not been updated yet.

So using raw NAND bits in the BCH driver is not an option here.
Instead, I think the best way to address this is to export the
declaration of the BCH internal configuration structure to the OMAP
driver and use the right values, recently derived by the driver:

---8<---

Author: Miquel Raynal <miquel.raynal@bootlin.com>
Date:   Tue Jan 19 12:27:07 2021 +0100

    wip: fix omap
    
    Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>

diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c
index fbb9955f2467..2c3e65cb68f3 100644
--- a/drivers/mtd/nand/raw/omap2.c
+++ b/drivers/mtd/nand/raw/omap2.c
@@ -15,6 +15,7 @@
 #include <linux/jiffies.h>
 #include <linux/sched.h>
 #include <linux/mtd/mtd.h>
+#include <linux/mtd/nand-ecc-sw-bch.h>
 #include <linux/mtd/rawnand.h>
 #include <linux/mtd/partitions.h>
 #include <linux/omap-dma.h>
@@ -1866,18 +1867,19 @@ static const struct mtd_ooblayout_ops omap_ooblayout_ops = {
 static int omap_sw_ooblayout_ecc(struct mtd_info *mtd, int section,
                                 struct mtd_oob_region *oobregion)
 {
-       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_device *nand = mtd_to_nanddev(mtd);
+       const struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv;
        int off = BADBLOCK_MARKER_LENGTH;
 
-       if (section >= chip->ecc.steps)
+       if (section >= engine_conf->nsteps)
                return -ERANGE;
 
        /*
         * When SW correction is employed, one OMAP specific marker byte is
         * reserved after each ECC step.
         */
-       oobregion->offset = off + (section * (chip->ecc.bytes + 1));
-       oobregion->length = chip->ecc.bytes;
+       oobregion->offset = off + (section * (engine_conf->code_size + 1));
+       oobregion->length = engine_conf->code_size;
 
        return 0;
 }
@@ -1885,7 +1887,8 @@ static int omap_sw_ooblayout_ecc(struct mtd_info *mtd, int section,
 static int omap_sw_ooblayout_free(struct mtd_info *mtd, int section,
                                  struct mtd_oob_region *oobregion)
 {
-       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_device *nand = mtd_to_nanddev(mtd);
+       const struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv;
        int off = BADBLOCK_MARKER_LENGTH;
 
        if (section)
@@ -1895,7 +1898,7 @@ static int omap_sw_ooblayout_free(struct mtd_info *mtd, int section,
         * When SW correction is employed, one OMAP specific marker byte is
         * reserved after each ECC step.
         */
-       off += ((chip->ecc.bytes + 1) * chip->ecc.steps);
+       off += ((engine_conf->code_size + 1) * engine_conf->nsteps);
        if (off >= mtd->oobsize)
                return -ERANGE;
 
--->8---

Can you please try this patch and compare the values between your hack
and mine of:
- chip->ecc.bytes vs. engine_conf->code_size
- chip->ecc.steps vs. engine_conf->nsteps
The values should be the same, but I prefer to be sure.

Thanks,
Miquèl
Adam Ford Jan. 19, 2021, 2:21 p.m. UTC | #12
On Tue, Jan 19, 2021 at 5:56 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
>
> Hi Adam,
>
> Thank you very much for troubleshooting this, here is my proposal.
>
> > > > I appear to have the NAND flash working with the following patch:
> > > >
> > > > @@ -247,11 +253,21 @@ int nand_ecc_sw_bch_init_ctx(struct nand_device *nand)
> > > >         nand->ecc.ctx.priv = engine_conf;
> > > >         nand->ecc.ctx.total = nsteps * code_size;
> > > >
> > > > +       struct nand_chip *chip = mtd_to_nand(mtd);
> > > > +       chip->ecc.steps = nsteps;
> > > > +       chip->ecc.size =  conf->step_size;
>
> I was fearing that many boards would be affected by this issue but it
> appears that the problem will only show up here because the OMAP driver
> makes a strange use of the BCH library: it initializes it itself
> because it only needs it for a single operation while usually, the core
> is in charge of doing that. During the initialization, the OOB layout
> is verified. Usually, the BCH driver is used with one of the generic OOB
> layouts, while here the OMAP driver uses its own, which reads raw NAND
> chip entries.
>
> I recently moved the BCH driver to only use "generic" NAND bits, which
> produced the bug because the entries derived by the layout helpers
> have not been updated yet.
>
> So using raw NAND bits in the BCH driver is not an option here.
> Instead, I think the best way to address this is to export the
> declaration of the BCH internal configuration structure to the OMAP
> driver and use the right values, recently derived by the driver:
>
> ---8<---
>
> Author: Miquel Raynal <miquel.raynal@bootlin.com>
> Date:   Tue Jan 19 12:27:07 2021 +0100
>
>     wip: fix omap
>
>     Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
>
Thanks for fixing this.

I tested your patch, and I no longer get a Panic and the MTD device
appears to appear correctly:

mtdoops: mtd device (mtddev=name/number) must be supplied
omap2-nand 30000000.nand: GPIO lookup for consumer rb
omap2-nand 30000000.nand: using device tree for GPIO lookup
of_get_named_gpiod_flags: parsed 'rb-gpios' property of node
'/ocp@68000000/gpmc@6e000000/nand@0,0[0]' - status (0)
gpio gpiochip6: Persistence not supported for GPIO 0
nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
nand: Micron MT29F4G16ABBDA3W
nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
6 cmdlinepart partitions found on MTD device omap2-nand.0
Creating 6 MTD partitions on "omap2-nand.0":
...

When you submit a formal patch, CC me on the patch, and I'll respond
with a 'tested-by'

adam

> diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c
> index fbb9955f2467..2c3e65cb68f3 100644
> --- a/drivers/mtd/nand/raw/omap2.c
> +++ b/drivers/mtd/nand/raw/omap2.c
> @@ -15,6 +15,7 @@
>  #include <linux/jiffies.h>
>  #include <linux/sched.h>
>  #include <linux/mtd/mtd.h>
> +#include <linux/mtd/nand-ecc-sw-bch.h>
>  #include <linux/mtd/rawnand.h>
>  #include <linux/mtd/partitions.h>
>  #include <linux/omap-dma.h>
> @@ -1866,18 +1867,19 @@ static const struct mtd_ooblayout_ops omap_ooblayout_ops = {
>  static int omap_sw_ooblayout_ecc(struct mtd_info *mtd, int section,
>                                  struct mtd_oob_region *oobregion)
>  {
> -       struct nand_chip *chip = mtd_to_nand(mtd);
> +       struct nand_device *nand = mtd_to_nanddev(mtd);
> +       const struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv;
>         int off = BADBLOCK_MARKER_LENGTH;
>
> -       if (section >= chip->ecc.steps)
> +       if (section >= engine_conf->nsteps)
>                 return -ERANGE;
>
>         /*
>          * When SW correction is employed, one OMAP specific marker byte is
>          * reserved after each ECC step.
>          */
> -       oobregion->offset = off + (section * (chip->ecc.bytes + 1));
> -       oobregion->length = chip->ecc.bytes;
> +       oobregion->offset = off + (section * (engine_conf->code_size + 1));
> +       oobregion->length = engine_conf->code_size;
>
>         return 0;
>  }
> @@ -1885,7 +1887,8 @@ static int omap_sw_ooblayout_ecc(struct mtd_info *mtd, int section,
>  static int omap_sw_ooblayout_free(struct mtd_info *mtd, int section,
>                                   struct mtd_oob_region *oobregion)
>  {
> -       struct nand_chip *chip = mtd_to_nand(mtd);
> +       struct nand_device *nand = mtd_to_nanddev(mtd);
> +       const struct nand_ecc_sw_bch_conf *engine_conf = nand->ecc.ctx.priv;
>         int off = BADBLOCK_MARKER_LENGTH;
>
>         if (section)
> @@ -1895,7 +1898,7 @@ static int omap_sw_ooblayout_free(struct mtd_info *mtd, int section,
>          * When SW correction is employed, one OMAP specific marker byte is
>          * reserved after each ECC step.
>          */
> -       off += ((chip->ecc.bytes + 1) * chip->ecc.steps);
> +       off += ((engine_conf->code_size + 1) * engine_conf->nsteps);
>         if (off >= mtd->oobsize)
>                 return -ERANGE;
>
> --->8---
>
> Can you please try this patch and compare the values between your hack
> and mine of:
> - chip->ecc.bytes vs. engine_conf->code_size
> - chip->ecc.steps vs. engine_conf->nsteps
> The values should be the same, but I prefer to be sure.
>
> Thanks,
> Miquèl
Miquel Raynal Jan. 19, 2021, 2:36 p.m. UTC | #13
Hi Adam,

> > ---8<---
> >
> > Author: Miquel Raynal <miquel.raynal@bootlin.com>
> > Date:   Tue Jan 19 12:27:07 2021 +0100
> >
> >     wip: fix omap
> >
> >     Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> >  
> Thanks for fixing this.
> 
> I tested your patch, and I no longer get a Panic and the MTD device
> appears to appear correctly:
> 
> mtdoops: mtd device (mtddev=name/number) must be supplied
> omap2-nand 30000000.nand: GPIO lookup for consumer rb
> omap2-nand 30000000.nand: using device tree for GPIO lookup
> of_get_named_gpiod_flags: parsed 'rb-gpios' property of node
> '/ocp@68000000/gpmc@6e000000/nand@0,0[0]' - status (0)
> gpio gpiochip6: Persistence not supported for GPIO 0
> nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> nand: Micron MT29F4G16ABBDA3W
> nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> 6 cmdlinepart partitions found on MTD device omap2-nand.0
> Creating 6 MTD partitions on "omap2-nand.0":
> ...

Good to know. Can you just tell me if the values of
- chip->ecc.bytes vs. engine_conf->code_size
- chip->ecc.steps vs. engine_conf->nsteps
are the same in both cases (your patch and mine)? Otherwise your data
might appear corrupted somehow.

> 
> When you submit a formal patch, CC me on the patch, and I'll respond
> with a 'tested-by'

Of course, I'll also add a Reported-by.

Thanks,
Miquèl
Adam Ford Jan. 19, 2021, 3:49 p.m. UTC | #14
On Tue, Jan 19, 2021 at 8:36 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
>
> Hi Adam,
>
> > > ---8<---
> > >
> > > Author: Miquel Raynal <miquel.raynal@bootlin.com>
> > > Date:   Tue Jan 19 12:27:07 2021 +0100
> > >
> > >     wip: fix omap
> > >
> > >     Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > >
> > Thanks for fixing this.
> >
> > I tested your patch, and I no longer get a Panic and the MTD device
> > appears to appear correctly:
> >
> > mtdoops: mtd device (mtddev=name/number) must be supplied
> > omap2-nand 30000000.nand: GPIO lookup for consumer rb
> > omap2-nand 30000000.nand: using device tree for GPIO lookup
> > of_get_named_gpiod_flags: parsed 'rb-gpios' property of node
> > '/ocp@68000000/gpmc@6e000000/nand@0,0[0]' - status (0)
> > gpio gpiochip6: Persistence not supported for GPIO 0
> > nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > nand: Micron MT29F4G16ABBDA3W
> > nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> > nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > 6 cmdlinepart partitions found on MTD device omap2-nand.0
> > Creating 6 MTD partitions on "omap2-nand.0":
> > ...
>
> Good to know. Can you just tell me if the values of
> - chip->ecc.bytes vs. engine_conf->code_size
> - chip->ecc.steps vs. engine_conf->nsteps
> are the same in both cases (your patch and mine)? Otherwise your data
> might appear corrupted somehow.

I didn't fully vet my hack, beyond eliminating the Kernel panic, so I
felt more comfortable comparing the values to a stable release.
I compared the values in 5.10 to 5.11-rc4 + your patch, and the number
of steps and oobregion->length are identical between them.

5.10.5:
  chip->ecc.steps = 4
  oobregion->length = d

5.11-rc4 + patch:
  engine_conf->nsteps = 4
  oobregion->length = d

adam
>
> >
> > When you submit a formal patch, CC me on the patch, and I'll respond
> > with a 'tested-by'
>
> Of course, I'll also add a Reported-by.
>
> Thanks,
> Miquèl
Miquel Raynal Jan. 19, 2021, 3:53 p.m. UTC | #15
Hi Adam,

Adam Ford <aford173@gmail.com> wrote on Tue, 19 Jan 2021 09:49:36 -0600:

> On Tue, Jan 19, 2021 at 8:36 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
> >
> > Hi Adam,
> >  
> > > > ---8<---
> > > >
> > > > Author: Miquel Raynal <miquel.raynal@bootlin.com>
> > > > Date:   Tue Jan 19 12:27:07 2021 +0100
> > > >
> > > >     wip: fix omap
> > > >
> > > >     Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
> > > >  
> > > Thanks for fixing this.
> > >
> > > I tested your patch, and I no longer get a Panic and the MTD device
> > > appears to appear correctly:
> > >
> > > mtdoops: mtd device (mtddev=name/number) must be supplied
> > > omap2-nand 30000000.nand: GPIO lookup for consumer rb
> > > omap2-nand 30000000.nand: using device tree for GPIO lookup
> > > of_get_named_gpiod_flags: parsed 'rb-gpios' property of node
> > > '/ocp@68000000/gpmc@6e000000/nand@0,0[0]' - status (0)
> > > gpio gpiochip6: Persistence not supported for GPIO 0
> > > nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xbc
> > > nand: Micron MT29F4G16ABBDA3W
> > > nand: 512 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
> > > nand: using OMAP_ECC_BCH8_CODE_HW_DETECTION_SW
> > > 6 cmdlinepart partitions found on MTD device omap2-nand.0
> > > Creating 6 MTD partitions on "omap2-nand.0":
> > > ...  
> >
> > Good to know. Can you just tell me if the values of
> > - chip->ecc.bytes vs. engine_conf->code_size
> > - chip->ecc.steps vs. engine_conf->nsteps
> > are the same in both cases (your patch and mine)? Otherwise your data
> > might appear corrupted somehow.  
> 
> I didn't fully vet my hack, beyond eliminating the Kernel panic, so I
> felt more comfortable comparing the values to a stable release.
> I compared the values in 5.10 to 5.11-rc4 + your patch, and the number
> of steps and oobregion->length are identical between them.
> 
> 5.10.5:
>   chip->ecc.steps = 4
>   oobregion->length = d
> 
> 5.11-rc4 + patch:
>   engine_conf->nsteps = 4
>   oobregion->length = d

Great, thanks for checking.

I'll send the formal patch then.

Thanks,
Miquèl
diff mbox series

Patch

diff --git a/drivers/mtd/nand/ecc-sw-bch.c b/drivers/mtd/nand/ecc-sw-bch.c
index fe080a0837d8..97221ec3876e 100644
--- a/drivers/mtd/nand/ecc-sw-bch.c
+++ b/drivers/mtd/nand/ecc-sw-bch.c
@@ -90,7 +90,7 @@  EXPORT_SYMBOL(nand_bch_correct_data);
 
 /**
  * nand_bch_init - Initialize software BCH ECC engine
- * @mtd: MTD device
+ * @chip: NAND chip object
  *
  * Returns: a pointer to a new NAND BCH control structure, or NULL upon failure
  *
@@ -105,24 +105,24 @@  EXPORT_SYMBOL(nand_bch_correct_data);
  * @eccsize = 512 (thus, m = 13 is the smallest integer such that 2^m - 1 > 512 * 8)
  * @eccbytes = 7 (7 bytes are required to store m * t = 13 * 4 = 52 bits)
  */
-struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
+int nand_bch_init(struct nand_chip *chip)
 {
-	struct nand_chip *nand = mtd_to_nand(mtd);
+	struct mtd_info *mtd = nand_to_mtd(chip);
 	unsigned int m, t, eccsteps, i;
 	struct nand_bch_control *nbc = NULL;
 	unsigned char *erased_page;
-	unsigned int eccsize = nand->ecc.size;
-	unsigned int eccbytes = nand->ecc.bytes;
-	unsigned int eccstrength = nand->ecc.strength;
+	unsigned int eccsize = chip->ecc.size;
+	unsigned int eccbytes = chip->ecc.bytes;
+	unsigned int eccstrength = chip->ecc.strength;
 
 	if (!eccbytes && eccstrength) {
 		eccbytes = DIV_ROUND_UP(eccstrength * fls(8 * eccsize), 8);
-		nand->ecc.bytes = eccbytes;
+		chip->ecc.bytes = eccbytes;
 	}
 
 	if (!eccsize || !eccbytes) {
 		pr_warn("ecc parameters not supplied\n");
-		goto fail;
+		return -EINVAL;
 	}
 
 	m = fls(1+8*eccsize);
@@ -130,7 +130,9 @@  struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
 
 	nbc = kzalloc(sizeof(*nbc), GFP_KERNEL);
 	if (!nbc)
-		goto fail;
+		return -ENOMEM;
+
+	chip->ecc.priv = nbc;
 
 	nbc->bch = bch_init(m, t, 0, false);
 	if (!nbc->bch)
@@ -165,8 +167,8 @@  struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
 	 * FIXME: we should probably rework the sequencing in nand_scan_tail()
 	 * to avoid setting those fields twice.
 	 */
-	nand->ecc.steps = eccsteps;
-	nand->ecc.total = eccsteps * eccbytes;
+	chip->ecc.steps = eccsteps;
+	chip->ecc.total = eccsteps * eccbytes;
 	if (mtd_ooblayout_count_eccbytes(mtd) != (eccsteps*eccbytes)) {
 		pr_warn("invalid ecc layout\n");
 		goto fail;
@@ -192,12 +194,12 @@  struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
 		nbc->eccmask[i] ^= 0xff;
 
 	if (!eccstrength)
-		nand->ecc.strength = (eccbytes * 8) / fls(8 * eccsize);
+		chip->ecc.strength = (eccbytes * 8) / fls(8 * eccsize);
 
-	return nbc;
+	return 0;
 fail:
-	nand_bch_free(nbc);
-	return NULL;
+	nand_bch_free(chip);
+	return -EINVAL;
 }
 EXPORT_SYMBOL(nand_bch_init);
 
@@ -205,8 +207,10 @@  EXPORT_SYMBOL(nand_bch_init);
  * nand_bch_free - Release NAND BCH ECC resources
  * @nbc: NAND BCH control structure
  */
-void nand_bch_free(struct nand_bch_control *nbc)
+void nand_bch_free(struct nand_chip *chip)
 {
+	struct nand_bch_control *nbc = chip->ecc.priv;
+
 	if (nbc) {
 		bch_free(nbc->bch);
 		kfree(nbc->errloc);
diff --git a/drivers/mtd/nand/raw/fsmc_nand.c b/drivers/mtd/nand/raw/fsmc_nand.c
index 4191831df182..1bc2462efeab 100644
--- a/drivers/mtd/nand/raw/fsmc_nand.c
+++ b/drivers/mtd/nand/raw/fsmc_nand.c
@@ -927,7 +927,7 @@  static int fsmc_nand_attach_chip(struct nand_chip *nand)
 
 	/*
 	 * Don't set layout for BCH4 SW ECC. This will be
-	 * generated later in nand_bch_init() later.
+	 * generated later during BCH initialization.
 	 */
 	if (nand->ecc.engine_type == NAND_ECC_ENGINE_TYPE_ON_HOST) {
 		switch (mtd->oobsize) {
diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 189d3a301a38..c47441ddc5cf 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -5208,6 +5208,7 @@  static int nand_set_ecc_soft_ops(struct nand_chip *chip)
 	struct mtd_info *mtd = nand_to_mtd(chip);
 	struct nand_device *nanddev = mtd_to_nanddev(mtd);
 	struct nand_ecc_ctrl *ecc = &chip->ecc;
+	int ret;
 
 	if (WARN_ON(ecc->engine_type != NAND_ECC_ENGINE_TYPE_SOFT))
 		return -EINVAL;
@@ -5294,13 +5295,14 @@  static int nand_set_ecc_soft_ops(struct nand_chip *chip)
 			ecc->strength = bytes * 8 / fls(8 * ecc->size);
 		}
 
-		/* See nand_bch_init() for details. */
+		/* See the software BCH ECC initialization for details */
 		ecc->bytes = 0;
-		ecc->priv = nand_bch_init(mtd);
-		if (!ecc->priv) {
+		ret = nand_bch_init(chip);
+		if (ret) {
 			WARN(1, "BCH ECC initialization failed!\n");
-			return -EINVAL;
+			return ret;
 		}
+
 		return 0;
 	default:
 		WARN(1, "Unsupported ECC algorithm!\n");
@@ -5960,7 +5962,7 @@  void nand_cleanup(struct nand_chip *chip)
 {
 	if (chip->ecc.engine_type == NAND_ECC_ENGINE_TYPE_SOFT &&
 	    chip->ecc.algo == NAND_ECC_ALGO_BCH)
-		nand_bch_free((struct nand_bch_control *)chip->ecc.priv);
+		nand_bch_free(chip);
 
 	nanddev_cleanup(&chip->base);
 
diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c
index 0ef209e1cd87..6aab57336690 100644
--- a/drivers/mtd/nand/raw/omap2.c
+++ b/drivers/mtd/nand/raw/omap2.c
@@ -2047,10 +2047,10 @@  static int omap_nand_attach_chip(struct nand_chip *chip)
 		/* Reserve one byte for the OMAP marker */
 		oobbytes_per_step	= chip->ecc.bytes + 1;
 		/* Software BCH library is used for locating errors */
-		chip->ecc.priv		= nand_bch_init(mtd);
-		if (!chip->ecc.priv) {
+		err = nand_bch_init(chip);
+		if (err) {
 			dev_err(dev, "Unable to use BCH library\n");
-			return -EINVAL;
+			return err;
 		}
 		break;
 
@@ -2089,10 +2089,10 @@  static int omap_nand_attach_chip(struct nand_chip *chip)
 		/* Reserve one byte for the OMAP marker */
 		oobbytes_per_step	= chip->ecc.bytes + 1;
 		/* Software BCH library is used for locating errors */
-		chip->ecc.priv		= nand_bch_init(mtd);
-		if (!chip->ecc.priv) {
+		err = nand_bch_init(chip);
+		if (err) {
 			dev_err(dev, "unable to use BCH library\n");
-			return -EINVAL;
+			return err;
 		}
 		break;
 
@@ -2272,7 +2272,7 @@  static int omap_nand_probe(struct platform_device *pdev)
 	if (!IS_ERR_OR_NULL(info->dma))
 		dma_release_channel(info->dma);
 	if (nand_chip->ecc.priv) {
-		nand_bch_free(nand_chip->ecc.priv);
+		nand_bch_free(nand_chip);
 		nand_chip->ecc.priv = NULL;
 	}
 	return err;
@@ -2286,7 +2286,7 @@  static int omap_nand_remove(struct platform_device *pdev)
 	int ret;
 
 	if (nand_chip->ecc.priv) {
-		nand_bch_free(nand_chip->ecc.priv);
+		nand_bch_free(nand_chip);
 		nand_chip->ecc.priv = NULL;
 	}
 	if (info->dma)
diff --git a/include/linux/mtd/nand-ecc-sw-bch.h b/include/linux/mtd/nand-ecc-sw-bch.h
index 1e1ee3af82b1..b62b8bd4669f 100644
--- a/include/linux/mtd/nand-ecc-sw-bch.h
+++ b/include/linux/mtd/nand-ecc-sw-bch.h
@@ -10,7 +10,6 @@ 
 
 struct mtd_info;
 struct nand_chip;
-struct nand_bch_control;
 
 #if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH)
 
@@ -30,11 +29,11 @@  int nand_bch_correct_data(struct nand_chip *chip, u_char *dat,
 /*
  * Initialize BCH encoder/decoder
  */
-struct nand_bch_control *nand_bch_init(struct mtd_info *mtd);
+int nand_bch_init(struct nand_chip *chip);
 /*
  * Release BCH encoder/decoder resources
  */
-void nand_bch_free(struct nand_bch_control *nbc);
+void nand_bch_free(struct nand_chip *chip);
 
 #else /* !CONFIG_MTD_NAND_ECC_SW_BCH */
 
@@ -54,12 +53,12 @@  nand_bch_correct_data(struct nand_chip *chip, unsigned char *buf,
 	return -ENOTSUPP;
 }
 
-static inline struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
+static inline int nand_bch_init(struct nand_chip *chip)
 {
-	return NULL;
+	return -ENOTSUPP;
 }
 
-static inline void nand_bch_free(struct nand_bch_control *nbc) {}
+static inline void nand_bch_free(struct nand_chip *chip) {}
 
 #endif /* CONFIG_MTD_NAND_ECC_SW_BCH */