Message ID | 20230726055917.3291633-7-kai.heng.feng@canonical.com |
---|---|
State | New |
Headers | show |
Series | Fix UBSAN in Intel EDAC driver | expand |
On Wed, Jul 26, 2023 at 01:59:17PM +0800, Kai-Heng Feng wrote: > From: Qiuxu Zhuo <qiuxu.zhuo@intel.com> > > BugLink: https://bugs.launchpad.net/bugs/2028746 > > Some Sapphire Rapids workstations' absent memory controllers > still appear as PCIe devices that fool the i10nm_edac driver > and result in "shift exponent -66 is negative" call traces > from skx_get_dimm_info(). > > Skip the absent memory controllers to avoid the call traces. > > Reported-by: Kai-Heng Feng <kai.heng.feng@canonical.com> > Closes: https://lore.kernel.org/linux-edac/CAAd53p41Ku1m1rapeqb1xtD+kKuk+BaUW=dumuoF0ZO3GhFjFA@mail.gmail.com/T/#m5de16dce60a8c836ec235868c7c16e3fefad0cc2 > Tested-by: Kai-Heng Feng <kai.heng.feng@canonical.com> > Reported-by: Koba Ko <koba.ko@canonical.com> > Closes: https://lore.kernel.org/linux-edac/SA1PR11MB71305B71CCCC3D9305835202892AA@SA1PR11MB7130.namprd11.prod.outlook.com/T/#t > Tested-by: Koba Ko <koba.ko@canonical.com> > Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors") > Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com> > Signed-off-by: Tony Luck <tony.luck@intel.com> > Link: https://lore.kernel.org/r/20230710013232.59712-1-qiuxu.zhuo@intel.com > (cherry picked from commit c545f5e412250555bd4e717d062b117f20bab418 linux-next) > Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com> Applied to mantic/linux-unstable. Thanks, -Andrea > --- > drivers/edac/i10nm_base.c | 54 +++++++++++++++++++++++++++++++++++---- > 1 file changed, 49 insertions(+), 5 deletions(-) > > diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c > index 46034310b78e..7ed98b85aa90 100644 > --- a/drivers/edac/i10nm_base.c > +++ b/drivers/edac/i10nm_base.c > @@ -622,13 +622,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi > return mdev; > } > > +/** > + * i10nm_imc_absent() - Check whether the memory controller @imc is absent > + * > + * @imc : The pointer to the structure of memory controller EDAC device. > + * > + * RETURNS : true if the memory controller EDAC device is absent, false otherwise. > + */ > +static bool i10nm_imc_absent(struct skx_imc *imc) > +{ > + u32 mcmtr; > + int i; > + > + switch (res_cfg->type) { > + case SPR: > + for (i = 0; i < res_cfg->ddr_chan_num; i++) { > + mcmtr = I10NM_GET_MCMTR(imc, i); > + edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr); > + if (mcmtr != ~0) > + return false; > + } > + > + /* > + * Some workstations' absent memory controllers still > + * appear as PCIe devices, misleading the EDAC driver. > + * By observing that the MMIO registers of these absent > + * memory controllers consistently hold the value of ~0. > + * > + * We identify a memory controller as absent by checking > + * if its MMIO register "mcmtr" == ~0 in all its channels. > + */ > + return true; > + default: > + return false; > + } > +} > + > static int i10nm_get_ddr_munits(void) > { > struct pci_dev *mdev; > void __iomem *mbase; > unsigned long size; > struct skx_dev *d; > - int i, j = 0; > + int i, lmc, j = 0; > u32 reg, off; > u64 base; > > @@ -654,7 +690,7 @@ static int i10nm_get_ddr_munits(void) > edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", > j++, base, reg); > > - for (i = 0; i < res_cfg->ddr_imc_num; i++) { > + for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) { > mdev = get_ddr_munit(d, i, &off, &size); > > if (i == 0 && !mdev) { > @@ -664,8 +700,6 @@ static int i10nm_get_ddr_munits(void) > if (!mdev) > continue; > > - d->imc[i].mdev = mdev; > - > edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n", > i, base + off, size, reg); > > @@ -676,7 +710,17 @@ static int i10nm_get_ddr_munits(void) > return -ENODEV; > } > > - d->imc[i].mbase = mbase; > + d->imc[lmc].mbase = mbase; > + if (i10nm_imc_absent(&d->imc[lmc])) { > + pci_dev_put(mdev); > + iounmap(mbase); > + d->imc[lmc].mbase = NULL; > + edac_dbg(2, "Skip absent mc%d\n", i); > + continue; > + } else { > + d->imc[lmc].mdev = mdev; > + lmc++; > + } > } > } > > -- > 2.34.1 > > > -- > kernel-team mailing list > kernel-team@lists.ubuntu.com > https://lists.ubuntu.com/mailman/listinfo/kernel-team
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 46034310b78e..7ed98b85aa90 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -622,13 +622,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi return mdev; } +/** + * i10nm_imc_absent() - Check whether the memory controller @imc is absent + * + * @imc : The pointer to the structure of memory controller EDAC device. + * + * RETURNS : true if the memory controller EDAC device is absent, false otherwise. + */ +static bool i10nm_imc_absent(struct skx_imc *imc) +{ + u32 mcmtr; + int i; + + switch (res_cfg->type) { + case SPR: + for (i = 0; i < res_cfg->ddr_chan_num; i++) { + mcmtr = I10NM_GET_MCMTR(imc, i); + edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr); + if (mcmtr != ~0) + return false; + } + + /* + * Some workstations' absent memory controllers still + * appear as PCIe devices, misleading the EDAC driver. + * By observing that the MMIO registers of these absent + * memory controllers consistently hold the value of ~0. + * + * We identify a memory controller as absent by checking + * if its MMIO register "mcmtr" == ~0 in all its channels. + */ + return true; + default: + return false; + } +} + static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; void __iomem *mbase; unsigned long size; struct skx_dev *d; - int i, j = 0; + int i, lmc, j = 0; u32 reg, off; u64 base; @@ -654,7 +690,7 @@ static int i10nm_get_ddr_munits(void) edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", j++, base, reg); - for (i = 0; i < res_cfg->ddr_imc_num; i++) { + for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) { mdev = get_ddr_munit(d, i, &off, &size); if (i == 0 && !mdev) { @@ -664,8 +700,6 @@ static int i10nm_get_ddr_munits(void) if (!mdev) continue; - d->imc[i].mdev = mdev; - edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n", i, base + off, size, reg); @@ -676,7 +710,17 @@ static int i10nm_get_ddr_munits(void) return -ENODEV; } - d->imc[i].mbase = mbase; + d->imc[lmc].mbase = mbase; + if (i10nm_imc_absent(&d->imc[lmc])) { + pci_dev_put(mdev); + iounmap(mbase); + d->imc[lmc].mbase = NULL; + edac_dbg(2, "Skip absent mc%d\n", i); + continue; + } else { + d->imc[lmc].mdev = mdev; + lmc++; + } } }