diff mbox

[v4] cxl: mask slice error interrupts after first occurrence

Message ID 20170501005331.20131-3-alastair@au1.ibm.com (mailing list archive)
State Accepted
Commit a715626a8e904e7226915d1bc4885317ea9da141
Headers show

Commit Message

Alastair D'Silva May 1, 2017, 12:53 a.m. UTC
From: Alastair D'Silva <alastair@d-silva.org>

In some situations, a faulty AFU slice may create an interrupt storm of
slice errors, rendering the machine unusable. Since these interrupts are
informational only, present the interrupt once, then mask it off to
prevent it from being retriggered until the AFU is reset.

Signed-off-by: Alastair D'Silva <alastair@d-silva.org>
--- 
Changelog:
v4:
	Fix duplicate/missing entries in aggregate macros
	Minor textual changes
v3
        Add CXL_PSL_SERR_An_IRQS, CXL_PSL_SERR_An_IRQ_MASKS macros
        Explicitly reenable masked interrupts after reset
        Issue an info line that subsequent interrupts will be masked
v2
        Rebase against linux-next
---
 drivers/misc/cxl/cxl.h    | 18 ++++++++++++++++++
 drivers/misc/cxl/native.c | 19 +++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

Comments

Andrew Donnellan May 1, 2017, 2:45 a.m. UTC | #1
On 01/05/17 10:53, Alastair D'Silva wrote:
> From: Alastair D'Silva <alastair@d-silva.org>
>
> In some situations, a faulty AFU slice may create an interrupt storm of
> slice errors, rendering the machine unusable. Since these interrupts are
> informational only, present the interrupt once, then mask it off to
> prevent it from being retriggered until the AFU is reset.
>
> Signed-off-by: Alastair D'Silva <alastair@d-silva.org>

Thanks for the fixups.

Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

> ---
> Changelog:
> v4:
> 	Fix duplicate/missing entries in aggregate macros
> 	Minor textual changes
> v3
>         Add CXL_PSL_SERR_An_IRQS, CXL_PSL_SERR_An_IRQ_MASKS macros
>         Explicitly reenable masked interrupts after reset
>         Issue an info line that subsequent interrupts will be masked
> v2
>         Rebase against linux-next
> ---
>  drivers/misc/cxl/cxl.h    | 18 ++++++++++++++++++
>  drivers/misc/cxl/native.c | 19 +++++++++++++++++--
>  2 files changed, 35 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
> index 452e209..c8568ea 100644
> --- a/drivers/misc/cxl/cxl.h
> +++ b/drivers/misc/cxl/cxl.h
> @@ -228,6 +228,24 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
>  #define CXL_PSL_SERR_An_llcmdto	(1ull << (63-6))
>  #define CXL_PSL_SERR_An_afupar	(1ull << (63-7))
>  #define CXL_PSL_SERR_An_afudup	(1ull << (63-8))
> +#define CXL_PSL_SERR_An_IRQS	( \
> +	CXL_PSL_SERR_An_afuto | CXL_PSL_SERR_An_afudis | CXL_PSL_SERR_An_afuov | \
> +	CXL_PSL_SERR_An_badsrc | CXL_PSL_SERR_An_badctx | CXL_PSL_SERR_An_llcmdis | \
> +	CXL_PSL_SERR_An_llcmdto | CXL_PSL_SERR_An_afupar | CXL_PSL_SERR_An_afudup)
> +#define CXL_PSL_SERR_An_afuto_mask	(1ull << (63-32))
> +#define CXL_PSL_SERR_An_afudis_mask	(1ull << (63-33))
> +#define CXL_PSL_SERR_An_afuov_mask	(1ull << (63-34))
> +#define CXL_PSL_SERR_An_badsrc_mask	(1ull << (63-35))
> +#define CXL_PSL_SERR_An_badctx_mask	(1ull << (63-36))
> +#define CXL_PSL_SERR_An_llcmdis_mask	(1ull << (63-37))
> +#define CXL_PSL_SERR_An_llcmdto_mask	(1ull << (63-38))
> +#define CXL_PSL_SERR_An_afupar_mask	(1ull << (63-39))
> +#define CXL_PSL_SERR_An_afudup_mask	(1ull << (63-40))
> +#define CXL_PSL_SERR_An_IRQ_MASKS	( \
> +	CXL_PSL_SERR_An_afuto_mask | CXL_PSL_SERR_An_afudis_mask | CXL_PSL_SERR_An_afuov_mask | \
> +	CXL_PSL_SERR_An_badsrc_mask | CXL_PSL_SERR_An_badctx_mask | CXL_PSL_SERR_An_llcmdis_mask | \
> +	CXL_PSL_SERR_An_llcmdto_mask | CXL_PSL_SERR_An_afupar_mask | CXL_PSL_SERR_An_afudup_mask)
> +
>  #define CXL_PSL_SERR_An_AE	(1ull << (63-30))
>
>  /****** CXL_PSL_SCNTL_An ****************************************************/
> diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
> index 194c58e..290950b 100644
> --- a/drivers/misc/cxl/native.c
> +++ b/drivers/misc/cxl/native.c
> @@ -95,12 +95,23 @@ int cxl_afu_disable(struct cxl_afu *afu)
>  /* This will disable as well as reset */
>  static int native_afu_reset(struct cxl_afu *afu)
>  {
> +	int rc;
> +	u64 serr;
> +
>  	pr_devel("AFU reset request\n");
>
> -	return afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
> +	rc = afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
>  			   CXL_AFU_Cntl_An_RS_Complete | CXL_AFU_Cntl_An_ES_Disabled,
>  			   CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK,
>  			   false);
> +
> +	/* Re-enable any masked interrupts */
> +	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
> +	serr &= ~CXL_PSL_SERR_An_IRQ_MASKS;
> +	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
> +
> +
> +	return rc;
>  }
>
>  static int native_afu_check_and_enable(struct cxl_afu *afu)
> @@ -1205,7 +1216,7 @@ static irqreturn_t native_slice_irq_err(int irq, void *data)
>  {
>  	struct cxl_afu *afu = data;
>  	u64 errstat, serr, afu_error, dsisr;
> -	u64 fir_slice, afu_debug;
> +	u64 fir_slice, afu_debug, irq_mask;
>
>  	/*
>  	 * slice err interrupt is only used with full PSL (no XSL)
> @@ -1226,7 +1237,11 @@ static irqreturn_t native_slice_irq_err(int irq, void *data)
>  	dev_crit(&afu->dev, "AFU_ERR_An: 0x%.16llx\n", afu_error);
>  	dev_crit(&afu->dev, "PSL_DSISR_An: 0x%.16llx\n", dsisr);
>
> +	/* mask off the IRQ so it won't retrigger until the AFU is reset */
> +	irq_mask = (serr & CXL_PSL_SERR_An_IRQS) >> 32;
> +	serr |= irq_mask;
>  	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
> +	dev_info(&afu->dev, "Further such interrupts will be masked until the AFU is reset\n");
>
>  	return IRQ_HANDLED;
>  }
>
Vaibhav Jain May 2, 2017, 2:28 a.m. UTC | #2
Thanks for updating the patch with review comments, Alastair.

Alastair D'Silva <alastair@au1.ibm.com> writes:

> From: Alastair D'Silva <alastair@d-silva.org>
>
> In some situations, a faulty AFU slice may create an interrupt storm of
> slice errors, rendering the machine unusable. Since these interrupts are
> informational only, present the interrupt once, then mask it off to
> prevent it from being retriggered until the AFU is reset.
>
> Signed-off-by: Alastair D'Silva <alastair@d-silva.org>
> ---

Reviewed-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Frederic Barrat May 2, 2017, 8:24 a.m. UTC | #3
Le 01/05/2017 à 02:53, Alastair D'Silva a écrit :
> From: Alastair D'Silva <alastair@d-silva.org>
>
> In some situations, a faulty AFU slice may create an interrupt storm of
> slice errors, rendering the machine unusable. Since these interrupts are
> informational only, present the interrupt once, then mask it off to
> prevent it from being retriggered until the AFU is reset.
>
> Signed-off-by: Alastair D'Silva <alastair@d-silva.org>
> ---


Thanks!

Acked-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>



> Changelog:
> v4:
> 	Fix duplicate/missing entries in aggregate macros
> 	Minor textual changes
> v3
>         Add CXL_PSL_SERR_An_IRQS, CXL_PSL_SERR_An_IRQ_MASKS macros
>         Explicitly reenable masked interrupts after reset
>         Issue an info line that subsequent interrupts will be masked
> v2
>         Rebase against linux-next
> ---
>  drivers/misc/cxl/cxl.h    | 18 ++++++++++++++++++
>  drivers/misc/cxl/native.c | 19 +++++++++++++++++--
>  2 files changed, 35 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
> index 452e209..c8568ea 100644
> --- a/drivers/misc/cxl/cxl.h
> +++ b/drivers/misc/cxl/cxl.h
> @@ -228,6 +228,24 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
>  #define CXL_PSL_SERR_An_llcmdto	(1ull << (63-6))
>  #define CXL_PSL_SERR_An_afupar	(1ull << (63-7))
>  #define CXL_PSL_SERR_An_afudup	(1ull << (63-8))
> +#define CXL_PSL_SERR_An_IRQS	( \
> +	CXL_PSL_SERR_An_afuto | CXL_PSL_SERR_An_afudis | CXL_PSL_SERR_An_afuov | \
> +	CXL_PSL_SERR_An_badsrc | CXL_PSL_SERR_An_badctx | CXL_PSL_SERR_An_llcmdis | \
> +	CXL_PSL_SERR_An_llcmdto | CXL_PSL_SERR_An_afupar | CXL_PSL_SERR_An_afudup)
> +#define CXL_PSL_SERR_An_afuto_mask	(1ull << (63-32))
> +#define CXL_PSL_SERR_An_afudis_mask	(1ull << (63-33))
> +#define CXL_PSL_SERR_An_afuov_mask	(1ull << (63-34))
> +#define CXL_PSL_SERR_An_badsrc_mask	(1ull << (63-35))
> +#define CXL_PSL_SERR_An_badctx_mask	(1ull << (63-36))
> +#define CXL_PSL_SERR_An_llcmdis_mask	(1ull << (63-37))
> +#define CXL_PSL_SERR_An_llcmdto_mask	(1ull << (63-38))
> +#define CXL_PSL_SERR_An_afupar_mask	(1ull << (63-39))
> +#define CXL_PSL_SERR_An_afudup_mask	(1ull << (63-40))
> +#define CXL_PSL_SERR_An_IRQ_MASKS	( \
> +	CXL_PSL_SERR_An_afuto_mask | CXL_PSL_SERR_An_afudis_mask | CXL_PSL_SERR_An_afuov_mask | \
> +	CXL_PSL_SERR_An_badsrc_mask | CXL_PSL_SERR_An_badctx_mask | CXL_PSL_SERR_An_llcmdis_mask | \
> +	CXL_PSL_SERR_An_llcmdto_mask | CXL_PSL_SERR_An_afupar_mask | CXL_PSL_SERR_An_afudup_mask)
> +
>  #define CXL_PSL_SERR_An_AE	(1ull << (63-30))
>
>  /****** CXL_PSL_SCNTL_An ****************************************************/
> diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
> index 194c58e..290950b 100644
> --- a/drivers/misc/cxl/native.c
> +++ b/drivers/misc/cxl/native.c
> @@ -95,12 +95,23 @@ int cxl_afu_disable(struct cxl_afu *afu)
>  /* This will disable as well as reset */
>  static int native_afu_reset(struct cxl_afu *afu)
>  {
> +	int rc;
> +	u64 serr;
> +
>  	pr_devel("AFU reset request\n");
>
> -	return afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
> +	rc = afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
>  			   CXL_AFU_Cntl_An_RS_Complete | CXL_AFU_Cntl_An_ES_Disabled,
>  			   CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK,
>  			   false);
> +
> +	/* Re-enable any masked interrupts */
> +	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
> +	serr &= ~CXL_PSL_SERR_An_IRQ_MASKS;
> +	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
> +
> +
> +	return rc;
>  }
>
>  static int native_afu_check_and_enable(struct cxl_afu *afu)
> @@ -1205,7 +1216,7 @@ static irqreturn_t native_slice_irq_err(int irq, void *data)
>  {
>  	struct cxl_afu *afu = data;
>  	u64 errstat, serr, afu_error, dsisr;
> -	u64 fir_slice, afu_debug;
> +	u64 fir_slice, afu_debug, irq_mask;
>
>  	/*
>  	 * slice err interrupt is only used with full PSL (no XSL)
> @@ -1226,7 +1237,11 @@ static irqreturn_t native_slice_irq_err(int irq, void *data)
>  	dev_crit(&afu->dev, "AFU_ERR_An: 0x%.16llx\n", afu_error);
>  	dev_crit(&afu->dev, "PSL_DSISR_An: 0x%.16llx\n", dsisr);
>
> +	/* mask off the IRQ so it won't retrigger until the AFU is reset */
> +	irq_mask = (serr & CXL_PSL_SERR_An_IRQS) >> 32;
> +	serr |= irq_mask;
>  	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
> +	dev_info(&afu->dev, "Further such interrupts will be masked until the AFU is reset\n");
>
>  	return IRQ_HANDLED;
>  }
>
Michael Ellerman May 3, 2017, 10:19 p.m. UTC | #4
On Mon, 2017-05-01 at 00:53:31 UTC, Alastair D'Silva wrote:
> From: Alastair D'Silva <alastair@d-silva.org>
> 
> In some situations, a faulty AFU slice may create an interrupt storm of
> slice errors, rendering the machine unusable. Since these interrupts are
> informational only, present the interrupt once, then mask it off to
> prevent it from being retriggered until the AFU is reset.
> 
> Signed-off-by: Alastair D'Silva <alastair@d-silva.org>
> Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
> Reviewed-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/a715626a8e904e7226915d1bc48853

cheers
diff mbox

Patch

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 452e209..c8568ea 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -228,6 +228,24 @@  static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
 #define CXL_PSL_SERR_An_llcmdto	(1ull << (63-6))
 #define CXL_PSL_SERR_An_afupar	(1ull << (63-7))
 #define CXL_PSL_SERR_An_afudup	(1ull << (63-8))
+#define CXL_PSL_SERR_An_IRQS	( \
+	CXL_PSL_SERR_An_afuto | CXL_PSL_SERR_An_afudis | CXL_PSL_SERR_An_afuov | \
+	CXL_PSL_SERR_An_badsrc | CXL_PSL_SERR_An_badctx | CXL_PSL_SERR_An_llcmdis | \
+	CXL_PSL_SERR_An_llcmdto | CXL_PSL_SERR_An_afupar | CXL_PSL_SERR_An_afudup)
+#define CXL_PSL_SERR_An_afuto_mask	(1ull << (63-32))
+#define CXL_PSL_SERR_An_afudis_mask	(1ull << (63-33))
+#define CXL_PSL_SERR_An_afuov_mask	(1ull << (63-34))
+#define CXL_PSL_SERR_An_badsrc_mask	(1ull << (63-35))
+#define CXL_PSL_SERR_An_badctx_mask	(1ull << (63-36))
+#define CXL_PSL_SERR_An_llcmdis_mask	(1ull << (63-37))
+#define CXL_PSL_SERR_An_llcmdto_mask	(1ull << (63-38))
+#define CXL_PSL_SERR_An_afupar_mask	(1ull << (63-39))
+#define CXL_PSL_SERR_An_afudup_mask	(1ull << (63-40))
+#define CXL_PSL_SERR_An_IRQ_MASKS	( \
+	CXL_PSL_SERR_An_afuto_mask | CXL_PSL_SERR_An_afudis_mask | CXL_PSL_SERR_An_afuov_mask | \
+	CXL_PSL_SERR_An_badsrc_mask | CXL_PSL_SERR_An_badctx_mask | CXL_PSL_SERR_An_llcmdis_mask | \
+	CXL_PSL_SERR_An_llcmdto_mask | CXL_PSL_SERR_An_afupar_mask | CXL_PSL_SERR_An_afudup_mask)
+
 #define CXL_PSL_SERR_An_AE	(1ull << (63-30))
 
 /****** CXL_PSL_SCNTL_An ****************************************************/
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 194c58e..290950b 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -95,12 +95,23 @@  int cxl_afu_disable(struct cxl_afu *afu)
 /* This will disable as well as reset */
 static int native_afu_reset(struct cxl_afu *afu)
 {
+	int rc;
+	u64 serr;
+
 	pr_devel("AFU reset request\n");
 
-	return afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
+	rc = afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
 			   CXL_AFU_Cntl_An_RS_Complete | CXL_AFU_Cntl_An_ES_Disabled,
 			   CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK,
 			   false);
+
+	/* Re-enable any masked interrupts */
+	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
+	serr &= ~CXL_PSL_SERR_An_IRQ_MASKS;
+	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
+
+
+	return rc;
 }
 
 static int native_afu_check_and_enable(struct cxl_afu *afu)
@@ -1205,7 +1216,7 @@  static irqreturn_t native_slice_irq_err(int irq, void *data)
 {
 	struct cxl_afu *afu = data;
 	u64 errstat, serr, afu_error, dsisr;
-	u64 fir_slice, afu_debug;
+	u64 fir_slice, afu_debug, irq_mask;
 
 	/*
 	 * slice err interrupt is only used with full PSL (no XSL)
@@ -1226,7 +1237,11 @@  static irqreturn_t native_slice_irq_err(int irq, void *data)
 	dev_crit(&afu->dev, "AFU_ERR_An: 0x%.16llx\n", afu_error);
 	dev_crit(&afu->dev, "PSL_DSISR_An: 0x%.16llx\n", dsisr);
 
+	/* mask off the IRQ so it won't retrigger until the AFU is reset */
+	irq_mask = (serr & CXL_PSL_SERR_An_IRQS) >> 32;
+	serr |= irq_mask;
 	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
+	dev_info(&afu->dev, "Further such interrupts will be masked until the AFU is reset\n");
 
 	return IRQ_HANDLED;
 }