diff mbox series

[4/5] opal/eeh: Send an error callout on EEH error.

Message ID 159599166522.67334.389838163476654740.stgit@jupiter
State Changes Requested
Headers show
Series Add support to report EEH errors to BMC/FSP (eSEL) | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (abe4c4799ffee4be12674ad59fc0bc521b0724f3)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco success Signed-off-by present

Commit Message

Mahesh J Salgaonkar July 29, 2020, 3:01 a.m. UTC
On an EEH error, send out an error log (eSEL) with a hardware callout. To avoid
generating multiple events for the same error, use a bit flag in the generic PHB
structure. Use two bits, i.e. the SEND and SENT bits. Whenever an EEH
freeze/fence is detected, the SEND error log bit is set. Once the error log
is queued, the SENT error log bit is set. These bits are sticky and get
reset when the PHB is reinitialized to clear the EEH error. The error log
includes FRU details and PHB diag data.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
---
 core/pci-opal.c    |   49 +++++++++++++++++++++++++++++++++++++++++++++++++
 hw/phb3.c          |    6 ++++++
 hw/phb4.c          |    6 ++++++
 include/errorlog.h |    1 +
 include/pci.h      |    5 +++++
 5 files changed, 67 insertions(+)

Comments

Oliver O'Halloran July 30, 2020, 1:24 a.m. UTC | #1
On Wed, Jul 29, 2020 at 1:01 PM Mahesh Salgaonkar <mahesh@linux.ibm.com> wrote:
>
> On EEH error send out an error log (eSEL) with hardware callout. To avoid
> generating multiple events for same error, use a bit flag in generic PHB
> structure. Use two bits i.e SEND and SENT bit. Whenever an EEH
> freeze/fence is detected, a SEND error log bit is set. Once the error log
> is queued, a SENT error log bit is set.

I don't think this is the right approach. There's a couple of
different events we need to be concerned with namely:

1) Full PHB Fences
2) Multi-PE freezes (PELT-V)
3) Single PE freezes

1) is the rarest kind and it looks like this is the only type this
patch actually addresses. 2) and 3) are the common cases so we need to
log those too and call out the correct slots when they occur. They're
a bit trickier since we'd need to track whether a log has been sent on
a per-PE basis. However, a bitmap that tracks it on a per-PE basis is
probably sufficient.

> These bits are sticky and gets reset when PHB is reinitialized to clear the EEH error.

Stickiness is a hardware concept and I'd prefer we didn't refer to
random bits of software state as "sticky" since it just confuses
matters.

> The error log includes FRU details and PHB diag data
>
> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
> ---
>  core/pci-opal.c    |   49 +++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/phb3.c          |    6 ++++++
>  hw/phb4.c          |    6 ++++++
>  include/errorlog.h |    1 +
>  include/pci.h      |    5 +++++
>  5 files changed, 67 insertions(+)
>
> diff --git a/core/pci-opal.c b/core/pci-opal.c
> index aa375c6aa..47503d5d2 100644
> --- a/core/pci-opal.c
> +++ b/core/pci-opal.c
> @@ -13,6 +13,12 @@
>  #include <opal-msg.h>
>  #include <timebase.h>
>  #include <timer.h>
> +#include <errorlog.h>
> +#include <chip.h>
> +
> +DEFINE_LOG_ENTRY(OPAL_RC_PCI_RESET_PHB, OPAL_INPUT_OUTPUT_ERR_EVT,
> +               OPAL_PCI, OPAL_IO_DEVICES, OPAL_UNRECOVERABLE_ERR_GENERAL,
> +               OPAL_NA);
>
>  #define OPAL_PCICFG_ACCESS_READ(op, cb, type)  \
>  static int64_t opal_pci_config_##op(uint64_t phb_id,                   \
> @@ -984,6 +990,45 @@ static int64_t opal_pci_set_power_state(uint64_t async_token,
>  }
>  opal_call(OPAL_PCI_SET_POWER_STATE, opal_pci_set_power_state, 3);
>
> +static void send_eeh_serviceable_event(struct phb *phb, void *diag_buffer)
> +{
> +       struct errorlog *buf;
> +       const char *loc, *part, *serial;
> +       uint32_t chip_id, len;
> +       struct OpalIoPhbErrorCommon *common;
> +
> +       /* Generate and send an error log/eSEL */
> +       buf = opal_elog_create(&e_info(OPAL_RC_PCI_RESET_PHB), 0);

OPAL_RC_PCI_RESET_PHB is aliased with OPAL_RC_PCI_INIT_SLOT so eh...
I'd prefer we had a separate elog type for reporting errors,
especially considering PE freezes don't require a PHB reset.

> +       if (!buf) {
> +               prerror("Unable to send EEH error log (eSEL)\n");
> +               return;
> +       }
> +       log_append_msg(buf, "PHB#%x Freeze/Fence detected!\n", phb->opal_id);
> +       log_mark_serviceable(buf);
> +
> +       /* Add PHB base location code */
> +       loc = phb->base_loc_code;

I'm fairly certain this isn't the right location code to be reporting.
I think this is populated from the io-base-loc code which is going to
refer to the system planar rather than the slot containing the device
which EEHed. The right thing would be to log the location codes
corresponding to slots with a frozen PE.

> +       log_add_callout_section(buf, loc, NULL, NULL);
> +
> +       /* Add FRU callout of associated chip id */
> +       chip_id = dt_get_chip_id(phb->dt_node);
> +       loc = chip_loc_code(chip_id);
> +       part = chip_part_number(chip_id);
> +       serial = chip_serial_number(chip_id);
> +       log_add_callout_section(buf, loc, part, serial);
> +
> +       /* Insert the phb diag data. */
> +       common = (struct OpalIoPhbErrorCommon *)diag_buffer;

don't cast void pointers

> +       len = be32_to_cpu(common->len);
> +
> +       log_add_section(buf, OPAL_ELOG_SEC_DIAG);
> +       log_append_data(buf, diag_buffer, len);
> +       log_commit(buf);
> +
> +       phb->flags |= PCI_EEH_ERR_LOG_SENT;
> +}
> +
>  static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
>                                            void *diag_buffer,
>                                            uint64_t diag_buffer_len)
> @@ -1000,6 +1045,10 @@ static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
>                 return OPAL_UNSUPPORTED;
>         phb_lock(phb);
>         rc = phb->ops->get_diag_data2(phb, diag_buffer, diag_buffer_len);
> +
> +       /* Send an error log if required */
> +       if ((phb->flags & PCI_EEH_ERR_LOG_MASK) == PCI_EEH_ERR_LOG_SEND)
> +               send_eeh_serviceable_event(phb, diag_buffer);
>         phb_unlock(phb);
>
>         return rc;
> diff --git a/hw/phb3.c b/hw/phb3.c
> index 8af6b6164..0d7370f52 100644
> --- a/hw/phb3.c
> +++ b/hw/phb3.c
> @@ -68,6 +68,9 @@ static bool phb3_fenced(struct phb3 *p)
>         if (nfir & PPC_BIT(16)) {
>                 p->flags |= PHB3_AIB_FENCED;
>
> +               /* Mark flag to send an error log */
> +               p->phb.flags |= PCI_EEH_ERR_LOG_SEND;
> +
>                 phb3_eeh_dump_regs(p, NULL);
>                 return true;
>         }
> @@ -2758,6 +2761,9 @@ static int64_t phb3_creset(struct pci_slot *slot)
>                  */
>                 p->flags &= ~PHB3_AIB_FENCED;
>                 p->flags &= ~PHB3_CAPP_RECOVERY;
> +
> +               /* Reset the error logging related flag */
> +               p->phb.flags &= ~PCI_EEH_ERR_LOG_MASK;
>                 phb3_init_hw(p, false);
>
>                 if (p->flags & PHB3_CAPP_DISABLING) {
> diff --git a/hw/phb4.c b/hw/phb4.c
> index 3f22a2c4d..8e59cdba4 100644
> --- a/hw/phb4.c
> +++ b/hw/phb4.c
> @@ -2550,6 +2550,9 @@ static bool phb4_fenced(struct phb4 *p)
>         /* Mark ourselves fenced */
>         p->flags |= PHB4_AIB_FENCED;
>
> +       /* Mark flag to send an error log */
> +       p->phb.flags |= PCI_EEH_ERR_LOG_SEND;
> +
>         PHBERR(p, "PHB Freeze/Fence detected !\n");
>         phb4_dump_pec_err_regs(p);
>
> @@ -3444,6 +3447,9 @@ static int64_t phb4_creset(struct pci_slot *slot)
>                 p->flags &= ~PHB4_AIB_FENCED;
>                 p->flags &= ~PHB4_CAPP_RECOVERY;
>                 p->flags &= ~PHB4_CFG_USE_ASB;
> +
> +               /* Reset the error logging related flag */
> +               p->phb.flags &= ~PCI_EEH_ERR_LOG_MASK;
>                 phb4_init_hw(p);
>                 pci_slot_set_state(slot, PHB4_SLOT_CRESET_FRESET);
>
> diff --git a/include/errorlog.h b/include/errorlog.h
> index 24d12c4d8..9490e4ec4 100644
> --- a/include/errorlog.h
> +++ b/include/errorlog.h
> @@ -341,6 +341,7 @@ enum opal_reasoncode {
>  };
>
>  #define OPAL_ELOG_SEC_DESC     0x44455343
> +#define OPAL_ELOG_SEC_DIAG     0x44494147      /* For EEH diag data */
>
>  #define DEFINE_LOG_ENTRY(reason, type, id, subsys,                     \
>  severity, subtype) static struct opal_err_info err_##reason =          \
> diff --git a/include/pci.h b/include/pci.h
> index eb23a6d9b..feadbf21d 100644
> --- a/include/pci.h
> +++ b/include/pci.h
> @@ -382,6 +382,11 @@ struct phb {
>
>         /* Additional data the platform might need to attach */
>         void                    *platform_data;
> +
> +       uint32_t                flags;
> +#define PCI_EEH_ERR_LOG_SEND   0x1
> +#define PCI_EEH_ERR_LOG_SENT   0x2
> +#define PCI_EEH_ERR_LOG_MASK   0x3

Why do we need separate SEND and SENT bits?

>  };
>
>  static inline void phb_lock(struct phb *phb)
>
>
diff mbox series

Patch

diff --git a/core/pci-opal.c b/core/pci-opal.c
index aa375c6aa..47503d5d2 100644
--- a/core/pci-opal.c
+++ b/core/pci-opal.c
@@ -13,6 +13,12 @@ 
 #include <opal-msg.h>
 #include <timebase.h>
 #include <timer.h>
+#include <errorlog.h>
+#include <chip.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_PCI_RESET_PHB, OPAL_INPUT_OUTPUT_ERR_EVT,
+		OPAL_PCI, OPAL_IO_DEVICES, OPAL_UNRECOVERABLE_ERR_GENERAL,
+		OPAL_NA);
 
 #define OPAL_PCICFG_ACCESS_READ(op, cb, type)	\
 static int64_t opal_pci_config_##op(uint64_t phb_id,			\
@@ -984,6 +990,45 @@  static int64_t opal_pci_set_power_state(uint64_t async_token,
 }
 opal_call(OPAL_PCI_SET_POWER_STATE, opal_pci_set_power_state, 3);
 
+static void send_eeh_serviceable_event(struct phb *phb, void *diag_buffer)
+{
+	struct errorlog *buf;
+	const char *loc, *part, *serial;
+	uint32_t chip_id, len;
+	struct OpalIoPhbErrorCommon *common;
+
+	/* Generate and send an error log/eSEL */
+	buf = opal_elog_create(&e_info(OPAL_RC_PCI_RESET_PHB), 0);
+	if (!buf) {
+		prerror("Unable to send EEH error log (eSEL)\n");
+		return;
+	}
+
+	log_append_msg(buf, "PHB#%x Freeze/Fence detected!\n", phb->opal_id);
+	log_mark_serviceable(buf);
+
+	/* Add PHB base location code */
+	loc = phb->base_loc_code;
+	log_add_callout_section(buf, loc, NULL, NULL);
+
+	/* Add FRU callout of associated chip id */
+	chip_id = dt_get_chip_id(phb->dt_node);
+	loc = chip_loc_code(chip_id);
+	part = chip_part_number(chip_id);
+	serial = chip_serial_number(chip_id);
+	log_add_callout_section(buf, loc, part, serial);
+
+	/* Insert the phb diag data. */
+	common = (struct OpalIoPhbErrorCommon *)diag_buffer;
+	len = be32_to_cpu(common->len);
+
+	log_add_section(buf, OPAL_ELOG_SEC_DIAG);
+	log_append_data(buf, diag_buffer, len);
+	log_commit(buf);
+
+	phb->flags |= PCI_EEH_ERR_LOG_SENT;
+}
+
 static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
 					   void *diag_buffer,
 					   uint64_t diag_buffer_len)
@@ -1000,6 +1045,10 @@  static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
 		return OPAL_UNSUPPORTED;
 	phb_lock(phb);
 	rc = phb->ops->get_diag_data2(phb, diag_buffer, diag_buffer_len);
+
+	/* Send an error log if required */
+	if ((phb->flags & PCI_EEH_ERR_LOG_MASK) == PCI_EEH_ERR_LOG_SEND)
+		send_eeh_serviceable_event(phb, diag_buffer);
 	phb_unlock(phb);
 
 	return rc;
diff --git a/hw/phb3.c b/hw/phb3.c
index 8af6b6164..0d7370f52 100644
--- a/hw/phb3.c
+++ b/hw/phb3.c
@@ -68,6 +68,9 @@  static bool phb3_fenced(struct phb3 *p)
 	if (nfir & PPC_BIT(16)) {
 		p->flags |= PHB3_AIB_FENCED;
 
+		/* Mark flag to send an error log */
+		p->phb.flags |= PCI_EEH_ERR_LOG_SEND;
+
 		phb3_eeh_dump_regs(p, NULL);
 		return true;
 	}
@@ -2758,6 +2761,9 @@  static int64_t phb3_creset(struct pci_slot *slot)
 		 */
 		p->flags &= ~PHB3_AIB_FENCED;
 		p->flags &= ~PHB3_CAPP_RECOVERY;
+
+		/* Reset the error logging related flag */
+		p->phb.flags &= ~PCI_EEH_ERR_LOG_MASK;
 		phb3_init_hw(p, false);
 
 		if (p->flags & PHB3_CAPP_DISABLING) {
diff --git a/hw/phb4.c b/hw/phb4.c
index 3f22a2c4d..8e59cdba4 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -2550,6 +2550,9 @@  static bool phb4_fenced(struct phb4 *p)
 	/* Mark ourselves fenced */
 	p->flags |= PHB4_AIB_FENCED;
 
+	/* Mark flag to send an error log */
+	p->phb.flags |= PCI_EEH_ERR_LOG_SEND;
+
 	PHBERR(p, "PHB Freeze/Fence detected !\n");
 	phb4_dump_pec_err_regs(p);
 
@@ -3444,6 +3447,9 @@  static int64_t phb4_creset(struct pci_slot *slot)
 		p->flags &= ~PHB4_AIB_FENCED;
 		p->flags &= ~PHB4_CAPP_RECOVERY;
 		p->flags &= ~PHB4_CFG_USE_ASB;
+
+		/* Reset the error logging related flag */
+		p->phb.flags &= ~PCI_EEH_ERR_LOG_MASK;
 		phb4_init_hw(p);
 		pci_slot_set_state(slot, PHB4_SLOT_CRESET_FRESET);
 
diff --git a/include/errorlog.h b/include/errorlog.h
index 24d12c4d8..9490e4ec4 100644
--- a/include/errorlog.h
+++ b/include/errorlog.h
@@ -341,6 +341,7 @@  enum opal_reasoncode {
 };
 
 #define OPAL_ELOG_SEC_DESC	0x44455343
+#define OPAL_ELOG_SEC_DIAG	0x44494147	/* For EEH diag data */
 
 #define DEFINE_LOG_ENTRY(reason, type, id, subsys,			\
 severity, subtype) static struct opal_err_info err_##reason =		\
diff --git a/include/pci.h b/include/pci.h
index eb23a6d9b..feadbf21d 100644
--- a/include/pci.h
+++ b/include/pci.h
@@ -382,6 +382,11 @@  struct phb {
 
 	/* Additional data the platform might need to attach */
 	void			*platform_data;
+
+	uint32_t		flags;
+#define PCI_EEH_ERR_LOG_SEND	0x1
+#define PCI_EEH_ERR_LOG_SENT	0x2
+#define PCI_EEH_ERR_LOG_MASK	0x3
 };
 
 static inline void phb_lock(struct phb *phb)