diff mbox series

[3/3] hack: pci-quirk: Scan for config ranges that need the workaround

Message ID 20190719093821.14278-3-oohall@gmail.com
State Superseded
Headers show
Series [1/3] pci-quirk: Re-order struct members | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (3a6fdede6ce117facec0108afe716cf5d0472c3f)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot fail Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco fail Signed-off-by missing

Commit Message

Oliver O'Halloran July 19, 2019, 9:38 a.m. UTC
More robust, but *really* slow.

$ grep Applied /sys/firmware/opal/msglog -A1
[  451.128374154,5] PCI-QUIRK: PHB#0001:01:00.0 Applied UR workaround to [9c...100)
[  451.352419458,5] PCI-QUIRK: PHB#0001:01:00.0 Applied UR workaround to [284...7f8)
[  452.098609721,5] PHB#0001:01:00.0 [SWUP] 11f8 4052 R:00 C:060400 B:02..06 SLOT=005  x8
---
[  452.131262166,5] PCI-QUIRK: PHB#0001:02:00.0 Applied UR workaround to [9c...100)
[  452.355301176,5] PCI-QUIRK: PHB#0001:02:00.0 Applied UR workaround to [288...7f8)
[  453.101497705,5] PHB#0001:02:00.0 [SWDN] 11f8 4052 R:00 C:060400 B:03..03 SLOT=005  x8
---
[  453.136158940,5] PCI-QUIRK: PHB#0001:02:01.0 Applied UR workaround to [9c...100)
[  453.360198154,5] PCI-QUIRK: PHB#0001:02:01.0 Applied UR workaround to [288...7f8)
[  454.106394518,5] PHB#0001:02:01.0 [SWDN] 11f8 4052 R:00 C:060400 B:04..04 SLOT=C11  x8
---
[  454.141055796,5] PCI-QUIRK: PHB#0001:02:02.0 Applied UR workaround to [9c...100)
[  454.365094880,5] PCI-QUIRK: PHB#0001:02:02.0 Applied UR workaround to [288...7f8)
[  455.111291381,5] PHB#0001:02:02.0 [SWDN] 11f8 4052 R:00 C:060400 B:05..05 SLOT=C12 x16
---
[  455.144948035,5] PCI-QUIRK: PHB#0001:02:03.0 Applied UR workaround to [9c...100)
[  455.368987098,5] PCI-QUIRK: PHB#0001:02:03.0 Applied UR workaround to [288...7f8)
[  456.115183503,5] PHB#0001:02:03.0 [SWDN] 11f8 4052 R:00 C:060400 B:06..06 SLOT=C49  x8
---
[  456.115686350,5] PHB#0001:06:00.0 [EP  ] 1014 034a R:02 C:010400 (          raid) LOC_CODE=C49
[  456.148337007,5] PCI-QUIRK: PHB#0001:01:00.1 Applied UR workaround to [a0...100)
[  457.118453865,5] PCI-QUIRK: PHB#0001:01:00.1 Applied UR workaround to [17c...fff)
[  457.118572672,5] PHB#0001:01:00.1 [EP  ] 11f8 4052 R:00 C:058000 (memory-controller) LOC_CODE=003

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 core/pci-quirk.c | 40 ++++++++++++++++++----------------------
 core/pci.c       | 15 ++++++++++-----
 include/pci.h    |  2 ++
 3 files changed, 30 insertions(+), 27 deletions(-)

Comments

Alistair Popple July 24, 2019, 2 a.m. UTC | #1
On Friday, 19 July 2019 7:38:21 PM AEST Oliver O'Halloran wrote:
> More robust, but *really* slow.

Could we cache the results in NVRAM? That way it would only be slow after the
first firmware boot. Seems like it might be a fair bit of work for something that
shouldn't ever change though, so I'd be ok with the hardcoded approach. We can
always require a firmware update in the unlikely event it changes.

- Alistair

> $ grep Applied /sys/firmware/opal/msglog -A1
> [  451.128374154,5] PCI-QUIRK: PHB#0001:01:00.0 Applied UR workaround to [9c...100)
> [  451.352419458,5] PCI-QUIRK: PHB#0001:01:00.0 Applied UR workaround to [284...7f8)
> [  452.098609721,5] PHB#0001:01:00.0 [SWUP] 11f8 4052 R:00 C:060400 B:02..06 SLOT=005  x8
> ---
> [  452.131262166,5] PCI-QUIRK: PHB#0001:02:00.0 Applied UR workaround to [9c...100)
> [  452.355301176,5] PCI-QUIRK: PHB#0001:02:00.0 Applied UR workaround to [288...7f8)
> [  453.101497705,5] PHB#0001:02:00.0 [SWDN] 11f8 4052 R:00 C:060400 B:03..03 SLOT=005  x8
> ---
> [  453.136158940,5] PCI-QUIRK: PHB#0001:02:01.0 Applied UR workaround to [9c...100)
> [  453.360198154,5] PCI-QUIRK: PHB#0001:02:01.0 Applied UR workaround to [288...7f8)
> [  454.106394518,5] PHB#0001:02:01.0 [SWDN] 11f8 4052 R:00 C:060400 B:04..04 SLOT=C11  x8
> ---
> [  454.141055796,5] PCI-QUIRK: PHB#0001:02:02.0 Applied UR workaround to [9c...100)
> [  454.365094880,5] PCI-QUIRK: PHB#0001:02:02.0 Applied UR workaround to [288...7f8)
> [  455.111291381,5] PHB#0001:02:02.0 [SWDN] 11f8 4052 R:00 C:060400 B:05..05 SLOT=C12 x16
> ---
> [  455.144948035,5] PCI-QUIRK: PHB#0001:02:03.0 Applied UR workaround to [9c...100)
> [  455.368987098,5] PCI-QUIRK: PHB#0001:02:03.0 Applied UR workaround to [288...7f8)
> [  456.115183503,5] PHB#0001:02:03.0 [SWDN] 11f8 4052 R:00 C:060400 B:06..06 SLOT=C49  x8
> ---
> [  456.115686350,5] PHB#0001:06:00.0 [EP  ] 1014 034a R:02 C:010400 (          raid) LOC_CODE=C49
> [  456.148337007,5] PCI-QUIRK: PHB#0001:01:00.1 Applied UR workaround to [a0...100)
> [  457.118453865,5] PCI-QUIRK: PHB#0001:01:00.1 Applied UR workaround to [17c...fff)
> [  457.118572672,5] PHB#0001:01:00.1 [EP  ] 11f8 4052 R:00 C:058000 (memory-controller) LOC_CODE=003
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
>  core/pci-quirk.c | 40 ++++++++++++++++++----------------------
>  core/pci.c       | 15 ++++++++++-----
>  include/pci.h    |  2 ++
>  3 files changed, 30 insertions(+), 27 deletions(-)
> 
> diff --git a/core/pci-quirk.c b/core/pci-quirk.c
> index 371ff62b4b72..2452409a1c2b 100644
> --- a/core/pci-quirk.c
> +++ b/core/pci-quirk.c
> @@ -54,29 +54,25 @@ static int64_t cfg_block_filter(void *dev __unused,
> 
>  static void quirk_microsemi_gen4_sw(struct phb *phb, struct pci_device *pd)
> {
> -	/*
> -	 * The gen4 pcie switch used on some ZZ systems has a bug where it'll
> -	 * throw URs in response to a cfg read to a range that's "reserved"
> -	 * work around it by blackholing.
> -	 */
> -	if (pd->dev_type == PCIE_TYPE_ENDPOINT && pd->class == 0x058000) {
> -		/*
> -		 * we match on the class code too since the switch might
> -		 * support an NTB endpoint.
> -		 */
> -		BLOCK_CFG_RANGE(pd, 0xa0, 0xff);
> -		BLOCK_CFG_RANGE(pd, 0x17c, 0xfff);
> -	} else if (pd->dev_type == PCIE_TYPE_SWITCH_UPPORT) {
> -		BLOCK_CFG_RANGE(pd, 0x09c, 0xff);
> -		BLOCK_CFG_RANGE(pd, 0x284, 0x7f7);
> -	} else if (pd->dev_type == PCIE_TYPE_SWITCH_DNPORT) {
> -		BLOCK_CFG_RANGE(pd, 0x09c, 0xff);
> -		BLOCK_CFG_RANGE(pd, 0x288, 0x7f7);
> -	} else {
> -		return;
> -	}
> +	uint8_t data;
> +	bool frozen;
> +	int offset;
> +	int start;
> +
> +	pci_check_clear_freeze(phb);
> 
> -	PCINOTICE(phb, pd->bdfn, "Applied Microsemi Gen4 UR workaround\n");
> +	for (start = -1, offset = 0; offset < 4096; offset++) {
> +		pci_cfg_read8(phb, pd->bdfn, offset, &data);
> +		frozen = pci_check_clear_freeze(phb);
> +
> +		if (start >= 0 && (!frozen || offset == 4095)) { /* end of range */
> +			BLOCK_CFG_RANGE(pd, start, offset - 1);
> +			PCINOTICE(phb, pd->bdfn, "Applied UR workaround to [%03x..%03x)\n", start, offset - 1);
> +			start = -1;
> +		} else if (frozen && start < 0) { /* new UR range */
> +			start = offset;
> +		}
> +	}
>  }
> 
>  static void quirk_astbmc_vga(struct phb *phb __unused,
> diff --git a/core/pci.c b/core/pci.c
> index e870d09b5c55..2a36290d6598 100644
> --- a/core/pci.c
> +++ b/core/pci.c
> @@ -316,10 +316,12 @@ static struct pci_device *pci_scan_one(struct phb *phb, struct pci_device *paren
>   *                          everything (default state of our backend) so
>   *                          we just check and clear the state of PE#0
>   *
> + *                          returns true if a freeze was detected
> + *
>   * NOTE: We currently only handle simple PE freeze, not PHB fencing
>   *       (or rather our backend does)
>   */
> -static void pci_check_clear_freeze(struct phb *phb)
> +bool pci_check_clear_freeze(struct phb *phb)
>  {
>  	uint8_t freeze_state;
>  	uint16_t pci_error_type, sev;
> @@ -330,23 +332,26 @@ static void pci_check_clear_freeze(struct phb *phb)
>  	if (phb->ops->get_reserved_pe_number)
>  		pe_number = phb->ops->get_reserved_pe_number(phb);
>  	if (pe_number < 0)
> -		return;
> +		return false;
> 
>  	/* Retrieve the frozen state */
>  	rc = phb->ops->eeh_freeze_status(phb, pe_number, &freeze_state,
>  					 &pci_error_type, &sev);
>  	if (rc)
> -		return;
> +		return true; /* phb fence? */
> +
>  	if (freeze_state == OPAL_EEH_STOPPED_NOT_FROZEN)
> -		return;
> +		return false;
>  	/* We can't handle anything worse than an ER here */
>  	if (sev > OPAL_EEH_SEV_NO_ERROR &&
>  	    sev < OPAL_EEH_SEV_PE_ER) {
>  		PCIERR(phb, 0, "Fatal probe in %s error !\n", __func__);
> -		return;
> +		return true;
>  	}
> +
>  	phb->ops->eeh_freeze_clear(phb, pe_number,
>  				   OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> +	return true;
>  }
> 
>  /*
> diff --git a/include/pci.h b/include/pci.h
> index c10d79418e70..2d1328ea31d4 100644
> --- a/include/pci.h
> +++ b/include/pci.h
> @@ -404,6 +404,8 @@ static inline void phb_unlock(struct phb *phb)
>  	unlock(&phb->lock);
>  }
> 
> +bool pci_check_clear_freeze(struct phb *phb);
> +
>  /* Config space ops wrappers */
>  static inline int64_t pci_cfg_read8(struct phb *phb, uint32_t bdfn,
>  				    uint32_t offset, uint8_t *data)
Oliver O'Halloran July 24, 2019, 2:09 a.m. UTC | #2
On Wed, Jul 24, 2019 at 12:00 PM Alistair Popple <alistair@popple.id.au> wrote:
>
> On Friday, 19 July 2019 7:38:21 PM AEST Oliver O'Halloran wrote:
> > More robust, but *really* slow.
>
> Could we cache the results in NVRAM? That way it would only be slow after the
> first firmware boot. Seems like it might be a fair bit of work for something that
> shouldn't ever change though, so I'd be ok with the hardcoded approach. We can
> always require a firmware update in the unlikely event it changes.

good idea
diff mbox series

Patch

diff --git a/core/pci-quirk.c b/core/pci-quirk.c
index 371ff62b4b72..2452409a1c2b 100644
--- a/core/pci-quirk.c
+++ b/core/pci-quirk.c
@@ -54,29 +54,25 @@  static int64_t cfg_block_filter(void *dev __unused,
 
 static void quirk_microsemi_gen4_sw(struct phb *phb, struct pci_device *pd)
 {
-	/*
-	 * The gen4 pcie switch used on some ZZ systems has a bug where it'll
-	 * throw URs in response to a cfg read to a range that's "reserved"
-	 * work around it by blackholing.
-	 */
-	if (pd->dev_type == PCIE_TYPE_ENDPOINT && pd->class == 0x058000) {
-		/*
-		 * we match on the class code too since the switch might
-		 * support an NTB endpoint.
-		 */
-		BLOCK_CFG_RANGE(pd, 0xa0, 0xff);
-		BLOCK_CFG_RANGE(pd, 0x17c, 0xfff);
-	} else if (pd->dev_type == PCIE_TYPE_SWITCH_UPPORT) {
-		BLOCK_CFG_RANGE(pd, 0x09c, 0xff);
-		BLOCK_CFG_RANGE(pd, 0x284, 0x7f7);
-	} else if (pd->dev_type == PCIE_TYPE_SWITCH_DNPORT) {
-		BLOCK_CFG_RANGE(pd, 0x09c, 0xff);
-		BLOCK_CFG_RANGE(pd, 0x288, 0x7f7);
-	} else {
-		return;
-	}
+	uint8_t data;
+	bool frozen;
+	int offset;
+	int start;
+
+	pci_check_clear_freeze(phb);
 
-	PCINOTICE(phb, pd->bdfn, "Applied Microsemi Gen4 UR workaround\n");
+	for (start = -1, offset = 0; offset < 4096; offset++) {
+		pci_cfg_read8(phb, pd->bdfn, offset, &data);
+		frozen = pci_check_clear_freeze(phb);
+
+		if (start >= 0 && (!frozen || offset == 4095)) { /* end of range */
+			BLOCK_CFG_RANGE(pd, start, offset - 1);
+			PCINOTICE(phb, pd->bdfn, "Applied UR workaround to [%03x..%03x)\n", start, offset - 1);
+			start = -1;
+		} else if (frozen && start < 0) { /* new UR range */
+			start = offset;
+		}
+	}
 }
 
 static void quirk_astbmc_vga(struct phb *phb __unused,
diff --git a/core/pci.c b/core/pci.c
index e870d09b5c55..2a36290d6598 100644
--- a/core/pci.c
+++ b/core/pci.c
@@ -316,10 +316,12 @@  static struct pci_device *pci_scan_one(struct phb *phb, struct pci_device *paren
  *                          everything (default state of our backend) so
  *                          we just check and clear the state of PE#0
  *
+ *                          returns true if a freeze was detected
+ *
  * NOTE: We currently only handle simple PE freeze, not PHB fencing
  *       (or rather our backend does)
  */
-static void pci_check_clear_freeze(struct phb *phb)
+bool pci_check_clear_freeze(struct phb *phb)
 {
 	uint8_t freeze_state;
 	uint16_t pci_error_type, sev;
@@ -330,23 +332,26 @@  static void pci_check_clear_freeze(struct phb *phb)
 	if (phb->ops->get_reserved_pe_number)
 		pe_number = phb->ops->get_reserved_pe_number(phb);
 	if (pe_number < 0)
-		return;
+		return false;
 
 	/* Retrieve the frozen state */
 	rc = phb->ops->eeh_freeze_status(phb, pe_number, &freeze_state,
 					 &pci_error_type, &sev);
 	if (rc)
-		return;
+		return true; /* phb fence? */
+
 	if (freeze_state == OPAL_EEH_STOPPED_NOT_FROZEN)
-		return;
+		return false;
 	/* We can't handle anything worse than an ER here */
 	if (sev > OPAL_EEH_SEV_NO_ERROR &&
 	    sev < OPAL_EEH_SEV_PE_ER) {
 		PCIERR(phb, 0, "Fatal probe in %s error !\n", __func__);
-		return;
+		return true;
 	}
+
 	phb->ops->eeh_freeze_clear(phb, pe_number,
 				   OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+	return true;
 }
 
 /*
diff --git a/include/pci.h b/include/pci.h
index c10d79418e70..2d1328ea31d4 100644
--- a/include/pci.h
+++ b/include/pci.h
@@ -404,6 +404,8 @@  static inline void phb_unlock(struct phb *phb)
 	unlock(&phb->lock);
 }
 
+bool pci_check_clear_freeze(struct phb *phb);
+
 /* Config space ops wrappers */
 static inline int64_t pci_cfg_read8(struct phb *phb, uint32_t bdfn,
 				    uint32_t offset, uint8_t *data)