diff mbox

BMC/PCI: Check slot tables against detected devices

Message ID 1481154699-19404-1-git-send-email-stewart@linux.vnet.ibm.com
State Accepted
Headers show

Commit Message

Stewart Smith Dec. 7, 2016, 11:51 p.m. UTC
On BMC machines, we have slot tables of built in PHBs, slots and devices
that are physically present in the system (such as the BMC itself). We
can use these tables to check what we *detected* against what *should*
be in the system and throw an error if they differ.

We have seen this occur a couple of times while still booting, giving the
user just an empty petitboot screen and not much else to go on. This
patch helps in that we get a skiboot error message, and at some point
in the future when we pump them up to the OS we could get a big friendly
error message telling you you're having a bad day.

Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
---
 platforms/astbmc/astbmc.h    |  1 +
 platforms/astbmc/firestone.c |  1 +
 platforms/astbmc/garrison.c  |  1 +
 platforms/astbmc/habanero.c  |  1 +
 platforms/astbmc/p8dtu.c     |  2 +
 platforms/astbmc/slots.c     | 87 ++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 93 insertions(+)

Comments

Russell Currey Dec. 9, 2016, 2:50 a.m. UTC | #1
On Thu, 2016-12-08 at 10:51 +1100, Stewart Smith wrote:
> On BMC machines, we have slot tables of built in PHBs, slots and devices
> that are physically present in the system (such as the BMC itself). We
> can use these tables to check what we *detected* against what *should*
> be in the system and throw an error if they differ.
> 
> We have seen this occur a couple of times while still booting, giving the
> user just an empty petitboot screen and not much else to go on. This
> patch helps in that we get a skiboot error message, and at some point
> in the future when we pump them up to the OS we could get a big friendly
> error message telling you you're having a bad day.
> 
> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
> ---
>  platforms/astbmc/astbmc.h    |  1 +
>  platforms/astbmc/firestone.c |  1 +
>  platforms/astbmc/garrison.c  |  1 +
>  platforms/astbmc/habanero.c  |  1 +
>  platforms/astbmc/p8dtu.c     |  2 +
>  platforms/astbmc/slots.c     | 87
> ++++++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 93 insertions(+)
> 
> diff --git a/platforms/astbmc/astbmc.h b/platforms/astbmc/astbmc.h
> index 3ef8dbf0f0c1..999580017059 100644
> --- a/platforms/astbmc/astbmc.h
> +++ b/platforms/astbmc/astbmc.h
> @@ -49,6 +49,7 @@ extern int64_t astbmc_ipmi_power_down(uint64_t request);
>  extern void astbmc_init(void);
>  extern void astbmc_ext_irq_serirq_cpld(unsigned int chip_id);
>  extern int pnor_init(void);
> +extern void check_all_slot_table(void);
>  
>  extern void slot_table_init(const struct slot_table_entry *top_table);
>  extern void slot_table_get_slot_info(struct phb *phb, struct pci_device *
> pd);
> diff --git a/platforms/astbmc/firestone.c b/platforms/astbmc/firestone.c
> index d2c4d1464b03..fc6575b13728 100644
> --- a/platforms/astbmc/firestone.c
> +++ b/platforms/astbmc/firestone.c
> @@ -150,6 +150,7 @@ DECLARE_PLATFORM(firestone) = {
>  	.probe			= firestone_probe,
>  	.init			= astbmc_init,
>  	.pci_get_slot_info	= slot_table_get_slot_info,
> +	.pci_probe_complete	= check_all_slot_table,
>  	.external_irq		= astbmc_ext_irq_serirq_cpld,
>  	.cec_power_down         = astbmc_ipmi_power_down,
>  	.cec_reboot             = astbmc_ipmi_reboot,
> diff --git a/platforms/astbmc/garrison.c b/platforms/astbmc/garrison.c
> index ed504ac837ce..db886cbbfd63 100644
> --- a/platforms/astbmc/garrison.c
> +++ b/platforms/astbmc/garrison.c
> @@ -297,6 +297,7 @@ DECLARE_PLATFORM(garrison) = {
>  	.probe			= garrison_probe,
>  	.init			= astbmc_init,
>  	.pci_get_slot_info	= slot_table_get_slot_info,
> +	.pci_probe_complete	= check_all_slot_table,
>  	.cec_power_down         = astbmc_ipmi_power_down,
>  	.cec_reboot             = astbmc_ipmi_reboot,
>  	.elog_commit		= ipmi_elog_commit,
> diff --git a/platforms/astbmc/habanero.c b/platforms/astbmc/habanero.c
> index 86e49bf7c98e..0d3a01f56668 100644
> --- a/platforms/astbmc/habanero.c
> +++ b/platforms/astbmc/habanero.c
> @@ -145,6 +145,7 @@ DECLARE_PLATFORM(habanero) = {
>  	.probe			= habanero_probe,
>  	.init			= astbmc_init,
>  	.pci_get_slot_info	= slot_table_get_slot_info,
> +	.pci_probe_complete	= check_all_slot_table,
>  	.external_irq		= astbmc_ext_irq_serirq_cpld,
>  	.cec_power_down         = astbmc_ipmi_power_down,
>  	.cec_reboot             = astbmc_ipmi_reboot,
> diff --git a/platforms/astbmc/p8dtu.c b/platforms/astbmc/p8dtu.c
> index ac7a19119b48..8d7f92f0eeca 100644
> --- a/platforms/astbmc/p8dtu.c
> +++ b/platforms/astbmc/p8dtu.c
> @@ -240,6 +240,7 @@ DECLARE_PLATFORM(p8dtu1u) = {
>  	.bmc			= &astbmc_smc,
>  	.init			= astbmc_init,
>  	.pci_get_slot_info	= slot_table_get_slot_info,
> +	.pci_probe_complete	= check_all_slot_table,
>  	.external_irq		= astbmc_ext_irq_serirq_cpld,
>  	.cec_power_down         = astbmc_ipmi_power_down,
>  	.cec_reboot             = astbmc_ipmi_reboot,
> @@ -256,6 +257,7 @@ DECLARE_PLATFORM(p8dtu2u) = {
>  	.bmc			= &astbmc_smc,
>  	.init			= astbmc_init,
>  	.pci_get_slot_info	= slot_table_get_slot_info,
> +	.pci_probe_complete	= check_all_slot_table,
>  	.external_irq		= astbmc_ext_irq_serirq_cpld,
>  	.cec_power_down         = astbmc_ipmi_power_down,
>  	.cec_reboot             = astbmc_ipmi_reboot,
> diff --git a/platforms/astbmc/slots.c b/platforms/astbmc/slots.c
> index 10a99bbd7c1d..aeca00723b15 100644
> --- a/platforms/astbmc/slots.c
> +++ b/platforms/astbmc/slots.c
> @@ -188,3 +188,90 @@ void slot_table_get_slot_info(struct phb *phb, struct
> pci_device *pd)
>  	pluggable = !!(ent->etype == st_pluggable_slot);
>  	init_slot_info(slot, pluggable, (void *)ent);
>  }
> +
> +static int __pci_find_dev_by_location(struct phb *phb,
> +				      struct pci_device *pd, void *userdata)
> +{
> +	uint16_t location = *((uint16_t *)userdata);
> +
> +	if (!phb || !pd)
> +		return 0;
> +
> +	if ((pd->bdfn & 0xff) == location)
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static struct pci_device *pci_find_dev_by_location(struct phb *phb, uint16_t
> location)
> +{
> +	return pci_walk_dev(phb, NULL, __pci_find_dev_by_location,
> &location);
> +}
> +
> +static struct phb* get_phb_by_location(uint32_t location)
> +{
> +	struct phb *phb = NULL;
> +	uint32_t chip_id, phb_idx;
> +
> +	for_each_phb(phb) {
> +		chip_id = dt_get_chip_id(phb->dt_node);
> +		phb_idx = dt_prop_get_u32_def(phb->dt_node,
> +					      "ibm,phb-index", 0);
> +		if (location == ST_LOC_PHB(chip_id, phb_idx))
> +			break;
> +	}
> +
> +	return phb;
> +}
> +
> +static int check_slot_table(struct phb *phb,
> +			    const struct slot_table_entry *parent)
> +{
> +	const struct slot_table_entry *ent;
> +	struct pci_device *dev = NULL;
> +	int r = 0;
> +
> +	if (parent == NULL)
> +		return 0;
> +
> +	for (ent = parent; ent->etype != st_end; ent++) {
> +		switch (ent->etype) {
> +		case st_phb:
> +			phb = get_phb_by_location(ent->location);
> +			if (!phb) {
> +				prlog(PR_ERR, "PCI: PHB %s (%x) not found\n",
> +				      ent->name, ent->location);
> +				r++;
> +			}
> +			break;
> +		case st_pluggable_slot:
> +		case st_builtin_dev:
> +			if (!phb)
> +				break;
> +			phb_lock(phb);
> +			dev = pci_find_dev_by_location(phb, ent->location);
> +			phb_unlock(phb);

Why does this require a lock?

> +			if (!dev) {
> +				prlog(PR_ERR, "PCI: built-in device not
> found: %s (loc: %x)\n",
> +				      ent->name, ent->location);
> +				r++;
> +			}
> +			break;
> +		case st_end:
> +		case st_npu_slot:
> +			break;
> +		}
> +		if (ent->children)
> +			r+= check_slot_table(phb, ent->children);

For style this should be r += ... (I think)

> +	}
> +	return r;
> +}
> +
> +void check_all_slot_table(void)
> +{
> +	if (!slot_top_table)
> +		return;
> +
> +	prlog(PR_DEBUG, "PCI: Checking slot table against detected
> devices\n");
> +	check_slot_table(NULL, slot_top_table);
> +}
Stewart Smith Dec. 13, 2016, 2:14 a.m. UTC | #2
Russell Currey <ruscur@russell.cc> writes:
>> +		case st_builtin_dev:
>> +			if (!phb)
>> +				break;
>> +			phb_lock(phb);
>> +			dev = pci_find_dev_by_location(phb, ent->location);
>> +			phb_unlock(phb);
>
> Why does this require a lock?

It may not. By this time during boot we should be all good and no longer
be touching anything. An abundance of caution perhaps?

>
>> +			if (!dev) {
>> +				prlog(PR_ERR, "PCI: built-in device not
>> found: %s (loc: %x)\n",
>> +				      ent->name, ent->location);
>> +				r++;
>> +			}
>> +			break;
>> +		case st_end:
>> +		case st_npu_slot:
>> +			break;
>> +		}
>> +		if (ent->children)
>> +			r+= check_slot_table(phb, ent->children);
>
> For style this should be r += ... (I think)

It should. Periodically I relapse into MySQL code style.
Russell Currey Dec. 13, 2016, 2:17 a.m. UTC | #3
On Tue, 2016-12-13 at 13:14 +1100, Stewart Smith wrote:
> Russell Currey <ruscur@russell.cc> writes:
> > > +		case st_builtin_dev:
> > > +			if (!phb)
> > > +				break;
> > > +			phb_lock(phb);
> > > +			dev = pci_find_dev_by_location(phb, ent-
> > > >location);
> > > +			phb_unlock(phb);
> > 
> > Why does this require a lock?
> 
> It may not. By this time during boot we should be all good and no longer
> be touching anything. An abundance of caution perhaps?

Just curious because I couldn't see a specific reason, I think it's fine
Stewart Smith Dec. 13, 2016, 3:24 a.m. UTC | #4
Russell Currey <ruscur@russell.cc> writes:
> On Tue, 2016-12-13 at 13:14 +1100, Stewart Smith wrote:
>> Russell Currey <ruscur@russell.cc> writes:
>> > > +		case st_builtin_dev:
>> > > +			if (!phb)
>> > > +				break;
>> > > +			phb_lock(phb);
>> > > +			dev = pci_find_dev_by_location(phb, ent-
>> > > >location);
>> > > +			phb_unlock(phb);
>> > 
>> > Why does this require a lock?
>> 
>> It may not. By this time during boot we should be all good and no longer
>> be touching anything. An abundance of caution perhaps?
>
> Just curious because I couldn't see a specific reason, I think it's fine

That sounds almost like an ack :)
Russell Currey Dec. 13, 2016, 3:58 a.m. UTC | #5
On Tue, 2016-12-13 at 14:24 +1100, Stewart Smith wrote:
> Russell Currey <ruscur@russell.cc> writes:
> > On Tue, 2016-12-13 at 13:14 +1100, Stewart Smith wrote:
> > > Russell Currey <ruscur@russell.cc> writes:
> > > > > +		case st_builtin_dev:
> > > > > +			if (!phb)
> > > > > +				break;
> > > > > +			phb_lock(phb);
> > > > > +			dev = pci_find_dev_by_location(phb, ent-
> > > > > > location);
> > > > > 
> > > > > +			phb_unlock(phb);
> > > > 
> > > > Why does this require a lock?
> > > 
> > > It may not. By this time during boot we should be all good and no longer
> > > be touching anything. An abundance of caution perhaps?
> > 
> > Just curious because I couldn't see a specific reason, I think it's fine
> 
> That sounds almost like an ack :)
> 

Acked-by: Russell Currey <ruscur@russell.cc>

(may have forgot I was supposed to do that)
Stewart Smith Dec. 13, 2016, 5:35 a.m. UTC | #6
Stewart Smith <stewart@linux.vnet.ibm.com> writes:
> On BMC machines, we have slot tables of built in PHBs, slots and devices
> that are physically present in the system (such as the BMC itself). We
> can use these tables to check what we *detected* against what *should*
> be in the system and throw an error if they differ.
>
> We have seen this occur a couple of times while still booting, giving the
> user just an empty petitboot screen and not much else to go on. This
> patch helps in that we get a skiboot error message, and at some point
> in the future when we pump them up to the OS we could get a big friendly
> error message telling you you're having a bad day.
>
> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
> ---
>  platforms/astbmc/astbmc.h    |  1 +
>  platforms/astbmc/firestone.c |  1 +
>  platforms/astbmc/garrison.c  |  1 +
>  platforms/astbmc/habanero.c  |  1 +
>  platforms/astbmc/p8dtu.c     |  2 +
>  platforms/astbmc/slots.c     | 87 ++++++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 93 insertions(+)


Merged to master (with the addition of barreleye) as of 4a6d1a7.
diff mbox

Patch

diff --git a/platforms/astbmc/astbmc.h b/platforms/astbmc/astbmc.h
index 3ef8dbf0f0c1..999580017059 100644
--- a/platforms/astbmc/astbmc.h
+++ b/platforms/astbmc/astbmc.h
@@ -49,6 +49,7 @@  extern int64_t astbmc_ipmi_power_down(uint64_t request);
 extern void astbmc_init(void);
 extern void astbmc_ext_irq_serirq_cpld(unsigned int chip_id);
 extern int pnor_init(void);
+extern void check_all_slot_table(void);
 
 extern void slot_table_init(const struct slot_table_entry *top_table);
 extern void slot_table_get_slot_info(struct phb *phb, struct pci_device * pd);
diff --git a/platforms/astbmc/firestone.c b/platforms/astbmc/firestone.c
index d2c4d1464b03..fc6575b13728 100644
--- a/platforms/astbmc/firestone.c
+++ b/platforms/astbmc/firestone.c
@@ -150,6 +150,7 @@  DECLARE_PLATFORM(firestone) = {
 	.probe			= firestone_probe,
 	.init			= astbmc_init,
 	.pci_get_slot_info	= slot_table_get_slot_info,
+	.pci_probe_complete	= check_all_slot_table,
 	.external_irq		= astbmc_ext_irq_serirq_cpld,
 	.cec_power_down         = astbmc_ipmi_power_down,
 	.cec_reboot             = astbmc_ipmi_reboot,
diff --git a/platforms/astbmc/garrison.c b/platforms/astbmc/garrison.c
index ed504ac837ce..db886cbbfd63 100644
--- a/platforms/astbmc/garrison.c
+++ b/platforms/astbmc/garrison.c
@@ -297,6 +297,7 @@  DECLARE_PLATFORM(garrison) = {
 	.probe			= garrison_probe,
 	.init			= astbmc_init,
 	.pci_get_slot_info	= slot_table_get_slot_info,
+	.pci_probe_complete	= check_all_slot_table,
 	.cec_power_down         = astbmc_ipmi_power_down,
 	.cec_reboot             = astbmc_ipmi_reboot,
 	.elog_commit		= ipmi_elog_commit,
diff --git a/platforms/astbmc/habanero.c b/platforms/astbmc/habanero.c
index 86e49bf7c98e..0d3a01f56668 100644
--- a/platforms/astbmc/habanero.c
+++ b/platforms/astbmc/habanero.c
@@ -145,6 +145,7 @@  DECLARE_PLATFORM(habanero) = {
 	.probe			= habanero_probe,
 	.init			= astbmc_init,
 	.pci_get_slot_info	= slot_table_get_slot_info,
+	.pci_probe_complete	= check_all_slot_table,
 	.external_irq		= astbmc_ext_irq_serirq_cpld,
 	.cec_power_down         = astbmc_ipmi_power_down,
 	.cec_reboot             = astbmc_ipmi_reboot,
diff --git a/platforms/astbmc/p8dtu.c b/platforms/astbmc/p8dtu.c
index ac7a19119b48..8d7f92f0eeca 100644
--- a/platforms/astbmc/p8dtu.c
+++ b/platforms/astbmc/p8dtu.c
@@ -240,6 +240,7 @@  DECLARE_PLATFORM(p8dtu1u) = {
 	.bmc			= &astbmc_smc,
 	.init			= astbmc_init,
 	.pci_get_slot_info	= slot_table_get_slot_info,
+	.pci_probe_complete	= check_all_slot_table,
 	.external_irq		= astbmc_ext_irq_serirq_cpld,
 	.cec_power_down         = astbmc_ipmi_power_down,
 	.cec_reboot             = astbmc_ipmi_reboot,
@@ -256,6 +257,7 @@  DECLARE_PLATFORM(p8dtu2u) = {
 	.bmc			= &astbmc_smc,
 	.init			= astbmc_init,
 	.pci_get_slot_info	= slot_table_get_slot_info,
+	.pci_probe_complete	= check_all_slot_table,
 	.external_irq		= astbmc_ext_irq_serirq_cpld,
 	.cec_power_down         = astbmc_ipmi_power_down,
 	.cec_reboot             = astbmc_ipmi_reboot,
diff --git a/platforms/astbmc/slots.c b/platforms/astbmc/slots.c
index 10a99bbd7c1d..aeca00723b15 100644
--- a/platforms/astbmc/slots.c
+++ b/platforms/astbmc/slots.c
@@ -188,3 +188,90 @@  void slot_table_get_slot_info(struct phb *phb, struct pci_device *pd)
 	pluggable = !!(ent->etype == st_pluggable_slot);
 	init_slot_info(slot, pluggable, (void *)ent);
 }
+
+static int __pci_find_dev_by_location(struct phb *phb,
+				      struct pci_device *pd, void *userdata)
+{
+	uint16_t location = *((uint16_t *)userdata);
+
+	if (!phb || !pd)
+		return 0;
+
+	if ((pd->bdfn & 0xff) == location)
+		return 1;
+
+	return 0;
+}
+
+static struct pci_device *pci_find_dev_by_location(struct phb *phb, uint16_t location)
+{
+	return pci_walk_dev(phb, NULL, __pci_find_dev_by_location, &location);
+}
+
+static struct phb* get_phb_by_location(uint32_t location)
+{
+	struct phb *phb = NULL;
+	uint32_t chip_id, phb_idx;
+
+	for_each_phb(phb) {
+		chip_id = dt_get_chip_id(phb->dt_node);
+		phb_idx = dt_prop_get_u32_def(phb->dt_node,
+					      "ibm,phb-index", 0);
+		if (location == ST_LOC_PHB(chip_id, phb_idx))
+			break;
+	}
+
+	return phb;
+}
+
+static int check_slot_table(struct phb *phb,
+			    const struct slot_table_entry *parent)
+{
+	const struct slot_table_entry *ent;
+	struct pci_device *dev = NULL;
+	int r = 0;
+
+	if (parent == NULL)
+		return 0;
+
+	for (ent = parent; ent->etype != st_end; ent++) {
+		switch (ent->etype) {
+		case st_phb:
+			phb = get_phb_by_location(ent->location);
+			if (!phb) {
+				prlog(PR_ERR, "PCI: PHB %s (%x) not found\n",
+				      ent->name, ent->location);
+				r++;
+			}
+			break;
+		case st_pluggable_slot:
+		case st_builtin_dev:
+			if (!phb)
+				break;
+			phb_lock(phb);
+			dev = pci_find_dev_by_location(phb, ent->location);
+			phb_unlock(phb);
+			if (!dev) {
+				prlog(PR_ERR, "PCI: built-in device not found: %s (loc: %x)\n",
+				      ent->name, ent->location);
+				r++;
+			}
+			break;
+		case st_end:
+		case st_npu_slot:
+			break;
+		}
+		if (ent->children)
+			r+= check_slot_table(phb, ent->children);
+	}
+	return r;
+}
+
+void check_all_slot_table(void)
+{
+	if (!slot_top_table)
+		return;
+
+	prlog(PR_DEBUG, "PCI: Checking slot table against detected devices\n");
+	check_slot_table(NULL, slot_top_table);
+}