diff mbox

[v2,4/5] hw/phb3: add host sync notifier to trigger creset/CAPP disable on kexec

Message ID a5ef766f4dd1fb9e1ff99bc6d115b859c3f738d7.1484284153.git-series.andrew.donnellan@au1.ibm.com
State Superseded
Headers show

Commit Message

Andrew Donnellan Jan. 13, 2017, 5:09 a.m. UTC
To support kexec in Linux, we need to trigger a creset to disable CAPP mode
on each PHB that has been attached to a CAPP.

Add a host sync notifier, phb3_host_sync_reset(), that will be triggered by
the opal_sync_host_reboot() call that Linux makes when "shutting down" a
powernv system (this includes bringing the system down to prepare it for
kexec). This notifier will trigger a creset only on PHBs that need it, and
will poll regularly until the creset completes.

This approach is somewhat hacky, as it's somewhat of an abuse of the host
sync notifier system (IMHO), but it seems the most obvious way to
ensure that the reset/CAPP disable occurs that will work with old kernel
versions and not require additional support on the kernel side.

Suggested-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

---

v1->v2:
* Add explanatory comment about use of host sync notifier (suggested by
Fred)
---
 hw/phb3.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

Comments

Gavin Shan Jan. 15, 2017, 11:51 p.m. UTC | #1
On Fri, Jan 13, 2017 at 04:09:40PM +1100, Andrew Donnellan wrote:
>To support kexec in Linux, we need to trigger a creset to disable CAPP mode
>on each PHB that has been attached to a CAPP.
>
>Add a host sync notifier, phb3_host_sync_reset(), that will be triggered by
>the opal_sync_host_reboot() call that Linux makes when "shutting down" a
>powernv system (this includes bringing the system down to prepare it for
>kexec). This notifier will trigger a creset only on PHBs that need it, and
>will poll regularly until the creset completes.
>
>This approach is somewhat hacky, as it's somewhat of an abuse of the host
>sync notifier system (IMHO), but it seems the most obvious way to
>ensure that the reset/CAPP disable occurs that will work with old kernel
>versions and not require additional support on the kernel side.
>
>Suggested-by: Stewart Smith <stewart@linux.vnet.ibm.com>
>Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
>
>---
>
>v1->v2:
>* Add explanatory comment about use of host sync notifier (suggested by
>Fred)
>---
> hw/phb3.c | 40 ++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 40 insertions(+)
>
>diff --git a/hw/phb3.c b/hw/phb3.c
>index 87dc87f..05221a4 100644
>--- a/hw/phb3.c
>+++ b/hw/phb3.c
>@@ -4622,6 +4622,44 @@ static bool phb3_calculate_windows(struct phb3 *p)
> 	return true;
> }
> 
>+/*
>+ * Trigger a creset to disable CAPI mode on kernel shutdown.
>+ *
>+ * This helper is called repeatedly by the host sync notifier mechanism, which
>+ * relies on the kernel to regularly poll the OPAL_SYNC_HOST_REBOOT call as it
>+ * shuts down.
>+ *
>+ * This is a somewhat hacky abuse of the host sync notifier mechanism, but the
>+ * alternatives require a new API call which won't work for older kernels.
>+ */
>+static bool phb3_host_sync_reset(void *data)
>+{
>+	struct phb3 *p = (struct phb3 *)data;
>+	struct pci_slot *slot = p->phb.slot;
>+	struct proc_chip *chip = get_chip(p->chip_id);
>+	int64_t rc;
>+
>+	switch (slot->state) {
>+	case PHB3_SLOT_NORMAL:
>+		lock(&capi_lock);
>+		rc = (chip->capp_phb3_attached_mask & (1 << p->index)) ?
>+			OPAL_PHB_CAPI_MODE_CAPI :
>+			OPAL_PHB_CAPI_MODE_PCIE;
>+		unlock(&capi_lock);
>+
>+		if (rc == OPAL_PHB_CAPI_MODE_PCIE)
>+			return true;
>+
>+		PHBINF(p, "PHB in CAPI mode, resetting\n");
>+		p->flags &= ~PHB3_CAPP_RECOVERY;
>+		phb3_creset(slot);
>+		return false;
>+	default:
>+		rc = slot->ops.poll(slot);
>+		return rc == OPAL_SUCCESS;

The code seems incorrect. In opal_sync_host_reboot(), OPAL_BUSY_EVENT
is returned to Linux if any notifier returns false. Linux delays 10ms
and call opal_sync_host_reboot() again, meaning all notifiers will be
triggered (including phb3_host_sync_reset()) again.

When error is returned from slot->ops.poll(slot), the PHB might be put
to non_PHB3_SLOT_NORMAL state. false is returned on the error. This
function is called again and jump to default case. It's obviously not
what we want. So the code would be:

		return rc <= OPAL_SUCCESS;

Anyway, I think opal_sync_host_reboot() needs enhancement to avoid
triggering all notifier on failure from any of them. It's something
out of scope though.

>+	}
>+}
>+
> static void phb3_create(struct dt_node *np)
> {
> 	const struct dt_property *prop;
>@@ -4755,6 +4793,8 @@ static void phb3_create(struct dt_node *np)
> 	/* Load capp microcode into capp unit */
> 	capp_load_ucode(p);
> 
>+	opal_add_host_sync_notifier(phb3_host_sync_reset, p);
>+
> 	/* Platform additional setup */
> 	if (platform.pci_setup_phb)
> 		platform.pci_setup_phb(&p->phb, p->index);

Thanks,
Gavin
Andrew Donnellan Jan. 16, 2017, 8:30 a.m. UTC | #2
On 16/01/17 10:51, Gavin Shan wrote:
>> +	switch (slot->state) {
>> +	case PHB3_SLOT_NORMAL:
>> +		lock(&capi_lock);
>> +		rc = (chip->capp_phb3_attached_mask & (1 << p->index)) ?
>> +			OPAL_PHB_CAPI_MODE_CAPI :
>> +			OPAL_PHB_CAPI_MODE_PCIE;
>> +		unlock(&capi_lock);
>> +
>> +		if (rc == OPAL_PHB_CAPI_MODE_PCIE)
>> +			return true;
>> +
>> +		PHBINF(p, "PHB in CAPI mode, resetting\n");
>> +		p->flags &= ~PHB3_CAPP_RECOVERY;
>> +		phb3_creset(slot);
>> +		return false;
>> +	default:
>> +		rc = slot->ops.poll(slot);
>> +		return rc == OPAL_SUCCESS;
>
> The code seems incorrect. In opal_sync_host_reboot(), OPAL_BUSY_EVENT
> is returned to Linux if any notifier returns false. Linux delays 10ms
> and call opal_sync_host_reboot() again, meaning all notifiers will be
> triggered (including phb3_host_sync_reset()) again.
>
> When error is returned from slot->ops.poll(slot), the PHB might be put
> to non_PHB3_SLOT_NORMAL state. false is returned on the error. This
> function is called again and jump to default case. It's obviously not
> what we want. So the code would be:
>
> 		return rc <= OPAL_SUCCESS;

Good catch, will investigate further tomorrow.

>
> Anyway, I think opal_sync_host_reboot() needs enhancement to avoid
> triggering all notifier on failure from any of them. It's something
> out of scope though.

opal_sync_host_reboot() wasn't exactly designed with "failure" in mind - 
the reason I describe this as an abuse of the host sync notifier system 
is that the other notifiers just flush buffers rather than doing 
anything complex like this.
diff mbox

Patch

diff --git a/hw/phb3.c b/hw/phb3.c
index 87dc87f..05221a4 100644
--- a/hw/phb3.c
+++ b/hw/phb3.c
@@ -4622,6 +4622,44 @@  static bool phb3_calculate_windows(struct phb3 *p)
 	return true;
 }
 
+/*
+ * Trigger a creset to disable CAPI mode on kernel shutdown.
+ *
+ * This helper is called repeatedly by the host sync notifier mechanism, which
+ * relies on the kernel to regularly poll the OPAL_SYNC_HOST_REBOOT call as it
+ * shuts down.
+ *
+ * This is a somewhat hacky abuse of the host sync notifier mechanism, but the
+ * alternatives require a new API call which won't work for older kernels.
+ */
+static bool phb3_host_sync_reset(void *data)
+{
+	struct phb3 *p = (struct phb3 *)data;
+	struct pci_slot *slot = p->phb.slot;
+	struct proc_chip *chip = get_chip(p->chip_id);
+	int64_t rc;
+
+	switch (slot->state) {
+	case PHB3_SLOT_NORMAL:
+		lock(&capi_lock);
+		rc = (chip->capp_phb3_attached_mask & (1 << p->index)) ?
+			OPAL_PHB_CAPI_MODE_CAPI :
+			OPAL_PHB_CAPI_MODE_PCIE;
+		unlock(&capi_lock);
+
+		if (rc == OPAL_PHB_CAPI_MODE_PCIE)
+			return true;
+
+		PHBINF(p, "PHB in CAPI mode, resetting\n");
+		p->flags &= ~PHB3_CAPP_RECOVERY;
+		phb3_creset(slot);
+		return false;
+	default:
+		rc = slot->ops.poll(slot);
+		return rc == OPAL_SUCCESS;
+	}
+}
+
 static void phb3_create(struct dt_node *np)
 {
 	const struct dt_property *prop;
@@ -4755,6 +4793,8 @@  static void phb3_create(struct dt_node *np)
 	/* Load capp microcode into capp unit */
 	capp_load_ucode(p);
 
+	opal_add_host_sync_notifier(phb3_host_sync_reset, p);
+
 	/* Platform additional setup */
 	if (platform.pci_setup_phb)
 		platform.pci_setup_phb(&p->phb, p->index);