[v4,2/7] linux/pci: Add uevents in AER and EEH error/resume

Message ID 20180105164552.36371-3-bryantly@linux.vnet.ibm.com
State New
Headers show
Series
  • SR-IOV Enablement on PowerVM
Related show

Commit Message

Bryant G. Ly Jan. 5, 2018, 4:45 p.m.
Devices can go offline when errors are reported. This
patch emits a change uevent on the device's kernel object to let
udev know of the error. When the device resumes, a change uevent is
also sent reporting the device as online. Therefore, EEH and AER events
are better propagated to user space for PCI devices on
all arches.

Signed-off-by: Bryant G. Ly <bryantly@linux.vnet.ibm.com>
Signed-off-by: Juan J. Alvarez <jjalvare@linux.vnet.ibm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/powerpc/kernel/eeh_driver.c   |  6 ++++++
 drivers/pci/pcie/aer/aerdrv_core.c |  3 +++
 include/linux/pci.h                | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+)

Comments

Bjorn Helgaas Jan. 5, 2018, 6:15 p.m. | #1
I doubt "linux/pci: " matches the powerpc convention and I know it doesn't
match the drivers/pci convention.

I'd suggest matching one or the other.  In drivers/pci I would be using
"PCI/AER: ".

On Jan 5, 2018 10:46 AM, "Bryant G. Ly" <bryantly@linux.vnet.ibm.com> wrote:

> Devices can go offline when errors are reported. This
> patch adds a change to the kernel object and lets udev
> know of error. When device resumes, a change is also set
> reporting device as online. Therefore, EEH and AER events
> are better propagated to user space for PCI devices in
> all arches.
>
> Signed-off-by: Bryant G. Ly <bryantly@linux.vnet.ibm.com>
> Signed-off-by: Juan J. Alvarez <jjalvare@linux.vnet.ibm.com>
> Acked-by: Bjorn Helgaas <bhelgaas@google.com>
> ---
>  arch/powerpc/kernel/eeh_driver.c   |  6 ++++++
>  drivers/pci/pcie/aer/aerdrv_core.c |  3 +++
>  include/linux/pci.h                | 36 ++++++++++++++++++++++++++++++
> ++++++
>  3 files changed, 45 insertions(+)
>
> diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_
> driver.c
> index 3c0fa99c5533..beea2182d754 100644
> --- a/arch/powerpc/kernel/eeh_driver.c
> +++ b/arch/powerpc/kernel/eeh_driver.c
> @@ -228,6 +228,7 @@ static void *eeh_report_error(void *data, void
> *userdata)
>
>         edev->in_error = true;
>         eeh_pcid_put(dev);
> +       pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
>         return NULL;
>  }
>
> @@ -381,6 +382,10 @@ static void *eeh_report_resume(void *data, void
> *userdata)
>         driver->err_handler->resume(dev);
>
>         eeh_pcid_put(dev);
> +       pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
> +#ifdef CONFIG_PCI_IOV
> +       eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
> +#endif
>         return NULL;
>  }
>
> @@ -416,6 +421,7 @@ static void *eeh_report_failure(void *data, void
> *userdata)
>         driver->err_handler->error_detected(dev,
> pci_channel_io_perm_failure);
>
>         eeh_pcid_put(dev);
> +       pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
>         return NULL;
>  }
>
> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c
> b/drivers/pci/pcie/aer/aerdrv_core.c
> index 744805232155..8d7448063fd1 100644
> --- a/drivers/pci/pcie/aer/aerdrv_core.c
> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> @@ -278,6 +278,7 @@ static int report_error_detected(struct pci_dev *dev,
> void *data)
>         } else {
>                 err_handler = dev->driver->err_handler;
>                 vote = err_handler->error_detected(dev,
> result_data->state);
> +               pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
>         }
>
>         result_data->result = merge_result(result_data->result, vote);
> @@ -341,6 +342,7 @@ static int report_resume(struct pci_dev *dev, void
> *data)
>
>         err_handler = dev->driver->err_handler;
>         err_handler->resume(dev);
> +       pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
>  out:
>         device_unlock(&dev->dev);
>         return 0;
> @@ -541,6 +543,7 @@ static void do_recovery(struct pci_dev *dev, int
> severity)
>         return;
>
>  failed:
> +       pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
>         /* TODO: Should kernel panic here? */
>         dev_info(&dev->dev, "AER: Device recovery failed\n");
>  }
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index e3e94467687a..405630441b74 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -2277,6 +2277,42 @@ static inline bool pci_is_thunderbolt_attached(struct
> pci_dev *pdev)
>         return false;
>  }
>
> +/**
> + * pci_uevent_ers - emit a uevent during recovery path of pci device
> + * @pdev: pci device to check
> + * @err_type: type of error event
> + *
> + */
> +static inline void pci_uevent_ers(struct pci_dev *pdev,
> +                                 enum  pci_ers_result err_type)
> +{
> +       int idx = 0;
> +       char *envp[3];
> +
> +       switch (err_type) {
> +       case PCI_ERS_RESULT_NONE:
> +       case PCI_ERS_RESULT_CAN_RECOVER:
> +               envp[idx++] = "ERROR_EVENT=BEGIN_RECOVERY";
> +               envp[idx++] = "DEVICE_ONLINE=0";
> +               break;
> +       case PCI_ERS_RESULT_RECOVERED:
> +               envp[idx++] = "ERROR_EVENT=SUCCESSFUL_RECOVERY";
> +               envp[idx++] = "DEVICE_ONLINE=1";
> +               break;
> +       case PCI_ERS_RESULT_DISCONNECT:
> +               envp[idx++] = "ERROR_EVENT=FAILED_RECOVERY";
> +               envp[idx++] = "DEVICE_ONLINE=0";
> +               break;
> +       default:
> +               break;
> +       }
> +
> +       if (idx > 0) {
> +               envp[idx++] = NULL;
> +               kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, envp);
> +       }
> +}
> +
>  /* provide the legacy pci_dma_* API */
>  #include <linux/pci-dma-compat.h>
>
> --
> 2.14.3 (Apple Git-98)
>
>
<div dir="auto">I doubt &quot;linux/pci: &quot; matches the powerpc convention and I know it doesn&#39;t match the drivers/pci convention.<div dir="auto"><br></div><div dir="auto">I&#39;d suggest matching one or the other.  In drivers/pci I would be using &quot;PCI/AER: &quot;.</div></div><div class="gmail_extra"><br><div class="gmail_quote">On Jan 5, 2018 10:46 AM, &quot;Bryant G. Ly&quot; &lt;<a href="mailto:bryantly@linux.vnet.ibm.com">bryantly@linux.vnet.ibm.com</a>&gt; wrote:<br type="attribution"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Devices can go offline when erors reported. This<br>
patch adds a change to the kernel object and lets udev<br>
know of error. When device resumes, a change is also set<br>
reporting device as online. Therefore, EEH and AER events<br>
are better propagated to user space for PCI devices in<br>
all arches.<br>
<br>
Signed-off-by: Bryant G. Ly &lt;<a href="mailto:bryantly@linux.vnet.ibm.com">bryantly@linux.vnet.ibm.com</a>&gt;<br>
Signed-off-by: Juan J. Alvarez &lt;<a href="mailto:jjalvare@linux.vnet.ibm.com">jjalvare@linux.vnet.ibm.com</a>&gt;<br>
Acked-by: Bjorn Helgaas &lt;<a href="mailto:bhelgaas@google.com">bhelgaas@google.com</a>&gt;<br>
---<br>
 arch/powerpc/kernel/eeh_<wbr>driver.c   |  6 ++++++<br>
 drivers/pci/pcie/aer/aerdrv_<wbr>core.c |  3 +++<br>
 include/linux/pci.h                | 36 ++++++++++++++++++++++++++++++<wbr>++++++<br>
 3 files changed, 45 insertions(+)<br>
<br>
diff --git a/arch/powerpc/kernel/eeh_<wbr>driver.c b/arch/powerpc/kernel/eeh_<wbr>driver.c<br>
index 3c0fa99c5533..beea2182d754 100644<br>
--- a/arch/powerpc/kernel/eeh_<wbr>driver.c<br>
+++ b/arch/powerpc/kernel/eeh_<wbr>driver.c<br>
@@ -228,6 +228,7 @@ static void *eeh_report_error(void *data, void *userdata)<br>
<br>
        edev-&gt;in_error = true;<br>
        eeh_pcid_put(dev);<br>
+       pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);<br>
        return NULL;<br>
 }<br>
<br>
@@ -381,6 +382,10 @@ static void *eeh_report_resume(void *data, void *userdata)<br>
        driver-&gt;err_handler-&gt;resume(<wbr>dev);<br>
<br>
        eeh_pcid_put(dev);<br>
+       pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);<br>
+#ifdef CONFIG_PCI_IOV<br>
+       eeh_ops-&gt;notify_resume(eeh_<wbr>dev_to_pdn(edev));<br>
+#endif<br>
        return NULL;<br>
 }<br>
<br>
@@ -416,6 +421,7 @@ static void *eeh_report_failure(void *data, void *userdata)<br>
        driver-&gt;err_handler-&gt;error_<wbr>detected(dev, pci_channel_io_perm_failure);<br>
<br>
        eeh_pcid_put(dev);<br>
+       pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);<br>
        return NULL;<br>
 }<br>
<br>
diff --git a/drivers/pci/pcie/aer/aerdrv_<wbr>core.c b/drivers/pci/pcie/aer/aerdrv_<wbr>core.c<br>
index 744805232155..8d7448063fd1 100644<br>
--- a/drivers/pci/pcie/aer/aerdrv_<wbr>core.c<br>
+++ b/drivers/pci/pcie/aer/aerdrv_<wbr>core.c<br>
@@ -278,6 +278,7 @@ static int report_error_detected(struct pci_dev *dev, void *data)<br>
        } else {<br>
                err_handler = dev-&gt;driver-&gt;err_handler;<br>
                vote = err_handler-&gt;error_detected(<wbr>dev, result_data-&gt;state);<br>
+               pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);<br>
        }<br>
<br>
        result_data-&gt;result = merge_result(result_data-&gt;<wbr>result, vote);<br>
@@ -341,6 +342,7 @@ static int report_resume(struct pci_dev *dev, void *data)<br>
<br>
        err_handler = dev-&gt;driver-&gt;err_handler;<br>
        err_handler-&gt;resume(dev);<br>
+       pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);<br>
 out:<br>
        device_unlock(&amp;dev-&gt;dev);<br>
        return 0;<br>
@@ -541,6 +543,7 @@ static void do_recovery(struct pci_dev *dev, int severity)<br>
        return;<br>
<br>
 failed:<br>
+       pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);<br>
        /* TODO: Should kernel panic here? */<br>
        dev_info(&amp;dev-&gt;dev, &quot;AER: Device recovery failed\n&quot;);<br>
 }<br>
diff --git a/include/linux/pci.h b/include/linux/pci.h<br>
index e3e94467687a..405630441b74 100644<br>
--- a/include/linux/pci.h<br>
+++ b/include/linux/pci.h<br>
@@ -2277,6 +2277,42 @@ static inline bool pci_is_thunderbolt_attached(<wbr>struct pci_dev *pdev)<br>
        return false;<br>
 }<br>
<br>
+/**<br>
+ * pci_uevent_ers - emit a uevent during recovery path of pci device<br>
+ * @pdev: pci device to check<br>
+ * @err_type: type of error event<br>
+ *<br>
+ */<br>
+static inline void pci_uevent_ers(struct pci_dev *pdev,<br>
+                                 enum  pci_ers_result err_type)<br>
+{<br>
+       int idx = 0;<br>
+       char *envp[3];<br>
+<br>
+       switch (err_type) {<br>
+       case PCI_ERS_RESULT_NONE:<br>
+       case PCI_ERS_RESULT_CAN_RECOVER:<br>
+               envp[idx++] = &quot;ERROR_EVENT=BEGIN_RECOVERY&quot;;<br>
+               envp[idx++] = &quot;DEVICE_ONLINE=0&quot;;<br>
+               break;<br>
+       case PCI_ERS_RESULT_RECOVERED:<br>
+               envp[idx++] = &quot;ERROR_EVENT=SUCCESSFUL_<wbr>RECOVERY&quot;;<br>
+               envp[idx++] = &quot;DEVICE_ONLINE=1&quot;;<br>
+               break;<br>
+       case PCI_ERS_RESULT_DISCONNECT:<br>
+               envp[idx++] = &quot;ERROR_EVENT=FAILED_RECOVERY&quot;;<br>
+               envp[idx++] = &quot;DEVICE_ONLINE=0&quot;;<br>
+               break;<br>
+       default:<br>
+               break;<br>
+       }<br>
+<br>
+       if (idx &gt; 0) {<br>
+               envp[idx++] = NULL;<br>
+               kobject_uevent_env(&amp;pdev-&gt;dev.<wbr>kobj, KOBJ_CHANGE, envp);<br>
+       }<br>
+}<br>
+<br>
 /* provide the legacy pci_dma_* API */<br>
 #include &lt;linux/pci-dma-compat.h&gt;<br>
<br>
--<br>
2.14.3 (Apple Git-98)<br>
<br>
</blockquote></div></div>

Patch

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 3c0fa99c5533..beea2182d754 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -228,6 +228,7 @@  static void *eeh_report_error(void *data, void *userdata)
 
 	edev->in_error = true;
 	eeh_pcid_put(dev);
+	pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
 	return NULL;
 }
 
@@ -381,6 +382,10 @@  static void *eeh_report_resume(void *data, void *userdata)
 	driver->err_handler->resume(dev);
 
 	eeh_pcid_put(dev);
+	pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
+#ifdef CONFIG_PCI_IOV
+	eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
+#endif
 	return NULL;
 }
 
@@ -416,6 +421,7 @@  static void *eeh_report_failure(void *data, void *userdata)
 	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
 
 	eeh_pcid_put(dev);
+	pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
 	return NULL;
 }
 
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 744805232155..8d7448063fd1 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -278,6 +278,7 @@  static int report_error_detected(struct pci_dev *dev, void *data)
 	} else {
 		err_handler = dev->driver->err_handler;
 		vote = err_handler->error_detected(dev, result_data->state);
+		pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
 	}
 
 	result_data->result = merge_result(result_data->result, vote);
@@ -341,6 +342,7 @@  static int report_resume(struct pci_dev *dev, void *data)
 
 	err_handler = dev->driver->err_handler;
 	err_handler->resume(dev);
+	pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
 out:
 	device_unlock(&dev->dev);
 	return 0;
@@ -541,6 +543,7 @@  static void do_recovery(struct pci_dev *dev, int severity)
 	return;
 
 failed:
+	pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
 	/* TODO: Should kernel panic here? */
 	dev_info(&dev->dev, "AER: Device recovery failed\n");
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e3e94467687a..405630441b74 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2277,6 +2277,42 @@  static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev)
 	return false;
 }
 
+/**
+ * pci_uevent_ers - emit a KOBJ_CHANGE uevent on the PCI error recovery path
+ * @pdev: PCI device whose kobject the uevent is sent for
+ * @err_type: recovery result that selects the ERROR_EVENT/DEVICE_ONLINE pair;
+ *            results not handled by the switch emit no uevent
+ */
+static inline void pci_uevent_ers(struct pci_dev *pdev,
+				  enum  pci_ers_result err_type)
+{
+	int idx = 0;
+	char *envp[3];
+
+	switch (err_type) {
+	case PCI_ERS_RESULT_NONE:
+	case PCI_ERS_RESULT_CAN_RECOVER:
+		envp[idx++] = "ERROR_EVENT=BEGIN_RECOVERY";
+		envp[idx++] = "DEVICE_ONLINE=0";
+		break;
+	case PCI_ERS_RESULT_RECOVERED:
+		envp[idx++] = "ERROR_EVENT=SUCCESSFUL_RECOVERY";
+		envp[idx++] = "DEVICE_ONLINE=1";
+		break;
+	case PCI_ERS_RESULT_DISCONNECT:
+		envp[idx++] = "ERROR_EVENT=FAILED_RECOVERY";
+		envp[idx++] = "DEVICE_ONLINE=0";
+		break;
+	default:
+		break;
+	}
+
+	if (idx > 0) {
+		envp[idx++] = NULL;
+		kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, envp);
+	}
+}
+
 /* provide the legacy pci_dma_* API */
 #include <linux/pci-dma-compat.h>