diff mbox series

[v2] PCI: pciehp: Report degraded links via link bandwidth notification

Message ID 20181207182021.16344-1-mr.nuke.me@gmail.com
State Superseded
Delegated to: Bjorn Helgaas
Headers show
Series [v2] PCI: pciehp: Report degraded links via link bandwidth notification | expand

Commit Message

Alex G. Dec. 7, 2018, 6:20 p.m. UTC
A warning is generated when a PCIe device is probed with a degraded
link, but there was no similar mechanism to warn when the link becomes
degraded after probing. The Link Bandwidth Notification provides this
mechanism.

Use the link bandwidth notification interrupt to detect bandwidth
changes, and rescan the bandwidth, looking for the weakest point. This
is the same logic used in probe().

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
---
Changes since v1:
 * Layer on top of the pcie port service drivers, instead of hotplug service.

This patch needs to be applied on top of:
	PCI: Add missing include to drivers/pci.h

Anticipated FAQ:

Q: Why is this unconditionally compiled in?
A: The symmetrical check in pci probe() is also always compiled in.

Q: Why call module_init() instead of adding a call in pcie_init_services() ?
A: A call in pcie_init_services() also requires a prototype in portdrv.h, a
non-static implementation in bw_notification.c. Using module_init() is
functionally equivalent, and takes less code.

Q: Why print only on degraded links and not when a link is back to full speed?
For symmetry with PCI probe(). Although I see a benefit in conveying that a
link is back to full speed, I expect this to be extremely rare. Secondary bus
reset is usually needed to retrain at full bandwidth.


 drivers/pci/pcie/Makefile          |   1 +
 drivers/pci/pcie/bw_notification.c | 107 +++++++++++++++++++++++++++++
 drivers/pci/pcie/portdrv.h         |   4 +-
 drivers/pci/pcie/portdrv_core.c    |  14 ++--
 4 files changed, 121 insertions(+), 5 deletions(-)
 create mode 100644 drivers/pci/pcie/bw_notification.c

Comments

Alex G. Dec. 27, 2018, 7:33 p.m. UTC | #1
On 12/7/18 12:20 PM, Alexandru Gagniuc wrote:
> A warning is generated when a PCIe device is probed with a degraded
> link, but there was no similar mechanism to warn when the link becomes
> degraded after probing. The Link Bandwidth Notification provides this
> mechanism.
> 
> Use the link bandwidth notification interrupt to detect bandwidth
> changes, and rescan the bandwidth, looking for the weakest point. This
> is the same logic used in probe().
> 
> Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
> ---

ping.

> Changes since v1:
>   * Layer on top of the pcie port service drivers, instead of hotplug service.
> 
> This patch needs to be applied on top of:
> 	PCI: Add missing include to drivers/pci.h
> 
> Anticipated FAQ:
> 
> Q: Why is this unconditionally compiled in?
> A: The symmetrical check in pci probe() is also always compiled in.
> 
> Q: Why call module_init() instead of adding a call in pcie_init_services() ?
> A: A call in pcie_init_services() also requires a prototype in portdrv.h, a
> non-static implementation in bw_notification.c. Using module_init() is
> functionally equivalent, and takes less code.
> 
> Q: Why print only on degraded links and not when a link is back to full speed?
> For symmetry with PCI probe(). Although I see a benefit in conveying that a
> link is back to full speed, I expect this to be extremely rare. Secondary bus
> reset is usually needed to retrain at full bandwidth.
> 
> 
>   drivers/pci/pcie/Makefile          |   1 +
>   drivers/pci/pcie/bw_notification.c | 107 +++++++++++++++++++++++++++++
>   drivers/pci/pcie/portdrv.h         |   4 +-
>   drivers/pci/pcie/portdrv_core.c    |  14 ++--
>   4 files changed, 121 insertions(+), 5 deletions(-)
>   create mode 100644 drivers/pci/pcie/bw_notification.c
> 
> diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
> index ab514083d5d4..f1d7bc1e5efa 100644
> --- a/drivers/pci/pcie/Makefile
> +++ b/drivers/pci/pcie/Makefile
> @@ -3,6 +3,7 @@
>   # Makefile for PCI Express features and port driver
>   
>   pcieportdrv-y			:= portdrv_core.o portdrv_pci.o err.o
> +pcieportdrv-y			+= bw_notification.o
>   
>   obj-$(CONFIG_PCIEPORTBUS)	+= pcieportdrv.o
>   
> diff --git a/drivers/pci/pcie/bw_notification.c b/drivers/pci/pcie/bw_notification.c
> new file mode 100644
> index 000000000000..64391ea9a172
> --- /dev/null
> +++ b/drivers/pci/pcie/bw_notification.c
> @@ -0,0 +1,107 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * PCI Express Bandwidth notification services driver
> + * Author: Alexandru Gagniuc <mr.nuke.me@gmail.com>
> + *
> + * Copyright (C) 2018, Dell Inc
> + *
> + * The PCIe bandwidth notification provides a way to notify the operating system
> + * when the link width or data rate changes. This capability is required for all
> + * root ports and downstream ports supporting links wider than x1 and/or
> + * multiple link speeds.
> + *
> + * This service port driver hooks into the bandwidth notification interrupt and
> + * warns when links become degraded in operation.
> + */
> +
> +#include <linux/module.h>
> +
> +#include "../pci.h"
> +#include "portdrv.h"
> +
> +static bool pcie_link_bandwidth_notification_supported(struct pci_dev *dev)
> +{
> +	int ret;
> +	u32 lnk_cap;
> +
> +	ret = pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnk_cap);
> +	return (ret == PCIBIOS_SUCCESSFUL) && (lnk_cap & PCI_EXP_LNKCAP_LBNC);
> +}
> +
> +static void pcie_enable_link_bandwidth_notification(struct pci_dev *dev)
> +{
> +	u16 lnk_ctl;
> +
> +	pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
> +	lnk_ctl |= PCI_EXP_LNKCTL_LBMIE;
> +	pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
> +}
> +
> +static void pcie_disable_link_bandwidth_notification(struct pci_dev *dev)
> +{
> +	u16 lnk_ctl;
> +
> +	pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
> +	lnk_ctl &= ~PCI_EXP_LNKCTL_LBMIE;
> +	pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
> +}
> +
> +static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
> +{
> +	struct pcie_device *srv = context;
> +	struct pci_dev *port = srv->port;
> +	struct pci_dev *dev;
> +	u16 link_status, events;
> +	int ret;
> +
> +	ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
> +	events = link_status & PCI_EXP_LNKSTA_LBMS;
> +
> +	if (!events || ret != PCIBIOS_SUCCESSFUL)
> +		return IRQ_NONE;
> +
> +	/* Print status from upstream link partner, not this downstream port. */
> +	list_for_each_entry(dev, &port->subordinate->devices, bus_list)
> +		__pcie_print_link_status(dev, false);
> +
> +	pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
> +	return IRQ_HANDLED;
> +}
> +
> +static int pcie_bandwidth_notification_probe(struct pcie_device *srv)
> +{
> +	int ret;
> +
> +	/* Single-width or single-speed ports do not have to support this. */
> +	if (!pcie_link_bandwidth_notification_supported(srv->port))
> +		return -ENODEV;
> +
> +	ret = devm_request_irq(&srv->device, srv->irq, pcie_bw_notification_irq,
> +			       IRQF_SHARED, "PCIe BW notif", srv);
> +	if (ret)
> +		return ret;
> +
> +	pcie_enable_link_bandwidth_notification(srv->port);
> +
> +	return 0;
> +}
> +
> +static void pcie_bandwidth_notification_remove(struct pcie_device *srv)
> +{
> +	pcie_disable_link_bandwidth_notification(srv->port);
> +}
> +
> +static struct pcie_port_service_driver pcie_bandwidth_notification_driver = {
> +	.name		= "pcie_bw_notification",
> +	.port_type	= PCI_EXP_TYPE_DOWNSTREAM,
> +	.service	= PCIE_PORT_SERVICE_BwNOTIF,
> +	.probe		= pcie_bandwidth_notification_probe,
> +	.remove		= pcie_bandwidth_notification_remove,
> +};
> +
> +static int __init pcie_bandwidth_notification_service_init(void)
> +{
> +	return pcie_port_service_register(&pcie_bandwidth_notification_driver);
> +}
> +
> +module_init(pcie_bandwidth_notification_service_init);
> diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
> index e495f04394d0..46652469ffaa 100644
> --- a/drivers/pci/pcie/portdrv.h
> +++ b/drivers/pci/pcie/portdrv.h
> @@ -20,8 +20,10 @@
>   #define PCIE_PORT_SERVICE_HP		(1 << PCIE_PORT_SERVICE_HP_SHIFT)
>   #define PCIE_PORT_SERVICE_DPC_SHIFT	3	/* Downstream Port Containment */
>   #define PCIE_PORT_SERVICE_DPC		(1 << PCIE_PORT_SERVICE_DPC_SHIFT)
> +#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT	4	/* Bandwidth notification */
> +#define PCIE_PORT_SERVICE_BwNOTIF	(1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
>   
> -#define PCIE_PORT_DEVICE_MAXSERVICES   4
> +#define PCIE_PORT_DEVICE_MAXSERVICES   5
>   
>   #ifdef CONFIG_PCIEAER
>   int pcie_aer_init(void);
> diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
> index f458ac9cb70c..86455ff7ced9 100644
> --- a/drivers/pci/pcie/portdrv_core.c
> +++ b/drivers/pci/pcie/portdrv_core.c
> @@ -99,7 +99,7 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask,
>    */
>   static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
>   {
> -	int nr_entries, nvec;
> +	int nr_entries, nvec, pcie_irq;
>   	u32 pme = 0, aer = 0, dpc = 0;
>   
>   	/* Allocate the maximum possible number of MSI/MSI-X vectors */
> @@ -136,9 +136,12 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
>   	}
>   
>   	/* PME and hotplug share an MSI/MSI-X vector */
> -	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) {
> -		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pci_irq_vector(dev, pme);
> -		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pci_irq_vector(dev, pme);
> +	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
> +		    PCIE_PORT_SERVICE_BwNOTIF)) {
> +		pcie_irq = pci_irq_vector(dev, pme);
> +		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
> +		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
> +		irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
>   	}
>   
>   	if (mask & PCIE_PORT_SERVICE_AER)
> @@ -250,6 +253,9 @@ static int get_port_device_capability(struct pci_dev *dev)
>   	    pci_aer_available() && services & PCIE_PORT_SERVICE_AER)
>   		services |= PCIE_PORT_SERVICE_DPC;
>   
> +	if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)
> +		services |= PCIE_PORT_SERVICE_BwNOTIF;
> +
>   	return services;
>   }
>   
>
Lukas Wunner Feb. 25, 2019, 2:28 a.m. UTC | #2
On Fri, Dec 07, 2018 at 12:20:00PM -0600, Alexandru Gagniuc wrote:
> A warning is generated when a PCIe device is probed with a degraded
> link, but there was no similar mechanism to warn when the link becomes
> degraded after probing. The Link Bandwidth Notification provides this
> mechanism.
> 
> Use the link bandwidth notification interrupt to detect bandwidth
> changes, and rescan the bandwidth, looking for the weakest point. This
> is the same logic used in probe().

This is unrelated to pciehp, so the subject prefix should be changed
to "PCI/portdrv".


> Q: Why is this unconditionally compiled in?
> A: The symmetrical check in pci probe() is also always compiled in.

Hm, it looks like the convention is to provide a separate Kconfig entry
for each port service.


> Q: Why call module_init() instead of adding a call in pcie_init_services() ?
> A: A call in pcie_init_services() also requires a prototype in portdrv.h, a
> non-static implementation in bw_notification.c. Using module_init() is
> functionally equivalent, and takes less code.

Commit c29de84149ab ("PCI: portdrv: Initialize service drivers directly")
moved away from module_init() on purpose, apparently to fix a race
condition.


> Q: Why print only on degraded links and not when a link is back to full speed?
> For symmetry with PCI probe(). Although I see a benefit in conveying that a
> link is back to full speed, I expect this to be extremely rare. Secondary bus
> reset is usually needed to retrain at full bandwidth.

What if the link is retrained at the same speed/width?  Intuitively
I'd compare the speed in the Link Status Register to what is cached
in the cur_bus_speed member of struct pci_bus and print a message
only if the speed has changed.  (Don't we need to cache the width as
well?)


> +static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
> +{
> +	struct pcie_device *srv = context;
> +	struct pci_dev *port = srv->port;
> +	struct pci_dev *dev;
> +	u16 link_status, events;
> +	int ret;
> +
> +	ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
> +	events = link_status & PCI_EXP_LNKSTA_LBMS;
> +
> +	if (!events || ret != PCIBIOS_SUCCESSFUL)
> +		return IRQ_NONE;
> +
> +	/* Print status from upstream link partner, not this downstream port. */
> +	list_for_each_entry(dev, &port->subordinate->devices, bus_list)
> +		__pcie_print_link_status(dev, false);
> +
> +	pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
> +	return IRQ_HANDLED;
> +}

You need to hold pci_bus_sem for the list iteration to protect against
simultaneous removal of child devices.

This may sleep, so request the IRQ with request_threaded_irq(), pass NULL
for the "handler" argument and pcie_bw_notification_irq for the "thread_fn"
argument.

You may want to call pcie_update_link_speed() to update the now stale
speed cached in the pci_bus struct.


> +	ret = devm_request_irq(&srv->device, srv->irq, pcie_bw_notification_irq,
> +			       IRQF_SHARED, "PCIe BW notif", srv);

The plan is to move away from port services as devices in the future,
so device-managed allocations should be avoided.

Apart from that, with a device-managed IRQ, if this service driver is
unbound, its IRQ handler may still be invoked if some other port service
signals an interrupt (because the IRQ is shared).  Which seems wrong.


> +	.port_type      = PCI_EXP_TYPE_DOWNSTREAM,

According to PCIe r4.0, sec 7.8.6, Link Bandwidth Notification Capability
is also required for root ports, so I think you need to match for
PCIE_ANY_PORT and amend pcie_link_bandwidth_notification_supported()
to check that you're dealing with a root or downstream port.


> +	.service	= PCIE_PORT_SERVICE_BwNOTIF,

All caps please.


> @@ -136,9 +136,12 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
>  	}
>  
>  	/* PME and hotplug share an MSI/MSI-X vector */
> -	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) {
> -		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pci_irq_vector(dev, pme);
> -		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pci_irq_vector(dev, pme);
> +	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
> +		    PCIE_PORT_SERVICE_BwNOTIF)) {
> +		pcie_irq = pci_irq_vector(dev, pme);
> +		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
> +		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
> +		irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;

Please amend the code comment with something like

-	/* PME and hotplug share an MSI/MSI-X vector */
+	/* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */


> @@ -250,6 +253,9 @@ static int get_port_device_capability(struct pci_dev *dev)
>  	    pci_aer_available() && services & PCIE_PORT_SERVICE_AER)
>  		services |= PCIE_PORT_SERVICE_DPC;
>  
> +	if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)
> +		services |= PCIE_PORT_SERVICE_BwNOTIF;
> +

Also need to check for root ports, see above.


Otherwise LGTM.

Thanks,

Lukas
Alex_Gagniuc@Dellteam.com Feb. 27, 2019, 8:21 p.m. UTC | #3
On 2/24/19 8:29 PM, Lukas Wunner wrote:
> On Fri, Dec 07, 2018 at 12:20:00PM -0600, Alexandru Gagniuc wrote:
> 
> 
>> Q: Why is this unconditionally compiled in?
>> A: The symmetrical check in pci probe() is also always compiled in.
> 
> Hm, it looks like the convention is to provide a separate Kconfig entry
> for each port service.

Does the convention still make sense in light of the symmetry reason?

>> Q: Why call module_init() instead of adding a call in pcie_init_services() ?
>> A: A call in pcie_init_services() also requires a prototype in portdrv.h, a
>> non-static implementation in bw_notification.c. Using module_init() is
>> functionally equivalent, and takes less code.
> 
> Commit c29de84149ab ("PCI: portdrv: Initialize service drivers directly")
> moved away from module_init() on purpose, apparently to fix a race
> condition.

*GROWL*


> What if the link is retrained at the same speed/width?  Intuitively
> I'd compare the speed in the Link Status Register to what is cached
> in the cur_bus_speed member of struct pci_bus and print a message
> only if the speed has changed.  (Don't we need to cache the width as
> well?)

There are two mechanisms to bring a degraded link back to full BW.
   1. Secondary bus reset, which results in the device being tore down 
along with our cached speed value.
   2. Set the PCI_EXP_LNKCTL_LD (Link Disable) bit and clear it. We do 
that as part of the pciehp teardown path. We'd lose our cached value 
just like in the first case.

>> +static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
>> [...]
> 
> You need to hold pci_bus_sem [...]
> This may sleep, so request the IRQ with request_threaded_irq() [...]

Good catch! Thanks!
All other issues you pointed out should be resolved in next version.

Alex
Lukas Wunner Feb. 28, 2019, 6:43 a.m. UTC | #4
On Wed, Feb 27, 2019 at 08:21:58PM +0000, Alex_Gagniuc@Dellteam.com wrote:
> On 2/24/19 8:29 PM, Lukas Wunner wrote:
> > On Fri, Dec 07, 2018 at 12:20:00PM -0600, Alexandru Gagniuc wrote:
> > > Q: Why is this unconditionally compiled in?
> > > A: The symmetrical check in pci probe() is also always compiled in.
> > 
> > Hm, it looks like the convention is to provide a separate Kconfig entry
> > for each port service.
> 
> Does the convention still make sense in light of the symmetry reason?

I don't know.  In the past we had OpenWRT/LEDE folks complain that
the PCI core is hogging too much memory on their space-constrained
Mips routers, so they #ifdef'ed a lot of x86-specific stuff in
quirks.c.  In the same vein they could argue that they'd like to
disable unneeded PCIe services.  That space argument and possibly
the reduction of attack surface on Linux firewalls is the only
justification I can come up with for a separate Kconfig option for
bandwidth notification.

Thanks,

Lukas
diff mbox series

Patch

diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index ab514083d5d4..f1d7bc1e5efa 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -3,6 +3,7 @@ 
 # Makefile for PCI Express features and port driver
 
 pcieportdrv-y			:= portdrv_core.o portdrv_pci.o err.o
+pcieportdrv-y			+= bw_notification.o
 
 obj-$(CONFIG_PCIEPORTBUS)	+= pcieportdrv.o
 
diff --git a/drivers/pci/pcie/bw_notification.c b/drivers/pci/pcie/bw_notification.c
new file mode 100644
index 000000000000..64391ea9a172
--- /dev/null
+++ b/drivers/pci/pcie/bw_notification.c
@@ -0,0 +1,107 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PCI Express Bandwidth notification services driver
+ * Author: Alexandru Gagniuc <mr.nuke.me@gmail.com>
+ *
+ * Copyright (C) 2018, Dell Inc
+ *
+ * The PCIe bandwidth notification provides a way to notify the operating system
+ * when the link width or data rate changes. This capability is required for all
+ * root ports and downstream ports supporting links wider than x1 and/or
+ * multiple link speeds.
+ *
+ * This service port driver hooks into the bandwidth notification interrupt and
+ * warns when links become degraded in operation.
+ */
+
+#include <linux/module.h>
+
+#include "../pci.h"
+#include "portdrv.h"
+
+static bool pcie_link_bandwidth_notification_supported(struct pci_dev *dev)
+{
+	int ret;
+	u32 lnk_cap;
+
+	ret = pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnk_cap);
+	return (ret == PCIBIOS_SUCCESSFUL) && (lnk_cap & PCI_EXP_LNKCAP_LBNC);
+}
+
+static void pcie_enable_link_bandwidth_notification(struct pci_dev *dev)
+{
+	u16 lnk_ctl;
+
+	pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
+	lnk_ctl |= PCI_EXP_LNKCTL_LBMIE;
+	pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
+}
+
+static void pcie_disable_link_bandwidth_notification(struct pci_dev *dev)
+{
+	u16 lnk_ctl;
+
+	pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
+	lnk_ctl &= ~PCI_EXP_LNKCTL_LBMIE;
+	pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
+}
+
+static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
+{
+	struct pcie_device *srv = context;
+	struct pci_dev *port = srv->port;
+	struct pci_dev *dev;
+	u16 link_status, events;
+	int ret;
+
+	ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
+	events = link_status & PCI_EXP_LNKSTA_LBMS;
+
+	if (!events || ret != PCIBIOS_SUCCESSFUL)
+		return IRQ_NONE;
+
+	/* Print status from upstream link partner, not this downstream port. */
+	list_for_each_entry(dev, &port->subordinate->devices, bus_list)
+		__pcie_print_link_status(dev, false);
+
+	pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
+	return IRQ_HANDLED;
+}
+
+static int pcie_bandwidth_notification_probe(struct pcie_device *srv)
+{
+	int ret;
+
+	/* Single-width or single-speed ports do not have to support this. */
+	if (!pcie_link_bandwidth_notification_supported(srv->port))
+		return -ENODEV;
+
+	ret = devm_request_irq(&srv->device, srv->irq, pcie_bw_notification_irq,
+			       IRQF_SHARED, "PCIe BW notif", srv);
+	if (ret)
+		return ret;
+
+	pcie_enable_link_bandwidth_notification(srv->port);
+
+	return 0;
+}
+
+static void pcie_bandwidth_notification_remove(struct pcie_device *srv)
+{
+	pcie_disable_link_bandwidth_notification(srv->port);
+}
+
+static struct pcie_port_service_driver pcie_bandwidth_notification_driver = {
+	.name		= "pcie_bw_notification",
+	.port_type	= PCI_EXP_TYPE_DOWNSTREAM,
+	.service	= PCIE_PORT_SERVICE_BwNOTIF,
+	.probe		= pcie_bandwidth_notification_probe,
+	.remove		= pcie_bandwidth_notification_remove,
+};
+
+static int __init pcie_bandwidth_notification_service_init(void)
+{
+	return pcie_port_service_register(&pcie_bandwidth_notification_driver);
+}
+
+module_init(pcie_bandwidth_notification_service_init);
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index e495f04394d0..46652469ffaa 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -20,8 +20,10 @@ 
 #define PCIE_PORT_SERVICE_HP		(1 << PCIE_PORT_SERVICE_HP_SHIFT)
 #define PCIE_PORT_SERVICE_DPC_SHIFT	3	/* Downstream Port Containment */
 #define PCIE_PORT_SERVICE_DPC		(1 << PCIE_PORT_SERVICE_DPC_SHIFT)
+#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT	4	/* Bandwidth notification */
+#define PCIE_PORT_SERVICE_BwNOTIF	(1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
 
-#define PCIE_PORT_DEVICE_MAXSERVICES   4
+#define PCIE_PORT_DEVICE_MAXSERVICES   5
 
 #ifdef CONFIG_PCIEAER
 int pcie_aer_init(void);
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index f458ac9cb70c..86455ff7ced9 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -99,7 +99,7 @@  static int pcie_message_numbers(struct pci_dev *dev, int mask,
  */
 static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
 {
-	int nr_entries, nvec;
+	int nr_entries, nvec, pcie_irq;
 	u32 pme = 0, aer = 0, dpc = 0;
 
 	/* Allocate the maximum possible number of MSI/MSI-X vectors */
@@ -136,9 +136,12 @@  static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
 	}
 
 	/* PME and hotplug share an MSI/MSI-X vector */
-	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) {
-		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pci_irq_vector(dev, pme);
-		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pci_irq_vector(dev, pme);
+	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
+		    PCIE_PORT_SERVICE_BwNOTIF)) {
+		pcie_irq = pci_irq_vector(dev, pme);
+		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
+		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
+		irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
 	}
 
 	if (mask & PCIE_PORT_SERVICE_AER)
@@ -250,6 +253,9 @@  static int get_port_device_capability(struct pci_dev *dev)
 	    pci_aer_available() && services & PCIE_PORT_SERVICE_AER)
 		services |= PCIE_PORT_SERVICE_DPC;
 
+	if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)
+		services |= PCIE_PORT_SERVICE_BwNOTIF;
+
 	return services;
 }