diff mbox series

Revert "PCI/LINK: Report degraded links via link bandwidth notification"

Message ID 20190429185611.121751-2-helgaas@kernel.org
State Not Applicable
Delegated to: Bjorn Helgaas
Headers show
Series Revert "PCI/LINK: Report degraded links via link bandwidth notification" | expand

Commit Message

Bjorn Helgaas April 29, 2019, 6:56 p.m. UTC
From: Bjorn Helgaas <bhelgaas@google.com>

This reverts commit e8303bb7a75c113388badcc49b2a84b4121c1b3e.

e8303bb7a75c added logging whenever a link changed speed or width to a
state that is considered degraded.  Unfortunately, it cannot differentiate
signal integrity-related link changes from those intentionally initiated by
an endpoint driver, including drivers that may live in userspace or VMs
when making use of vfio-pci.  Some GPU drivers actively manage the link
state to save power, which generates a stream of messages like this:

  vfio-pci 0000:07:00.0: 32.000 Gb/s available PCIe bandwidth, limited by 2.5 GT/s x16 link at 0000:00:02.0 (capable of 64.000 Gb/s with 5 GT/s x16 link)

We really *do* want to be alerted when the link bandwidth is reduced
because of hardware failures, but degradation by intentional link state
management is probably far more common, so the signal-to-noise ratio is
currently low.

Until we figure out a way to identify the real problems or silence the
intentional situations, revert the following commits, which include the
initial implementation (e8303bb7a75c) and subsequent fixes:

    e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth notification")
    3e82a7f9031f ("PCI/LINK: Supply IRQ handler so level-triggered IRQs are acked")
    55397ce8df48 ("PCI/LINK: Clear bandwidth notification interrupt before enabling it")
    0fa635aec9ab ("PCI/LINK: Deduplicate bandwidth reports for multi-function devices")

Link: https://lore.kernel.org/lkml/155597243666.19387.1205950870601742062.stgit@gimli.home
Link: https://lore.kernel.org/lkml/155605909349.3575.13433421148215616375.stgit@gimli.home
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
CC: Alexandru Gagniuc <mr.nuke.me@gmail.com>
CC: Lukas Wunner <lukas@wunner.de>
CC: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/pci/pci.h                  |   1 -
 drivers/pci/pcie/Makefile          |   1 -
 drivers/pci/pcie/bw_notification.c | 121 -----------------------------
 drivers/pci/pcie/portdrv.h         |   6 +-
 drivers/pci/pcie/portdrv_core.c    |  17 ++--
 drivers/pci/pcie/portdrv_pci.c     |   1 -
 drivers/pci/probe.c                |   2 +-
 7 files changed, 7 insertions(+), 142 deletions(-)
 delete mode 100644 drivers/pci/pcie/bw_notification.c

Comments

Alex Williamson April 29, 2019, 7:21 p.m. UTC | #1
On Mon, 29 Apr 2019 13:56:11 -0500
Bjorn Helgaas <helgaas@kernel.org> wrote:

> From: Bjorn Helgaas <bhelgaas@google.com>
> 
> This reverts commit e8303bb7a75c113388badcc49b2a84b4121c1b3e.
> 
> e8303bb7a75c added logging whenever a link changed speed or width to a
> state that is considered degraded.  Unfortunately, it cannot differentiate
> signal integrity-related link changes from those intentionally initiated by
> an endpoint driver, including drivers that may live in userspace or VMs
> when making use of vfio-pci.  Some GPU drivers actively manage the link
> state to save power, which generates a stream of messages like this:
> 
>   vfio-pci 0000:07:00.0: 32.000 Gb/s available PCIe bandwidth, limited by 2.5 GT/s x16 link at 0000:00:02.0 (capable of 64.000 Gb/s with 5 GT/s x16 link)
> 
> We really *do* want to be alerted when the link bandwidth is reduced
> because of hardware failures, but degradation by intentional link state
> management is probably far more common, so the signal-to-noise ratio is
> currently low.
> 
> Until we figure out a way to identify the real problems or silence the
> intentional situations, revert the following commits, which include the
> initial implementation (e8303bb7a75c) and subsequent fixes:
> 
>     e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth notification")
>     3e82a7f9031f ("PCI/LINK: Supply IRQ handler so level-triggered IRQs are acked")
>     55397ce8df48 ("PCI/LINK: Clear bandwidth notification interrupt before enabling it")
>     0fa635aec9ab ("PCI/LINK: Deduplicate bandwidth reports for multi-function devices")
> 
> Link: https://lore.kernel.org/lkml/155597243666.19387.1205950870601742062.stgit@gimli.home
> Link: https://lore.kernel.org/lkml/155605909349.3575.13433421148215616375.stgit@gimli.home
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> CC: Alexandru Gagniuc <mr.nuke.me@gmail.com>
> CC: Lukas Wunner <lukas@wunner.de>
> CC: Alex Williamson <alex.williamson@redhat.com>

Unfortunate, but a good choice for 5.1 and hopefully it can be brought
up to par for 5.2.  Some sort of more structured reporting channel,
possibly also with admin &/ driver hooks would be good.

Reviewed-by: Alex Williamson <alex.williamson@redhat.com>

Thanks,
Alex

> ---
>  drivers/pci/pci.h                  |   1 -
>  drivers/pci/pcie/Makefile          |   1 -
>  drivers/pci/pcie/bw_notification.c | 121 -----------------------------
>  drivers/pci/pcie/portdrv.h         |   6 +-
>  drivers/pci/pcie/portdrv_core.c    |  17 ++--
>  drivers/pci/pcie/portdrv_pci.c     |   1 -
>  drivers/pci/probe.c                |   2 +-
>  7 files changed, 7 insertions(+), 142 deletions(-)
>  delete mode 100644 drivers/pci/pcie/bw_notification.c
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index d994839a3e24..224d88634115 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -273,7 +273,6 @@ enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
>  u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed,
>  			   enum pcie_link_width *width);
>  void __pcie_print_link_status(struct pci_dev *dev, bool verbose);
> -void pcie_report_downtraining(struct pci_dev *dev);
>  
>  /* Single Root I/O Virtualization */
>  struct pci_sriov {
> diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
> index f1d7bc1e5efa..ab514083d5d4 100644
> --- a/drivers/pci/pcie/Makefile
> +++ b/drivers/pci/pcie/Makefile
> @@ -3,7 +3,6 @@
>  # Makefile for PCI Express features and port driver
>  
>  pcieportdrv-y			:= portdrv_core.o portdrv_pci.o err.o
> -pcieportdrv-y			+= bw_notification.o
>  
>  obj-$(CONFIG_PCIEPORTBUS)	+= pcieportdrv.o
>  
> diff --git a/drivers/pci/pcie/bw_notification.c b/drivers/pci/pcie/bw_notification.c
> deleted file mode 100644
> index 4fa9e3523ee1..000000000000
> --- a/drivers/pci/pcie/bw_notification.c
> +++ /dev/null
> @@ -1,121 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0+
> -/*
> - * PCI Express Link Bandwidth Notification services driver
> - * Author: Alexandru Gagniuc <mr.nuke.me@gmail.com>
> - *
> - * Copyright (C) 2019, Dell Inc
> - *
> - * The PCIe Link Bandwidth Notification provides a way to notify the
> - * operating system when the link width or data rate changes.  This
> - * capability is required for all root ports and downstream ports
> - * supporting links wider than x1 and/or multiple link speeds.
> - *
> - * This service port driver hooks into the bandwidth notification interrupt
> - * and warns when links become degraded in operation.
> - */
> -
> -#include "../pci.h"
> -#include "portdrv.h"
> -
> -static bool pcie_link_bandwidth_notification_supported(struct pci_dev *dev)
> -{
> -	int ret;
> -	u32 lnk_cap;
> -
> -	ret = pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnk_cap);
> -	return (ret == PCIBIOS_SUCCESSFUL) && (lnk_cap & PCI_EXP_LNKCAP_LBNC);
> -}
> -
> -static void pcie_enable_link_bandwidth_notification(struct pci_dev *dev)
> -{
> -	u16 lnk_ctl;
> -
> -	pcie_capability_write_word(dev, PCI_EXP_LNKSTA, PCI_EXP_LNKSTA_LBMS);
> -
> -	pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
> -	lnk_ctl |= PCI_EXP_LNKCTL_LBMIE;
> -	pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
> -}
> -
> -static void pcie_disable_link_bandwidth_notification(struct pci_dev *dev)
> -{
> -	u16 lnk_ctl;
> -
> -	pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
> -	lnk_ctl &= ~PCI_EXP_LNKCTL_LBMIE;
> -	pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
> -}
> -
> -static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
> -{
> -	struct pcie_device *srv = context;
> -	struct pci_dev *port = srv->port;
> -	u16 link_status, events;
> -	int ret;
> -
> -	ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
> -	events = link_status & PCI_EXP_LNKSTA_LBMS;
> -
> -	if (ret != PCIBIOS_SUCCESSFUL || !events)
> -		return IRQ_NONE;
> -
> -	pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
> -	pcie_update_link_speed(port->subordinate, link_status);
> -	return IRQ_WAKE_THREAD;
> -}
> -
> -static irqreturn_t pcie_bw_notification_handler(int irq, void *context)
> -{
> -	struct pcie_device *srv = context;
> -	struct pci_dev *port = srv->port;
> -	struct pci_dev *dev;
> -
> -	/*
> -	 * Print status from downstream devices, not this root port or
> -	 * downstream switch port.
> -	 */
> -	down_read(&pci_bus_sem);
> -	list_for_each_entry(dev, &port->subordinate->devices, bus_list)
> -		pcie_report_downtraining(dev);
> -	up_read(&pci_bus_sem);
> -
> -	return IRQ_HANDLED;
> -}
> -
> -static int pcie_bandwidth_notification_probe(struct pcie_device *srv)
> -{
> -	int ret;
> -
> -	/* Single-width or single-speed ports do not have to support this. */
> -	if (!pcie_link_bandwidth_notification_supported(srv->port))
> -		return -ENODEV;
> -
> -	ret = request_threaded_irq(srv->irq, pcie_bw_notification_irq,
> -				   pcie_bw_notification_handler,
> -				   IRQF_SHARED, "PCIe BW notif", srv);
> -	if (ret)
> -		return ret;
> -
> -	pcie_enable_link_bandwidth_notification(srv->port);
> -
> -	return 0;
> -}
> -
> -static void pcie_bandwidth_notification_remove(struct pcie_device *srv)
> -{
> -	pcie_disable_link_bandwidth_notification(srv->port);
> -	free_irq(srv->irq, srv);
> -}
> -
> -static struct pcie_port_service_driver pcie_bandwidth_notification_driver = {
> -	.name		= "pcie_bw_notification",
> -	.port_type	= PCIE_ANY_PORT,
> -	.service	= PCIE_PORT_SERVICE_BWNOTIF,
> -	.probe		= pcie_bandwidth_notification_probe,
> -	.remove		= pcie_bandwidth_notification_remove,
> -};
> -
> -int __init pcie_bandwidth_notification_init(void)
> -{
> -	return pcie_port_service_register(&pcie_bandwidth_notification_driver);
> -}
> diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
> index 1d50dc58ac40..fbbf00b0992e 100644
> --- a/drivers/pci/pcie/portdrv.h
> +++ b/drivers/pci/pcie/portdrv.h
> @@ -20,10 +20,8 @@
>  #define PCIE_PORT_SERVICE_HP		(1 << PCIE_PORT_SERVICE_HP_SHIFT)
>  #define PCIE_PORT_SERVICE_DPC_SHIFT	3	/* Downstream Port Containment */
>  #define PCIE_PORT_SERVICE_DPC		(1 << PCIE_PORT_SERVICE_DPC_SHIFT)
> -#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT	4	/* Bandwidth notification */
> -#define PCIE_PORT_SERVICE_BWNOTIF	(1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
>  
> -#define PCIE_PORT_DEVICE_MAXSERVICES   5
> +#define PCIE_PORT_DEVICE_MAXSERVICES   4
>  
>  #ifdef CONFIG_PCIEAER
>  int pcie_aer_init(void);
> @@ -49,8 +47,6 @@ int pcie_dpc_init(void);
>  static inline int pcie_dpc_init(void) { return 0; }
>  #endif
>  
> -int pcie_bandwidth_notification_init(void);
> -
>  /* Port Type */
>  #define PCIE_ANY_PORT			(~0)
>  
> diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
> index 7d04f9d087a6..f458ac9cb70c 100644
> --- a/drivers/pci/pcie/portdrv_core.c
> +++ b/drivers/pci/pcie/portdrv_core.c
> @@ -99,7 +99,7 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask,
>   */
>  static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
>  {
> -	int nr_entries, nvec, pcie_irq;
> +	int nr_entries, nvec;
>  	u32 pme = 0, aer = 0, dpc = 0;
>  
>  	/* Allocate the maximum possible number of MSI/MSI-X vectors */
> @@ -135,13 +135,10 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
>  			return nr_entries;
>  	}
>  
> -	/* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */
> -	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
> -		    PCIE_PORT_SERVICE_BWNOTIF)) {
> -		pcie_irq = pci_irq_vector(dev, pme);
> -		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
> -		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
> -		irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
> +	/* PME and hotplug share an MSI/MSI-X vector */
> +	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) {
> +		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pci_irq_vector(dev, pme);
> +		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pci_irq_vector(dev, pme);
>  	}
>  
>  	if (mask & PCIE_PORT_SERVICE_AER)
> @@ -253,10 +250,6 @@ static int get_port_device_capability(struct pci_dev *dev)
>  	    pci_aer_available() && services & PCIE_PORT_SERVICE_AER)
>  		services |= PCIE_PORT_SERVICE_DPC;
>  
> -	if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM ||
> -	    pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
> -		services |= PCIE_PORT_SERVICE_BWNOTIF;
> -
>  	return services;
>  }
>  
> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
> index 0a87091a0800..99d2abe88d0b 100644
> --- a/drivers/pci/pcie/portdrv_pci.c
> +++ b/drivers/pci/pcie/portdrv_pci.c
> @@ -240,7 +240,6 @@ static void __init pcie_init_services(void)
>  	pcie_pme_init();
>  	pcie_dpc_init();
>  	pcie_hp_init();
> -	pcie_bandwidth_notification_init();
>  }
>  
>  static int __init pcie_portdrv_init(void)
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 7e12d0163863..2ec0df04e0dc 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -2388,7 +2388,7 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
>  	return dev;
>  }
>  
> -void pcie_report_downtraining(struct pci_dev *dev)
> +static void pcie_report_downtraining(struct pci_dev *dev)
>  {
>  	if (!pci_is_pcie(dev))
>  		return;
Alex G. April 30, 2019, 1:07 a.m. UTC | #2
On 4/29/19 1:56 PM, Bjorn Helgaas wrote:
> From: Bjorn Helgaas <bhelgaas@google.com>
> 
> This reverts commit e8303bb7a75c113388badcc49b2a84b4121c1b3e.
> 
> e8303bb7a75c added logging whenever a link changed speed or width to a
> state that is considered degraded.  Unfortunately, it cannot differentiate
> signal integrity-related link changes from those intentionally initiated by
> an endpoint driver, including drivers that may live in userspace or VMs
> when making use of vfio-pci.  Some GPU drivers actively manage the link
> state to save power, which generates a stream of messages like this:
> 
>    vfio-pci 0000:07:00.0: 32.000 Gb/s available PCIe bandwidth, limited by 2.5 GT/s x16 link at 0000:00:02.0 (capable of 64.000 Gb/s with 5 GT/s x16 link)
> 
> We really *do* want to be alerted when the link bandwidth is reduced
> because of hardware failures, but degradation by intentional link state
> management is probably far more common, so the signal-to-noise ratio is
> currently low.
> 
> Until we figure out a way to identify the real problems or silence the
> intentional situations, revert the following commits, which include the
> initial implementation (e8303bb7a75c) and subsequent fixes:

I think we're overreacting to a bit of perceived verbosity in the system 
log. Intentional degradation does not seem to me to be as common as 
advertised. I have not observed this with either radeon, nouveau, or 
amdgpu, and the proper mechanism to save power at the link level is 
ASPM. I stand to be corrected and we have on CC some very knowledgeable 
fellows that I am certain will jump at the opportunity to do so.

What it seems like to me is that a proprietary driver running in a VM is 
initiating these changes. And if that is the case then it seems this is 
a virtualization problem. A quick glance over GPU drivers in linux did 
not reveal any obvious places where we intentionally downgrade a link.

I'm not convinced a revert is the best call.

Alex

>      e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth notification")
>      3e82a7f9031f ("PCI/LINK: Supply IRQ handler so level-triggered IRQs are acked")
>      55397ce8df48 ("PCI/LINK: Clear bandwidth notification interrupt before enabling it")
>      0fa635aec9ab ("PCI/LINK: Deduplicate bandwidth reports for multi-function devices")
> 
> Link: https://lore.kernel.org/lkml/155597243666.19387.1205950870601742062.stgit@gimli.home
> Link: https://lore.kernel.org/lkml/155605909349.3575.13433421148215616375.stgit@gimli.home
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> CC: Alexandru Gagniuc <mr.nuke.me@gmail.com>
> CC: Lukas Wunner <lukas@wunner.de>
> CC: Alex Williamson <alex.williamson@redhat.com>
> ---
>   drivers/pci/pci.h                  |   1 -
>   drivers/pci/pcie/Makefile          |   1 -
>   drivers/pci/pcie/bw_notification.c | 121 -----------------------------
>   drivers/pci/pcie/portdrv.h         |   6 +-
>   drivers/pci/pcie/portdrv_core.c    |  17 ++--
>   drivers/pci/pcie/portdrv_pci.c     |   1 -
>   drivers/pci/probe.c                |   2 +-
>   7 files changed, 7 insertions(+), 142 deletions(-)
>   delete mode 100644 drivers/pci/pcie/bw_notification.c
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index d994839a3e24..224d88634115 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -273,7 +273,6 @@ enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
>   u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed,
>   			   enum pcie_link_width *width);
>   void __pcie_print_link_status(struct pci_dev *dev, bool verbose);
> -void pcie_report_downtraining(struct pci_dev *dev);
>   
>   /* Single Root I/O Virtualization */
>   struct pci_sriov {
> diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
> index f1d7bc1e5efa..ab514083d5d4 100644
> --- a/drivers/pci/pcie/Makefile
> +++ b/drivers/pci/pcie/Makefile
> @@ -3,7 +3,6 @@
>   # Makefile for PCI Express features and port driver
>   
>   pcieportdrv-y			:= portdrv_core.o portdrv_pci.o err.o
> -pcieportdrv-y			+= bw_notification.o
>   
>   obj-$(CONFIG_PCIEPORTBUS)	+= pcieportdrv.o
>   
> diff --git a/drivers/pci/pcie/bw_notification.c b/drivers/pci/pcie/bw_notification.c
> deleted file mode 100644
> index 4fa9e3523ee1..000000000000
> --- a/drivers/pci/pcie/bw_notification.c
> +++ /dev/null
> @@ -1,121 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0+
> -/*
> - * PCI Express Link Bandwidth Notification services driver
> - * Author: Alexandru Gagniuc <mr.nuke.me@gmail.com>
> - *
> - * Copyright (C) 2019, Dell Inc
> - *
> - * The PCIe Link Bandwidth Notification provides a way to notify the
> - * operating system when the link width or data rate changes.  This
> - * capability is required for all root ports and downstream ports
> - * supporting links wider than x1 and/or multiple link speeds.
> - *
> - * This service port driver hooks into the bandwidth notification interrupt
> - * and warns when links become degraded in operation.
> - */
> -
> -#include "../pci.h"
> -#include "portdrv.h"
> -
> -static bool pcie_link_bandwidth_notification_supported(struct pci_dev *dev)
> -{
> -	int ret;
> -	u32 lnk_cap;
> -
> -	ret = pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnk_cap);
> -	return (ret == PCIBIOS_SUCCESSFUL) && (lnk_cap & PCI_EXP_LNKCAP_LBNC);
> -}
> -
> -static void pcie_enable_link_bandwidth_notification(struct pci_dev *dev)
> -{
> -	u16 lnk_ctl;
> -
> -	pcie_capability_write_word(dev, PCI_EXP_LNKSTA, PCI_EXP_LNKSTA_LBMS);
> -
> -	pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
> -	lnk_ctl |= PCI_EXP_LNKCTL_LBMIE;
> -	pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
> -}
> -
> -static void pcie_disable_link_bandwidth_notification(struct pci_dev *dev)
> -{
> -	u16 lnk_ctl;
> -
> -	pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
> -	lnk_ctl &= ~PCI_EXP_LNKCTL_LBMIE;
> -	pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
> -}
> -
> -static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
> -{
> -	struct pcie_device *srv = context;
> -	struct pci_dev *port = srv->port;
> -	u16 link_status, events;
> -	int ret;
> -
> -	ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
> -	events = link_status & PCI_EXP_LNKSTA_LBMS;
> -
> -	if (ret != PCIBIOS_SUCCESSFUL || !events)
> -		return IRQ_NONE;
> -
> -	pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
> -	pcie_update_link_speed(port->subordinate, link_status);
> -	return IRQ_WAKE_THREAD;
> -}
> -
> -static irqreturn_t pcie_bw_notification_handler(int irq, void *context)
> -{
> -	struct pcie_device *srv = context;
> -	struct pci_dev *port = srv->port;
> -	struct pci_dev *dev;
> -
> -	/*
> -	 * Print status from downstream devices, not this root port or
> -	 * downstream switch port.
> -	 */
> -	down_read(&pci_bus_sem);
> -	list_for_each_entry(dev, &port->subordinate->devices, bus_list)
> -		pcie_report_downtraining(dev);
> -	up_read(&pci_bus_sem);
> -
> -	return IRQ_HANDLED;
> -}
> -
> -static int pcie_bandwidth_notification_probe(struct pcie_device *srv)
> -{
> -	int ret;
> -
> -	/* Single-width or single-speed ports do not have to support this. */
> -	if (!pcie_link_bandwidth_notification_supported(srv->port))
> -		return -ENODEV;
> -
> -	ret = request_threaded_irq(srv->irq, pcie_bw_notification_irq,
> -				   pcie_bw_notification_handler,
> -				   IRQF_SHARED, "PCIe BW notif", srv);
> -	if (ret)
> -		return ret;
> -
> -	pcie_enable_link_bandwidth_notification(srv->port);
> -
> -	return 0;
> -}
> -
> -static void pcie_bandwidth_notification_remove(struct pcie_device *srv)
> -{
> -	pcie_disable_link_bandwidth_notification(srv->port);
> -	free_irq(srv->irq, srv);
> -}
> -
> -static struct pcie_port_service_driver pcie_bandwidth_notification_driver = {
> -	.name		= "pcie_bw_notification",
> -	.port_type	= PCIE_ANY_PORT,
> -	.service	= PCIE_PORT_SERVICE_BWNOTIF,
> -	.probe		= pcie_bandwidth_notification_probe,
> -	.remove		= pcie_bandwidth_notification_remove,
> -};
> -
> -int __init pcie_bandwidth_notification_init(void)
> -{
> -	return pcie_port_service_register(&pcie_bandwidth_notification_driver);
> -}
> diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
> index 1d50dc58ac40..fbbf00b0992e 100644
> --- a/drivers/pci/pcie/portdrv.h
> +++ b/drivers/pci/pcie/portdrv.h
> @@ -20,10 +20,8 @@
>   #define PCIE_PORT_SERVICE_HP		(1 << PCIE_PORT_SERVICE_HP_SHIFT)
>   #define PCIE_PORT_SERVICE_DPC_SHIFT	3	/* Downstream Port Containment */
>   #define PCIE_PORT_SERVICE_DPC		(1 << PCIE_PORT_SERVICE_DPC_SHIFT)
> -#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT	4	/* Bandwidth notification */
> -#define PCIE_PORT_SERVICE_BWNOTIF	(1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
>   
> -#define PCIE_PORT_DEVICE_MAXSERVICES   5
> +#define PCIE_PORT_DEVICE_MAXSERVICES   4
>   
>   #ifdef CONFIG_PCIEAER
>   int pcie_aer_init(void);
> @@ -49,8 +47,6 @@ int pcie_dpc_init(void);
>   static inline int pcie_dpc_init(void) { return 0; }
>   #endif
>   
> -int pcie_bandwidth_notification_init(void);
> -
>   /* Port Type */
>   #define PCIE_ANY_PORT			(~0)
>   
> diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
> index 7d04f9d087a6..f458ac9cb70c 100644
> --- a/drivers/pci/pcie/portdrv_core.c
> +++ b/drivers/pci/pcie/portdrv_core.c
> @@ -99,7 +99,7 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask,
>    */
>   static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
>   {
> -	int nr_entries, nvec, pcie_irq;
> +	int nr_entries, nvec;
>   	u32 pme = 0, aer = 0, dpc = 0;
>   
>   	/* Allocate the maximum possible number of MSI/MSI-X vectors */
> @@ -135,13 +135,10 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
>   			return nr_entries;
>   	}
>   
> -	/* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */
> -	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
> -		    PCIE_PORT_SERVICE_BWNOTIF)) {
> -		pcie_irq = pci_irq_vector(dev, pme);
> -		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
> -		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
> -		irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
> +	/* PME and hotplug share an MSI/MSI-X vector */
> +	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) {
> +		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pci_irq_vector(dev, pme);
> +		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pci_irq_vector(dev, pme);
>   	}
>   
>   	if (mask & PCIE_PORT_SERVICE_AER)
> @@ -253,10 +250,6 @@ static int get_port_device_capability(struct pci_dev *dev)
>   	    pci_aer_available() && services & PCIE_PORT_SERVICE_AER)
>   		services |= PCIE_PORT_SERVICE_DPC;
>   
> -	if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM ||
> -	    pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
> -		services |= PCIE_PORT_SERVICE_BWNOTIF;
> -
>   	return services;
>   }
>   
> diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
> index 0a87091a0800..99d2abe88d0b 100644
> --- a/drivers/pci/pcie/portdrv_pci.c
> +++ b/drivers/pci/pcie/portdrv_pci.c
> @@ -240,7 +240,6 @@ static void __init pcie_init_services(void)
>   	pcie_pme_init();
>   	pcie_dpc_init();
>   	pcie_hp_init();
> -	pcie_bandwidth_notification_init();
>   }
>   
>   static int __init pcie_portdrv_init(void)
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 7e12d0163863..2ec0df04e0dc 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -2388,7 +2388,7 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
>   	return dev;
>   }
>   
> -void pcie_report_downtraining(struct pci_dev *dev)
> +static void pcie_report_downtraining(struct pci_dev *dev)
>   {
>   	if (!pci_is_pcie(dev))
>   		return;
>
Bjorn Helgaas April 30, 2019, 4:11 p.m. UTC | #3
On Mon, Apr 29, 2019 at 08:07:53PM -0500, Alex G wrote:
> On 4/29/19 1:56 PM, Bjorn Helgaas wrote:
> > From: Bjorn Helgaas <bhelgaas@google.com>
> > 
> > This reverts commit e8303bb7a75c113388badcc49b2a84b4121c1b3e.
> > 
> > e8303bb7a75c added logging whenever a link changed speed or width to a
> > state that is considered degraded.  Unfortunately, it cannot differentiate
> > signal integrity-related link changes from those intentionally initiated by
> > an endpoint driver, including drivers that may live in userspace or VMs
> > when making use of vfio-pci.  Some GPU drivers actively manage the link
> > state to save power, which generates a stream of messages like this:
> > 
> >    vfio-pci 0000:07:00.0: 32.000 Gb/s available PCIe bandwidth, limited by 2.5 GT/s x16 link at 0000:00:02.0 (capable of 64.000 Gb/s with 5 GT/s x16 link)
> > 
> > We really *do* want to be alerted when the link bandwidth is reduced
> > because of hardware failures, but degradation by intentional link state
> > management is probably far more common, so the signal-to-noise ratio is
> > currently low.
> > 
> > Until we figure out a way to identify the real problems or silence the
> > intentional situations, revert the following commits, which include the
> > initial implementation (e8303bb7a75c) and subsequent fixes:
> 
> I think we're overreacting to a bit of perceived verbosity in the system
> log. Intentional degradation does not seem to me to be as common as
> advertised. I have not observed this with either radeon, nouveau, or amdgpu,
> and the proper mechanism to save power at the link level is ASPM. I stand to
> be corrected and we have on CC some very knowledgeable fellows that I am
> certain will jump at the opportunity to do so.

I can't quantify how common it is, but the verbosity is definitely
*there*, and it seems unlikely to me that a hardware failure is more
common than any intentional driver-driven degradation.

If we can reliably distinguish hardware failures from benign changes,
we should certainly log the failures.  But in this case even the
failures are fully functional, albeit at lower performance, so if the
messages end up being 99% false positives, I think it'll just be
confusing for users.

> What it seems like to me is that a proprietary driver running in a VM is
> initiating these changes. And if that is the case then it seems this is a
> virtualization problem. A quick glance over GPU drivers in linux did not
> reveal any obvious places where we intentionally downgrade a link.

I'm not 100% on board with the idea of drivers directly manipulating
the link because it seems like the PCI core might need to be at least
aware of this.  But some drivers certainly do manipulate it today for
ASPM, gen2/gen3 retraining, etc.

If we treat this as a virtualization problem, I guess you're
suggesting the host kernel should prevent that sort of link
manipulation?  We could have a conversation about that, but it
doesn't seem like the immediate solution to this problem.

> I'm not convinced a revert is the best call.

I have very limited options at this stage of the release, but I'd be
glad to hear suggestions.  My concern is that if we release v5.1
as-is, we'll spend a lot of energy on those false positives.

Bjorn
Keith Busch April 30, 2019, 6:05 p.m. UTC | #4
On Tue, Apr 30, 2019 at 11:11:51AM -0500, Bjorn Helgaas wrote:
> > I'm not convinced a revert is the best call.
> 
> I have very limited options at this stage of the release, but I'd be
> glad to hear suggestions.  My concern is that if we release v5.1
> as-is, we'll spend a lot of energy on those false positives.

May be too late now if the revert is queued up, but I think this feature
should have been a default 'false' Kconfig bool rather than always on.
Keith Busch April 30, 2019, 6:18 p.m. UTC | #5
On Tue, Apr 30, 2019 at 12:05:09PM -0600, Keith Busch wrote:
> On Tue, Apr 30, 2019 at 11:11:51AM -0500, Bjorn Helgaas wrote:
> > > I'm not convinced a revert is the best call.
> > 
> > I have very limited options at this stage of the release, but I'd be
> > glad to hear suggestions.  My concern is that if we release v5.1
> > as-is, we'll spend a lot of energy on those false positives.
> 
> May be too late now if the revert is queued up, but I think this feature
> should have been a default 'false' Kconfig bool rather than always on.

This is what I mean:

---
diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
index 5cbdbca904ac..7f480685df93 100644
--- a/drivers/pci/pcie/Kconfig
+++ b/drivers/pci/pcie/Kconfig
@@ -142,3 +142,12 @@ config PCIE_PTM
 
 	  This is only useful if you have devices that support PTM, but it
 	  is safe to enable even if you don't.
+
+config PCIE_BW
+	bool "PCI Express Bandwidth Change Notification"
+	default n
+	depends on PCIEPORTBUS
+	help
+	  This enables PCI Express Bandwidth Change Notification. If
+	  you know link width or rate changes occur only to correct
+	  unreliable links, you may answer Y.
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index f1d7bc1e5efa..d356a5bdb158 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -3,7 +3,6 @@
 # Makefile for PCI Express features and port driver
 
 pcieportdrv-y			:= portdrv_core.o portdrv_pci.o err.o
-pcieportdrv-y			+= bw_notification.o
 
 obj-$(CONFIG_PCIEPORTBUS)	+= pcieportdrv.o
 
@@ -13,3 +12,4 @@ obj-$(CONFIG_PCIEAER_INJECT)	+= aer_inject.o
 obj-$(CONFIG_PCIE_PME)		+= pme.o
 obj-$(CONFIG_PCIE_DPC)		+= dpc.o
 obj-$(CONFIG_PCIE_PTM)		+= ptm.o
+obj-$(CONFIG_PCIE_BW)		:= bw_notification.o
--
Lukas Wunner April 30, 2019, 11:09 p.m. UTC | #6
On Tue, Apr 30, 2019 at 12:05:09PM -0600, Keith Busch wrote:
> On Tue, Apr 30, 2019 at 11:11:51AM -0500, Bjorn Helgaas wrote:
> > > I'm not convinced a revert is the best call.
> > 
> > I have very limited options at this stage of the release, but I'd be
> > glad to hear suggestions.  My concern is that if we release v5.1
> > as-is, we'll spend a lot of energy on those false positives.
> 
> May be too late now if the revert is queued up, but I think this feature
> should have been a default 'false' Kconfig bool rather than always on.

Good idea, this would seem to be a less harsh solution than a revert.

Enabling the feature by default for everyone was probably overly confident.
I recall I did bring up in a review comment that all other port services
have a Kconfig option.  Alex replied that he's not using one because
on device enumeration, downtraining is checked unconditionally as well.

Bandwidth notification might be a feature that's not used by many operating
systems.  Such features don't get much real world exposure or aren't even
validated by manufacturers.  Inevitably, unpleasant side effects occur
once Linux supports them.

However if we keep the code and default to "N" in Kconfig, at least people
get a chance to test and validate the functionality and hopefully this
will lead to either better hardware or better driver support in the future.

Thanks,

Lukas
Bjorn Helgaas May 1, 2019, 2:12 a.m. UTC | #7
On Tue, Apr 30, 2019 at 12:18:13PM -0600, Keith Busch wrote:
> On Tue, Apr 30, 2019 at 12:05:09PM -0600, Keith Busch wrote:
> > On Tue, Apr 30, 2019 at 11:11:51AM -0500, Bjorn Helgaas wrote:
> > > > I'm not convinced a revert is the best call.
> > > 
> > > I have very limited options at this stage of the release, but I'd be
> > > glad to hear suggestions.  My concern is that if we release v5.1
> > > as-is, we'll spend a lot of energy on those false positives.
> > 
> > May be too late now if the revert is queued up, but I think this feature
> > should have been a default 'false' Kconfig bool rather than always on.

Since this feature currently just adds a message in dmesg, which we
don't really consider a stable API, I think a Kconfig switch is a
reasonable option.

If you send me a signed-off-by for the following patch, I can apply it:

commit 302b77157e66
Author: Keith Busch <kbusch@kernel.org>
Date:   Tue Apr 30 12:18:13 2019 -0600

    PCI/LINK: Add Kconfig option (default off)
    
    e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth
    notification") added dmesg logging whenever a link changes speed or width
    to a state that is considered degraded.  Unfortunately, it cannot
    differentiate signal integrity-related link changes from those
    intentionally initiated by an endpoint driver, including drivers that may
    live in userspace or VMs when making use of vfio-pci.  Some GPU drivers
    actively manage the link state to save power, which generates a stream of
    messages like this:
    
      vfio-pci 0000:07:00.0: 32.000 Gb/s available PCIe bandwidth, limited by 2.5 GT/s x16 link at 0000:00:02.0 (capable of 64.000 Gb/s with 5 GT/s x16 link)
    
    Since we can't distinguish the intentional changes from the signal
    integrity issues, leave the reporting turned off by default.  Add a Kconfig
    option to turn it on if desired.
    
    Fixes: e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth
    notification")

diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
index 5cbdbca904ac..4a094f0d2856 100644
--- a/drivers/pci/pcie/Kconfig
+++ b/drivers/pci/pcie/Kconfig
@@ -142,3 +142,12 @@ config PCIE_PTM
 
 	  This is only useful if you have devices that support PTM, but it
 	  is safe to enable even if you don't.
+
+config PCIE_BW
+	bool "PCI Express Bandwidth Change Notification"
+	default n
+	depends on PCIEPORTBUS
+	help
+	  This enables PCI Express Bandwidth Change Notification.  If
+	  you know link width or rate changes occur only to correct
+	  unreliable links, you may answer Y.
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index f1d7bc1e5efa..d356a5bdb158 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -3,7 +3,6 @@
 # Makefile for PCI Express features and port driver
 
 pcieportdrv-y			:= portdrv_core.o portdrv_pci.o err.o
-pcieportdrv-y			+= bw_notification.o
 
 obj-$(CONFIG_PCIEPORTBUS)	+= pcieportdrv.o
 
@@ -13,3 +12,4 @@ obj-$(CONFIG_PCIEAER_INJECT)	+= aer_inject.o
 obj-$(CONFIG_PCIE_PME)		+= pme.o
 obj-$(CONFIG_PCIE_DPC)		+= dpc.o
 obj-$(CONFIG_PCIE_PTM)		+= ptm.o
+obj-$(CONFIG_PCIE_BW)		:= bw_notification.o
Keith Busch May 1, 2019, 1:16 p.m. UTC | #8
On Tue, Apr 30, 2019 at 09:12:49PM -0500, Bjorn Helgaas wrote:
> On Tue, Apr 30, 2019 at 12:18:13PM -0600, Keith Busch wrote:
> > On Tue, Apr 30, 2019 at 12:05:09PM -0600, Keith Busch wrote:
> > > On Tue, Apr 30, 2019 at 11:11:51AM -0500, Bjorn Helgaas wrote:
> > > > > I'm not convinced a revert is the best call.
> > > > 
> > > > I have very limited options at this stage of the release, but I'd be
> > > > glad to hear suggestions.  My concern is that if we release v5.1
> > > > as-is, we'll spend a lot of energy on those false positives.
> > > 
> > > May be too late now if the revert is queued up, but I think this feature
> > > should have been a default 'false' Kconfig bool rather than always on.
> 
> Since this feature currently just adds a message in dmesg, which we
> don't really consider a stable API, I think a Kconfig switch is a
> reasonable option.
> 
> If you send me a signed-off-by for the following patch, I can apply it:

Sounds good, I'll need to resend though since I messed up the Makefile:

> +obj-$(CONFIG_PCIE_BW)		:= bw_notification.o

    s/:=/+=
diff mbox series

Patch

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index d994839a3e24..224d88634115 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -273,7 +273,6 @@  enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
 u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed,
 			   enum pcie_link_width *width);
 void __pcie_print_link_status(struct pci_dev *dev, bool verbose);
-void pcie_report_downtraining(struct pci_dev *dev);
 
 /* Single Root I/O Virtualization */
 struct pci_sriov {
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index f1d7bc1e5efa..ab514083d5d4 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -3,7 +3,6 @@ 
 # Makefile for PCI Express features and port driver
 
 pcieportdrv-y			:= portdrv_core.o portdrv_pci.o err.o
-pcieportdrv-y			+= bw_notification.o
 
 obj-$(CONFIG_PCIEPORTBUS)	+= pcieportdrv.o
 
diff --git a/drivers/pci/pcie/bw_notification.c b/drivers/pci/pcie/bw_notification.c
deleted file mode 100644
index 4fa9e3523ee1..000000000000
--- a/drivers/pci/pcie/bw_notification.c
+++ /dev/null
@@ -1,121 +0,0 @@ 
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * PCI Express Link Bandwidth Notification services driver
- * Author: Alexandru Gagniuc <mr.nuke.me@gmail.com>
- *
- * Copyright (C) 2019, Dell Inc
- *
- * The PCIe Link Bandwidth Notification provides a way to notify the
- * operating system when the link width or data rate changes.  This
- * capability is required for all root ports and downstream ports
- * supporting links wider than x1 and/or multiple link speeds.
- *
- * This service port driver hooks into the bandwidth notification interrupt
- * and warns when links become degraded in operation.
- */
-
-#include "../pci.h"
-#include "portdrv.h"
-
-static bool pcie_link_bandwidth_notification_supported(struct pci_dev *dev)
-{
-	int ret;
-	u32 lnk_cap;
-
-	ret = pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnk_cap);
-	return (ret == PCIBIOS_SUCCESSFUL) && (lnk_cap & PCI_EXP_LNKCAP_LBNC);
-}
-
-static void pcie_enable_link_bandwidth_notification(struct pci_dev *dev)
-{
-	u16 lnk_ctl;
-
-	pcie_capability_write_word(dev, PCI_EXP_LNKSTA, PCI_EXP_LNKSTA_LBMS);
-
-	pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
-	lnk_ctl |= PCI_EXP_LNKCTL_LBMIE;
-	pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
-}
-
-static void pcie_disable_link_bandwidth_notification(struct pci_dev *dev)
-{
-	u16 lnk_ctl;
-
-	pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
-	lnk_ctl &= ~PCI_EXP_LNKCTL_LBMIE;
-	pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
-}
-
-static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
-{
-	struct pcie_device *srv = context;
-	struct pci_dev *port = srv->port;
-	u16 link_status, events;
-	int ret;
-
-	ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
-	events = link_status & PCI_EXP_LNKSTA_LBMS;
-
-	if (ret != PCIBIOS_SUCCESSFUL || !events)
-		return IRQ_NONE;
-
-	pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
-	pcie_update_link_speed(port->subordinate, link_status);
-	return IRQ_WAKE_THREAD;
-}
-
-static irqreturn_t pcie_bw_notification_handler(int irq, void *context)
-{
-	struct pcie_device *srv = context;
-	struct pci_dev *port = srv->port;
-	struct pci_dev *dev;
-
-	/*
-	 * Print status from downstream devices, not this root port or
-	 * downstream switch port.
-	 */
-	down_read(&pci_bus_sem);
-	list_for_each_entry(dev, &port->subordinate->devices, bus_list)
-		pcie_report_downtraining(dev);
-	up_read(&pci_bus_sem);
-
-	return IRQ_HANDLED;
-}
-
-static int pcie_bandwidth_notification_probe(struct pcie_device *srv)
-{
-	int ret;
-
-	/* Single-width or single-speed ports do not have to support this. */
-	if (!pcie_link_bandwidth_notification_supported(srv->port))
-		return -ENODEV;
-
-	ret = request_threaded_irq(srv->irq, pcie_bw_notification_irq,
-				   pcie_bw_notification_handler,
-				   IRQF_SHARED, "PCIe BW notif", srv);
-	if (ret)
-		return ret;
-
-	pcie_enable_link_bandwidth_notification(srv->port);
-
-	return 0;
-}
-
-static void pcie_bandwidth_notification_remove(struct pcie_device *srv)
-{
-	pcie_disable_link_bandwidth_notification(srv->port);
-	free_irq(srv->irq, srv);
-}
-
-static struct pcie_port_service_driver pcie_bandwidth_notification_driver = {
-	.name		= "pcie_bw_notification",
-	.port_type	= PCIE_ANY_PORT,
-	.service	= PCIE_PORT_SERVICE_BWNOTIF,
-	.probe		= pcie_bandwidth_notification_probe,
-	.remove		= pcie_bandwidth_notification_remove,
-};
-
-int __init pcie_bandwidth_notification_init(void)
-{
-	return pcie_port_service_register(&pcie_bandwidth_notification_driver);
-}
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 1d50dc58ac40..fbbf00b0992e 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -20,10 +20,8 @@ 
 #define PCIE_PORT_SERVICE_HP		(1 << PCIE_PORT_SERVICE_HP_SHIFT)
 #define PCIE_PORT_SERVICE_DPC_SHIFT	3	/* Downstream Port Containment */
 #define PCIE_PORT_SERVICE_DPC		(1 << PCIE_PORT_SERVICE_DPC_SHIFT)
-#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT	4	/* Bandwidth notification */
-#define PCIE_PORT_SERVICE_BWNOTIF	(1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
 
-#define PCIE_PORT_DEVICE_MAXSERVICES   5
+#define PCIE_PORT_DEVICE_MAXSERVICES   4
 
 #ifdef CONFIG_PCIEAER
 int pcie_aer_init(void);
@@ -49,8 +47,6 @@  int pcie_dpc_init(void);
 static inline int pcie_dpc_init(void) { return 0; }
 #endif
 
-int pcie_bandwidth_notification_init(void);
-
 /* Port Type */
 #define PCIE_ANY_PORT			(~0)
 
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 7d04f9d087a6..f458ac9cb70c 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -99,7 +99,7 @@  static int pcie_message_numbers(struct pci_dev *dev, int mask,
  */
 static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
 {
-	int nr_entries, nvec, pcie_irq;
+	int nr_entries, nvec;
 	u32 pme = 0, aer = 0, dpc = 0;
 
 	/* Allocate the maximum possible number of MSI/MSI-X vectors */
@@ -135,13 +135,10 @@  static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
 			return nr_entries;
 	}
 
-	/* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */
-	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
-		    PCIE_PORT_SERVICE_BWNOTIF)) {
-		pcie_irq = pci_irq_vector(dev, pme);
-		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
-		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
-		irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
+	/* PME and hotplug share an MSI/MSI-X vector */
+	if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) {
+		irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pci_irq_vector(dev, pme);
+		irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pci_irq_vector(dev, pme);
 	}
 
 	if (mask & PCIE_PORT_SERVICE_AER)
@@ -253,10 +250,6 @@  static int get_port_device_capability(struct pci_dev *dev)
 	    pci_aer_available() && services & PCIE_PORT_SERVICE_AER)
 		services |= PCIE_PORT_SERVICE_DPC;
 
-	if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM ||
-	    pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
-		services |= PCIE_PORT_SERVICE_BWNOTIF;
-
 	return services;
 }
 
diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 0a87091a0800..99d2abe88d0b 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -240,7 +240,6 @@  static void __init pcie_init_services(void)
 	pcie_pme_init();
 	pcie_dpc_init();
 	pcie_hp_init();
-	pcie_bandwidth_notification_init();
 }
 
 static int __init pcie_portdrv_init(void)
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 7e12d0163863..2ec0df04e0dc 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2388,7 +2388,7 @@  static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
 	return dev;
 }
 
-void pcie_report_downtraining(struct pci_dev *dev)
+static void pcie_report_downtraining(struct pci_dev *dev)
 {
 	if (!pci_is_pcie(dev))
 		return;