diff mbox series

PCI: Check for PCIe downtraining conditions

Message ID 20180531150535.9684-1-mr.nuke.me@gmail.com
State Superseded
Headers show
Series PCI: Check for PCIe downtraining conditions | expand

Commit Message

Alex G. May 31, 2018, 3:05 p.m. UTC
PCIe downtraining happens when both the device and PCIe port are
capable of a larger bus width or higher speed than negotiated.
Downtraining might be indicative of other problems in the system, and
identifying this from userspace is neither intuitive, nor
straightforward.
Instead, check for such conditions on device probe, and print an
appropriate message.

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
---
 drivers/pci/probe.c           | 78 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/pci_regs.h |  1 +
 2 files changed, 79 insertions(+)

Comments

Sinan Kaya May 31, 2018, 3:28 p.m. UTC | #1
On 5/31/2018 11:05 AM, Alexandru Gagniuc wrote:
> +static void pcie_max_link_cap(struct pci_dev *dev, enum pci_bus_speed *speed,
> +				enum pcie_link_width *width)
> +{
> +	uint32_t lnkcap;
> +
> +	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
> +
> +	*speed = pcie_link_speed[lnkcap & PCI_EXP_LNKCAP_SLS];
> +	*width = (lnkcap & PCI_EXP_LNKCAP_MLW) >> PCI_EXP_LNKCAP_MLW_SHIFT;
> +}
> +
> +static void pcie_cur_link_sta(struct pci_dev *dev, enum pci_bus_speed *speed,
> +				enum pcie_link_width *width)
> +{
> +	uint16_t lnksta;
> +
> +	pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
> +	*speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS];
> +	*width = (lnksta & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
> +}
> +
> +static const char *pcie_bus_speed_name(enum pci_bus_speed speed)
> +{
> +	switch (speed) {
> +	case PCIE_SPEED_2_5GT:
> +		return "2.5 GT/s";
> +	case PCIE_SPEED_5_0GT:
> +		return "5.0 GT/s";
> +	case PCIE_SPEED_8_0GT:
> +		return "8.0 GT/s";
> +	default:
> +		return "unknown";
> +	}
> +}

I thought Bjorn added some functions to retrieve this now.
Alex_Gagniuc@Dellteam.com May 31, 2018, 3:29 p.m. UTC | #2
On 5/31/2018 10:28 AM, Sinan Kaya wrote:
> On 5/31/2018 11:05 AM, Alexandru Gagniuc wrote:
>> +static void pcie_max_link_cap(struct pci_dev *dev, enum pci_bus_speed *speed,
>> +				enum pcie_link_width *width)
>> +{
>> +	uint32_t lnkcap;
>> +
>> +	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
>> +
>> +	*speed = pcie_link_speed[lnkcap & PCI_EXP_LNKCAP_SLS];
>> +	*width = (lnkcap & PCI_EXP_LNKCAP_MLW) >> PCI_EXP_LNKCAP_MLW_SHIFT;
>> +}
>> +
>> +static void pcie_cur_link_sta(struct pci_dev *dev, enum pci_bus_speed *speed,
>> +				enum pcie_link_width *width)
>> +{
>> +	uint16_t lnksta;
>> +
>> +	pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
>> +	*speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS];
>> +	*width = (lnksta & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
>> +}
>> +
>> +static const char *pcie_bus_speed_name(enum pci_bus_speed speed)
>> +{
>> +	switch (speed) {
>> +	case PCIE_SPEED_2_5GT:
>> +		return "2.5 GT/s";
>> +	case PCIE_SPEED_5_0GT:
>> +		return "5.0 GT/s";
>> +	case PCIE_SPEED_8_0GT:
>> +		return "8.0 GT/s";
>> +	default:
>> +		return "unknown";
>> +	}
>> +}
> 
> I thought Bjorn added some functions to retrieve this now.

Hmm. I couldn't find them.

Alex
Sinan Kaya May 31, 2018, 3:30 p.m. UTC | #3
On 5/31/2018 11:05 AM, Alexandru Gagniuc wrote:
> +	if (dev_cur_speed < max_link_speed)
> +		pci_warn(dev, "PCIe downtrain: link speed is %s (%s capable)",
> +			 pcie_bus_speed_name(dev_cur_speed),
> +			 pcie_bus_speed_name(max_link_speed));
> +

Also this isn't quite correct. Target link speed is what the device tries to
train. A device can try to train to much lower speed than the maximum on purpose.

It makes sense to print this if the speed that platform wants via target link
speed is different from what is actually established though.
Sinan Kaya May 31, 2018, 3:38 p.m. UTC | #4
On 5/31/2018 11:29 AM, Alex_Gagniuc@Dellteam.com wrote:
> On 5/31/2018 10:28 AM, Sinan Kaya wrote:
>> On 5/31/2018 11:05 AM, Alexandru Gagniuc wrote:
>>> +static void pcie_max_link_cap(struct pci_dev *dev, enum pci_bus_speed *speed,
>>> +				enum pcie_link_width *width)
>>> +{
>>> +	uint32_t lnkcap;
>>> +
>>> +	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
>>> +
>>> +	*speed = pcie_link_speed[lnkcap & PCI_EXP_LNKCAP_SLS];
>>> +	*width = (lnkcap & PCI_EXP_LNKCAP_MLW) >> PCI_EXP_LNKCAP_MLW_SHIFT;
>>> +}
>>> +
>>> +static void pcie_cur_link_sta(struct pci_dev *dev, enum pci_bus_speed *speed,
>>> +				enum pcie_link_width *width)
>>> +{
>>> +	uint16_t lnksta;
>>> +
>>> +	pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
>>> +	*speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS];
>>> +	*width = (lnksta & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
>>> +}
>>> +
>>> +static const char *pcie_bus_speed_name(enum pci_bus_speed speed)
>>> +{
>>> +	switch (speed) {
>>> +	case PCIE_SPEED_2_5GT:
>>> +		return "2.5 GT/s";
>>> +	case PCIE_SPEED_5_0GT:
>>> +		return "5.0 GT/s";
>>> +	case PCIE_SPEED_8_0GT:
>>> +		return "8.0 GT/s";
>>> +	default:
>>> +		return "unknown";
>>> +	}
>>> +}
>>
>> I thought Bjorn added some functions to retrieve this now.
> 
> Hmm. I couldn't find them.

https://lkml.org/lkml/2018/3/30/553

are you working on linux-next?

> 
> Alex
> 
>
Alex G. May 31, 2018, 3:46 p.m. UTC | #5
On 05/31/2018 10:38 AM, Sinan Kaya wrote:
> On 5/31/2018 11:29 AM, Alex_Gagniuc@Dellteam.com wrote:
>> On 5/31/2018 10:28 AM, Sinan Kaya wrote:
>>> On 5/31/2018 11:05 AM, Alexandru Gagniuc wrote:
>>>> +static void pcie_max_link_cap(struct pci_dev *dev, enum pci_bus_speed *speed,
>>>> +				enum pcie_link_width *width)
>>>> +{
>>>> +	uint32_t lnkcap;
>>>> +
>>>> +	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
>>>> +
>>>> +	*speed = pcie_link_speed[lnkcap & PCI_EXP_LNKCAP_SLS];
>>>> +	*width = (lnkcap & PCI_EXP_LNKCAP_MLW) >> PCI_EXP_LNKCAP_MLW_SHIFT;
>>>> +}
>>>> +
>>>> +static void pcie_cur_link_sta(struct pci_dev *dev, enum pci_bus_speed *speed,
>>>> +				enum pcie_link_width *width)
>>>> +{
>>>> +	uint16_t lnksta;
>>>> +
>>>> +	pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
>>>> +	*speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS];
>>>> +	*width = (lnksta & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
>>>> +}
>>>> +
>>>> +static const char *pcie_bus_speed_name(enum pci_bus_speed speed)
>>>> +{
>>>> +	switch (speed) {
>>>> +	case PCIE_SPEED_2_5GT:
>>>> +		return "2.5 GT/s";
>>>> +	case PCIE_SPEED_5_0GT:
>>>> +		return "5.0 GT/s";
>>>> +	case PCIE_SPEED_8_0GT:
>>>> +		return "8.0 GT/s";
>>>> +	default:
>>>> +		return "unknown";
>>>> +	}
>>>> +}
>>>
>>> I thought Bjorn added some functions to retrieve this now.
>>
>> Hmm. I couldn't find them.
> 
> https://lkml.org/lkml/2018/3/30/553

Oh, pcie_get_speed_cap()/pcie_get_width_cap() seems to handle the
capability. Not seeing one for status and speed name.

> are you working on linux-next?

v4.17-rc7

>>
>> Alex
>>
>>
> 
>
Sinan Kaya May 31, 2018, 3:54 p.m. UTC | #6
On 5/31/2018 11:46 AM, Alex G. wrote:
>> https://lkml.org/lkml/2018/3/30/553
> Oh, pcie_get_speed_cap()/pcie_get_width_cap() seems to handle the
> capability. Not seeing one for status and speed name.
> 
>> are you working on linux-next?
> v4.17-rc7
> 

I think everything you need is in the series. I don't know which linux
tree this landed or if it landed. Probably, it is shipping for 4.18. 
Need some help from Bjorn where to locate these.

Fri, 30 Mar 2018 16:04:40 -0500

Bjorn Helgaas (6):
      bnx2x: Report PCIe link properties with pcie_print_link_status()
      bnxt_en: Report PCIe link properties with pcie_print_link_status()
      cxgb4: Report PCIe link properties with pcie_print_link_status()
      fm10k: Report PCIe link properties with pcie_print_link_status()
      ixgbe: Report PCIe link properties with pcie_print_link_status()
      PCI: Remove unused pcie_get_minimum_link()

Tal Gilboa (8):
      PCI: Add pcie_get_speed_cap() to find max supported link speed
      PCI: Add pcie_get_width_cap() to find max supported link width
      PCI: Add pcie_bandwidth_capable() to compute max supported link bandwidth
      PCI: Add pcie_bandwidth_available() to compute bandwidth available to device
      PCI: Add pcie_print_link_status() to log link speed and whether it's limited
Alex G. May 31, 2018, 4:01 p.m. UTC | #7
On 05/31/2018 10:54 AM, Sinan Kaya wrote:
> On 5/31/2018 11:46 AM, Alex G. wrote:
>>> https://lkml.org/lkml/2018/3/30/553
>> Oh, pcie_get_speed_cap()/pcie_get_width_cap() seems to handle the
>> capability. Not seeing one for status and speed name.
>>
>>> are you working on linux-next?
>> v4.17-rc7
>>
> 
> I think everything you need is in the series. I don't know which linux
> tree this landed or if it landed. Probably, it is shipping for 4.18. 
> Need some help from Bjorn where to locate these.
> 
> Fri, 30 Mar 2018 16:04:40 -0500
> 
> Bjorn Helgaas (6):
>       bnx2x: Report PCIe link properties with pcie_print_link_status()
>       bnxt_en: Report PCIe link properties with pcie_print_link_status()
>       cxgb4: Report PCIe link properties with pcie_print_link_status()
>       fm10k: Report PCIe link properties with pcie_print_link_status()
>       ixgbe: Report PCIe link properties with pcie_print_link_status()
>       PCI: Remove unused pcie_get_minimum_link()

I remember seeing some of these in my tree.

> Tal Gilboa (8):
>       PCI: Add pcie_get_speed_cap() to find max supported link speed
>       PCI: Add pcie_get_width_cap() to find max supported link width
>       PCI: Add pcie_bandwidth_capable() to compute max supported link bandwidth
>       PCI: Add pcie_bandwidth_available() to compute bandwidth available to device

I definitely have these.

>       PCI: Add pcie_print_link_status() to log link speed and whether it's limited

This one, I have, but it's not what I need. This looks at the available
bandwidth from root port to endpoint, whereas I'm only interested in
downtraining between endpoint and upstream port.

Alex
Sinan Kaya May 31, 2018, 4:13 p.m. UTC | #8
On 5/31/2018 12:01 PM, Alex G. wrote:
>>       PCI: Add pcie_print_link_status() to log link speed and whether it's limited
> This one, I have, but it's not what I need. This looks at the available
> bandwidth from root port to endpoint, whereas I'm only interested in
> downtraining between endpoint and upstream port.

I see what you are saying. 

With a little bit of effort, you can reuse the same code.

Here is an attempt.

You can probably extend pcie_bandwidth_available() to put an optional parent bridge
device for your own use case and terminate the loop around here.

https://elixir.bootlin.com/linux/v4.17-rc7/source/drivers/pci/pci.c#L5182

Then, you can use the existing code to achieve what you are looking for via
pcie_print_link_status() by adding an optional parent parameter.

	bw_cap = pcie_bandwidth_capable(dev, &speed_cap, &width_cap);
	bw_avail = pcie_bandwidth_available(dev, &limiting_dev, &speed, &width, *parent*);


If parent parameter is NULL, code can walk all the way to root as it is doing today.
If it is not, then will terminate the loop on the first iteration.
Alex G. May 31, 2018, 4:49 p.m. UTC | #9
On 05/31/2018 11:13 AM, Sinan Kaya wrote:
> On 5/31/2018 12:01 PM, Alex G. wrote:
>>>       PCI: Add pcie_print_link_status() to log link speed and whether it's limited
>> This one, I have, but it's not what I need. This looks at the available
>> bandwidth from root port to endpoint, whereas I'm only interested in
>> downtraining between endpoint and upstream port.
> 
> I see what you are saying. 
> 
> With a little bit of effort, you can reuse the same code.
> 
> Here is an attempt.
> 
> You can probably extend pcie_bandwidth_available() to put an optional parent bridge
> device for your own use case and terminate the loop around here.
> 
> https://elixir.bootlin.com/linux/v4.17-rc7/source/drivers/pci/pci.c#L5182
> 
> Then, you can use the existing code to achieve what you are looking for via
> pcie_print_link_status() by adding an optional parent parameter.
> 
> 	bw_cap = pcie_bandwidth_capable(dev, &speed_cap, &width_cap);
> 	bw_avail = pcie_bandwidth_available(dev, &limiting_dev, &speed, &width, *parent*);

That's confusing. I'd expect _capable() and _available() to be
symmetrical. They either both look at one link only, or both go down to
the root port. Though it seems _capable() is link-local, and
_available() is down to root port.

> 
> If parent parameter is NULL, code can walk all the way to root as it is doing today.
> If it is not, then will terminate the loop on the first iteration.
>
Alex G. May 31, 2018, 4:50 p.m. UTC | #10
On 05/31/2018 11:49 AM, Alex G. wrote:
> 
> 
> On 05/31/2018 11:13 AM, Sinan Kaya wrote:
>> On 5/31/2018 12:01 PM, Alex G. wrote:
>>>>       PCI: Add pcie_print_link_status() to log link speed and whether it's limited
>>> This one, I have, but it's not what I need. This looks at the available
>>> bandwidth from root port to endpoint, whereas I'm only interested in
>>> downtraining between endpoint and upstream port.
>>
>> I see what you are saying. 
>>
>> With a little bit of effort, you can reuse the same code.
>>
>> Here is an attempt.
>>
>> You can probably extend pcie_bandwidth_available() to put an optional parent bridge
>> device for your own use case and terminate the loop around here.
>>
>> https://elixir.bootlin.com/linux/v4.17-rc7/source/drivers/pci/pci.c#L5182
>>
>> Then, you can use the existing code to achieve what you are looking for via
>> pcie_print_link_status() by adding an optional parent parameter.
>>
>> 	bw_cap = pcie_bandwidth_capable(dev, &speed_cap, &width_cap);
>> 	bw_avail = pcie_bandwidth_available(dev, &limiting_dev, &speed, &width, *parent*);
> 
> That's confusing.

"confusing" refers to the way the code currently works. It doesn't refer
to your proposal.


Alex

 I'd expect _capable() and _available() to be
> symmetrical. They either both look at one link only, or both go down to
> the root port. Though it seems _capable() is link-local, and
> _available() is down to root port.
> 
>>
>> If parent parameter is NULL, code can walk all the way to root as it is doing today.
>> If it is not, then will terminate the loop on the first iteration.
>>
Sinan Kaya May 31, 2018, 5:11 p.m. UTC | #11
On 5/31/2018 12:49 PM, Alex G. wrote:
>> 	bw_cap = pcie_bandwidth_capable(dev, &speed_cap, &width_cap);
>> 	bw_avail = pcie_bandwidth_available(dev, &limiting_dev, &speed, &width, *parent*);
> That's confusing. I'd expect _capable() and _available() to be
> symmetrical. They either both look at one link only, or both go down to
> the root port. Though it seems _capable() is link-local, and
> _available() is down to root port.
> 

As you know, link speed is a qualification of two devices speed capability.
Both speed and width parameters get negotiated by two devices during TS1 and TS2
ordered set exchange. 

You need to see what your link partner can support in available function() vs.
what this device can do in bandwidth() function.
Alex G. May 31, 2018, 5:27 p.m. UTC | #12
On 05/31/2018 12:11 PM, Sinan Kaya wrote:
> On 5/31/2018 12:49 PM, Alex G. wrote:
>>> 	bw_cap = pcie_bandwidth_capable(dev, &speed_cap, &width_cap);
>>> 	bw_avail = pcie_bandwidth_available(dev, &limiting_dev, &speed, &width, *parent*);
>> That's confusing. I'd expect _capable() and _available() to be
>> symmetrical. They either both look at one link only, or both go down to
>> the root port. Though it seems _capable() is link-local, and
>> _available() is down to root port.
>>
> 
> As you know, link speed is a qualification of two devices speed capability.
> Both speed and width parameters get negotiated by two devices during TS1 and TS2
> ordered set exchange. 
> 
> You need to see what your link partner can support in available function() vs.
> what this device can do in bandwidth() function.

I see. I'm not sure I can use pcie_print_link_status() without some
major refactoring. I need to look at the capability of the device and its
downstream port. There's no point complaining that an x16 device is
running at x4 when the port is only x4 capable.

Let me think some more on this.

Alex
Alex G. May 31, 2018, 9:44 p.m. UTC | #13
On 05/31/2018 10:30 AM, Sinan Kaya wrote:
> On 5/31/2018 11:05 AM, Alexandru Gagniuc wrote:
>> +	if (dev_cur_speed < max_link_speed)
>> +		pci_warn(dev, "PCIe downtrain: link speed is %s (%s capable)",
>> +			 pcie_bus_speed_name(dev_cur_speed),
>> +			 pcie_bus_speed_name(max_link_speed));
>> +
> 
> Also this isn't quite correct. Target link speed is what the device tries to
> train. A device can try to train to much lower speed than the maximum on purpose.
> 
> It makes sense to print this if the speed that platform wants via target link
> speed is different from what is actually established though.

After seeing Gen 3 devices that train above the speed in the target link
speed field, I talked to one of the spec writers today. There is some
ambiguity with the target link speed field. In PCIe 4.0 they are
clarifying that to state that this field is "permitted to have no effect".

Alex
Alex G. May 31, 2018, 9:52 p.m. UTC | #14
On 05/31/2018 12:27 PM, Alex G. wrote:
> On 05/31/2018 12:11 PM, Sinan Kaya wrote:
>> On 5/31/2018 12:49 PM, Alex G. wrote:
>>>> 	bw_cap = pcie_bandwidth_capable(dev, &speed_cap, &width_cap);
>>>> 	bw_avail = pcie_bandwidth_available(dev, &limiting_dev, &speed, &width, *parent*);
>>> That's confusing. I'd expect _capable() and _available() to be
>>> symmetrical. They either both look at one link only, or both go down to
>>> the root port. Though it seems _capable() is link-local, and
>>> _available() is down to root port.
>>>
>>
>> As you know, link speed is a qualification of two devices speed capability.
>> Both speed and width parameters get negotiated by two devices during TS1 and TS2
>> ordered set exchange. 
>>
>> You need to see what your link partner can support in available function() vs.
>> what this device can do in bandwidth() function.
> 
> I see. I'm not sure I can use pcie_print_link_status() without some
> major refactoring. I need to look at capability of device and it
> downstream port. There's no point complaining that an x16 device is
> running at x4 when the port is only x4 capable.
> 
> Let me think some more on this.

I did some thinking, and I don't like using the same message for both
point-to-point links and full-tree links. From a userspace point of view
having an arbitrary terminator in the tree parsing is confusing. So
we're better of not modifying the behavior here.

On the other hand, just printing the link status on probe might be good
enough. If we're downtraining, but there's a slower link somewhere else,
it won't matter much that we're downtrained. Just calling the unmodified
pcie_print_link_status() will most of the time find the exact segment
that's also downtrained. So let's do this in v2

Alex

> Alex
>
Sinan Kaya June 1, 2018, 1:30 p.m. UTC | #15
On 5/31/2018 5:44 PM, Alex G. wrote:
> On 05/31/2018 10:30 AM, Sinan Kaya wrote:
>> On 5/31/2018 11:05 AM, Alexandru Gagniuc wrote:
>>> +	if (dev_cur_speed < max_link_speed)
>>> +		pci_warn(dev, "PCIe downtrain: link speed is %s (%s capable)",
>>> +			 pcie_bus_speed_name(dev_cur_speed),
>>> +			 pcie_bus_speed_name(max_link_speed));
>>> +
>>
>> Also this isn't quite correct. Target link speed is what the device tries to
>> train. A device can try to train to much lower speed than the maximum on purpose.
>>
>> It makes sense to print this if the speed that platform wants via target link
>> speed is different from what is actually established though.
> 
> After seeing Gen 3 devices that train above the speed in the target link
> speed field, I talked to one the spec writers today. There is some
> ambiguity with the target link speed field. In PCIe 4.0 they are
> clarifying that to state that this field is "permitted to have no effect".

Good to know.
Probably, some company screwed up implementing this register...
All these flexible terminology in the spec is an indication of such a failure.

> 
> Alex
> 
>
Pavel Machek June 2, 2018, 5:42 p.m. UTC | #16
On Thu 2018-05-31 10:05:33, Alexandru Gagniuc wrote:
> PCIe downtraining happens when both the device and PCIe port are
> capable of a larger bus width or higher speed than negotiated.
> Downtraining might be indicative of other problems in the system, and
> identifying this from userspace is neither intuitive, nor straigh
> forward.
> Instead, check for such conditions on device probe, and print an
> appropriate message.
> 

> +	if (dev_cur_width < max_link_width) {
> +		/* Lanes might not be routed, so use info instead of warn. */
> +		pci_info(dev, "PCIe downtrain: Port and device capable of x%d, but link running at x%d",
> +			 max_link_width, dev_cur_width);
> +	}

Would "warn" be right loglevel?
									Pavel
diff mbox series

Patch

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ac91b6fd0bcd..b58c5de70540 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2146,6 +2146,82 @@  static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
 	return dev;
 }
 
+static void pcie_max_link_cap(struct pci_dev *dev, enum pci_bus_speed *speed,
+				enum pcie_link_width *width)
+{
+	uint32_t lnkcap;
+
+	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
+
+	*speed = pcie_link_speed[lnkcap & PCI_EXP_LNKCAP_SLS];
+	*width = (lnkcap & PCI_EXP_LNKCAP_MLW) >> PCI_EXP_LNKCAP_MLW_SHIFT;
+}
+
+static void pcie_cur_link_sta(struct pci_dev *dev, enum pci_bus_speed *speed,
+				enum pcie_link_width *width)
+{
+	uint16_t lnksta;
+
+	pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
+	*speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS];
+	*width = (lnksta & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
+}
+
+static const char *pcie_bus_speed_name(enum pci_bus_speed speed)
+{
+	switch (speed) {
+	case PCIE_SPEED_2_5GT:
+		return "2.5 GT/s";
+	case PCIE_SPEED_5_0GT:
+		return "5.0 GT/s";
+	case PCIE_SPEED_8_0GT:
+		return "8.0 GT/s";
+	default:
+		return "unknown";
+	}
+}
+
+static void pcie_check_downtrain_errors(struct pci_dev *dev)
+{
+	enum pci_bus_speed dev_max_speed, dev_cur_speed;
+	enum pci_bus_speed max_link_speed, bus_max_speed;
+	enum pcie_link_width dev_cur_width, dev_max_width;
+	enum pcie_link_width bus_max_width, max_link_width;
+	struct pci_dev *uport = pci_upstream_bridge(dev);
+
+	if (!pci_is_pcie(dev) || !uport)
+		return;
+
+	/* Look from the device up to avoid downstream ports with no devices. */
+	if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ENDPOINT) &&
+	    (pci_pcie_type(dev) != PCI_EXP_TYPE_LEG_END) &&
+	    (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM))
+		return;
+
+	/* Multi-function PCIe share the same link/status. */
+	if (PCI_FUNC(dev->devfn) != 0)
+		return;
+
+	pcie_cur_link_sta(dev, &dev_cur_speed, &dev_cur_width);
+	pcie_max_link_cap(dev, &dev_max_speed, &dev_max_width);
+	pcie_max_link_cap(uport, &bus_max_speed, &bus_max_width);
+
+	max_link_speed = min(bus_max_speed, dev_max_speed);
+	max_link_width = min(bus_max_width, dev_max_width);
+
+
+	if (dev_cur_speed < max_link_speed)
+		pci_warn(dev, "PCIe downtrain: link speed is %s (%s capable)",
+			 pcie_bus_speed_name(dev_cur_speed),
+			 pcie_bus_speed_name(max_link_speed));
+
+	if (dev_cur_width < max_link_width) {
+		/* Lanes might not be routed, so use info instead of warn. */
+		pci_info(dev, "PCIe downtrain: Port and device capable of x%d, but link running at x%d",
+			 max_link_width, dev_cur_width);
+	}
+}
+
 static void pci_init_capabilities(struct pci_dev *dev)
 {
 	/* Enhanced Allocation */
@@ -2181,6 +2257,8 @@  static void pci_init_capabilities(struct pci_dev *dev)
 	/* Advanced Error Reporting */
 	pci_aer_init(dev);
 
+	pcie_check_downtrain_errors(dev);
+
 	if (pci_probe_reset_function(dev) == 0)
 		dev->reset_fn = 1;
 }
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba797a8f3..5557e6dfd05a 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -522,6 +522,7 @@ 
 #define  PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */
 #define  PCI_EXP_LNKCAP_SLS_16_0GB 0x00000004 /* LNKCAP2 SLS Vector bit 3 */
 #define  PCI_EXP_LNKCAP_MLW	0x000003f0 /* Maximum Link Width */
+#define  PCI_EXP_LNKCAP_MLW_SHIFT 4	/* start of MLW mask in link capabilities */
 #define  PCI_EXP_LNKCAP_ASPMS	0x00000c00 /* ASPM Support */
 #define  PCI_EXP_LNKCAP_L0SEL	0x00007000 /* L0s Exit Latency */
 #define  PCI_EXP_LNKCAP_L1EL	0x00038000 /* L1 Exit Latency */