diff mbox

pcie: Enhance PCIe links

Message ID 20130319222330.19359.2521.stgit@bling.home
State New
Headers show

Commit Message

Alex Williamson March 19, 2013, 10:24 p.m. UTC
Enable PCIe devices to negotiate links.  This upgrades our root ports
and switches to advertising x16, 8.0GT/s and negotiates the current
link status to the best available width and speed.  Note that we also
skip setting link fields for Root Complex Integrated Endpoints as
indicated by the PCIe spec.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

This depends on the previous pcie_endpoint_cap_init patch.

 hw/ioh3420.c            |    5 +-
 hw/pci/pcie.c           |  150 ++++++++++++++++++++++++++++++++++++++++++++---
 hw/pci/pcie.h           |    7 ++
 hw/pci/pcie_regs.h      |   17 +++++
 hw/usb/hcd-xhci.c       |    3 +
 hw/xio3130_downstream.c |    4 +
 hw/xio3130_upstream.c   |    4 +
 7 files changed, 173 insertions(+), 17 deletions(-)

Comments

Michael S. Tsirkin March 21, 2013, 4:56 p.m. UTC | #1
On Tue, Mar 19, 2013 at 04:24:47PM -0600, Alex Williamson wrote:
> Enable PCIe devices to negotiate links.  This upgrades our root ports
> and switches to advertising x16, 8.0GT/s and negotiates the current
> link status to the best available width and speed.  Note that we also
> skip setting link fields for Root Complex Integrated Endpoints as
> indicated by the PCIe spec.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> This depends on the previous pcie_endpoint_cap_init patch.
> 
>  hw/ioh3420.c            |    5 +-
>  hw/pci/pcie.c           |  150 ++++++++++++++++++++++++++++++++++++++++++++---
>  hw/pci/pcie.h           |    7 ++
>  hw/pci/pcie_regs.h      |   17 +++++
>  hw/usb/hcd-xhci.c       |    3 +
>  hw/xio3130_downstream.c |    4 +
>  hw/xio3130_upstream.c   |    4 +
>  7 files changed, 173 insertions(+), 17 deletions(-)
> 
> diff --git a/hw/ioh3420.c b/hw/ioh3420.c
> index 5cff61e..0aaec5b 100644
> --- a/hw/ioh3420.c
> +++ b/hw/ioh3420.c
> @@ -115,7 +115,10 @@ static int ioh3420_initfn(PCIDevice *d)
>      if (rc < 0) {
>          goto err_bridge;
>      }
> -    rc = pcie_cap_init(d, IOH_EP_EXP_OFFSET, PCI_EXP_TYPE_ROOT_PORT, p->port);
> +
> +    /* Real hardware only supports up to x4, 2.5GT/s, but we're not real hw */
> +    rc = pcie_cap_init(d, IOH_EP_EXP_OFFSET, PCI_EXP_TYPE_ROOT_PORT, p->port,
> +                       PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
>      if (rc < 0) {
>          goto err_msi;
>
I'd like to see some documentation about why this is needed/a good idea.
You could argue that some guest might be surprised if the card width
does not match reality but it could work in reverse too ...
Do you see some drivers complaining? Linux prints warnings sometimes -
is this what worries you?
Let's document the motivation here not only what's going on.

> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index 62bd0b8..c07d3cc 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -37,11 +37,98 @@
>  #define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
>      PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
>  
> +static uint16_t pcie_link_max_width(PCIDevice *dev)
> +{
> +    uint8_t *exp_cap;
> +    uint32_t lnkcap;
> +
> +    exp_cap = dev->config + dev->exp.exp_cap;
> +    lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> +
> +    return lnkcap & PCI_EXP_LNKCAP_MLW;
> +}
> +
> +static uint8_t pcie_link_speed_mask(PCIDevice *dev)
> +{
> +    uint8_t *exp_cap;
> +    uint32_t lnkcap, lnkcap2;
> +    uint8_t speeds, mask;
> +
> +    exp_cap = dev->config + dev->exp.exp_cap;
> +    lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> +    lnkcap2 = pci_get_long(exp_cap + PCI_EXP_LNKCAP2);
> +
> +    mask = (1 << (lnkcap & PCI_EXP_LNKCAP_SLS)) - 1;
> +
> +    /*
> +     * If LNKCAP2 reports supported link speeds, then LNKCAP indexes
> +     * the highest supported speed.  Mask out the rest and return.
> +     */
> +    speeds = (lnkcap2 & 0xfe) >> 1;

what's 0xfe?

> +    if (speeds) {
> +        return speeds & mask;
> +    }
> +
> +    /*
> +     * Otherwise LNKCAP returns the maximum speed and the device supports
> +     * all speeds below it.  This is really only valid for 2.5 & 5GT/s
> +     */
> +    return mask;
> +}
> +
> +void pcie_negotiate_link(PCIDevice *dev)
> +{
> +    PCIDevice *parent;
> +    uint16_t flags, width;
> +    uint8_t type, speed;
> +
> +    /* Skip non-express buses and Root Complex buses. */
> +    if (!pci_bus_is_express(dev->bus) || pci_bus_is_root(dev->bus)) {
> +        return;
> +    }
> +
> +    /*
> +     * Downstream ports don't negotiate with upstream ports, their link
> +     * is negotiated by whatever is attached downstream to them.  The
> +     * same is true of root ports, but root ports are always attached to
> +     * the root complex, so fall out above.
> +     */
> +    flags = pci_get_word(dev->config + dev->exp.exp_cap + PCI_EXP_FLAGS);
> +    type = (flags & PCI_EXP_FLAGS_TYPE) >> PCI_EXP_FLAGS_TYPE_SHIFT;
> +    if (type == PCI_EXP_TYPE_DOWNSTREAM) {
> +        return;
> +    }
> +
> +    parent = dev->bus->parent_dev;
> +
> +    assert(pci_is_express(dev) && dev->exp.exp_cap &&
> +           pci_is_express(parent) && parent->exp.exp_cap);
> +
> +    /* Devices support all widths below max, so use the MIN max */
> +    width = MIN(pcie_link_max_width(dev), pcie_link_max_width(parent));


So I see this in spec:
7.8.19.Link Control 2 Register (Offset 30h)


    9
    Hardware Autonomous Width Disable – When Set, this bit
    disables hardware from changing the Link width for reasons
    other than attempting to correct unreliable Link operation by
    reducing Link width.
    For a Multi-Function device associated with an Upstream Port,
    the bit in Function 0 is of type RW, and only Function 0 controls
    the component’s Link behavior. In all other Functions of that
    device, this bit is of type RsvdP.
    Components that do not implement the ability autonomously to
    change Link width are permitted to hardwire this bit to 0b.
    Default value of this bit is 0b.
    RW/RsvdP
    (see
    description)

So far we did not implement this according to rules:
we always used the 1x width so not ability to change
width.

Now that we do, we need to support the override?

Did not look but something like this could exist for speed too?

> +    /* Choose the highest speed supported by both devices */
> +    speed = ffs(pow2floor(pcie_link_speed_mask(dev) &
> +                          pcie_link_speed_mask(parent)));

What's all this trickery about? Not same as fls by chance?

> +
> +    pci_word_test_and_clear_mask(dev->config + dev->exp.exp_cap +
> +                                 PCI_EXP_LNKSTA,
> +                                 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
> +    pci_word_test_and_set_mask(dev->config + dev->exp.exp_cap + PCI_EXP_LNKSTA,
> +                               width | speed);

This looks strange. Should not be pci_set_word_by_mask,
since speed is not a mask itself?

> +
> +    pci_word_test_and_clear_mask(parent->config + parent->exp.exp_cap +
> +                                 PCI_EXP_LNKSTA,
> +                                 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
> +    pci_word_test_and_set_mask(parent->config + parent->exp.exp_cap +
> +                               PCI_EXP_LNKSTA,
> +                               width | speed);
> +}
>  
>  /***************************************************************************
>   * pci express capability helper functions
>   */
> -int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
> +int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port,
> +                  uint16_t width, uint8_t speed)
>  {
>      int pos;
>      uint8_t *exp_cap;
> @@ -71,14 +158,56 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
>       */
>      pci_set_long(exp_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER);
>  
> -    pci_set_long(exp_cap + PCI_EXP_LNKCAP,
> -                 (port << PCI_EXP_LNKCAP_PN_SHIFT) |
> -                 PCI_EXP_LNKCAP_ASPMS_0S |
> -                 PCI_EXP_LNK_MLW_1 |
> -                 PCI_EXP_LNK_LS_25);
> +    /* Root Complex devices don't report link fields */
> +    if (type != PCI_EXP_TYPE_RC_END) {
> +        /* User specifies the link port # */
> +        pci_set_long(exp_cap + PCI_EXP_LNKCAP, port << PCI_EXP_LNKCAP_PN_SHIFT);
> +
> +        pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, width | speed);
> +
> +        /* Advertise L0s ASPM support */
> +        pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
> +                                   PCI_EXP_LNKCAP_ASPMS_L0S);
>  
> -    pci_set_word(exp_cap + PCI_EXP_LNKSTA,
> -                 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25);
> +        /*
> +         * Link bandwidth notification is required for all root ports and
> +         * downstream ports supporting links wider than x1.
> +         */
> +        if (width > PCI_EXP_LNK_MLW_1 && (type == PCI_EXP_TYPE_ROOT_PORT ||
> +            type == PCI_EXP_TYPE_DOWNSTREAM)) {
> +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
> +                                       PCI_EXP_LNKCAP_LBNC);
> +        }
> +
> +        /* 8.0GT/s adds some requirements */
> +        if (speed >= PCI_EXP_LNK_LS_80) {
> +            /*
> +             * 2.5 & 5.0GT/s can be fully described by LNKCAP, but 8.0GT/s
> +             * is actually a reference to the highest bit supported in this
> +             * register.  We assume the device supports all link speeds.
> +             */
> +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
> +                                       PCI_EXP_LNK2_LS_25);
> +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
> +                                       PCI_EXP_LNK2_LS_50);
> +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
> +                                       PCI_EXP_LNK2_LS_80);

Let's just do a | here.

> +
> +            /*
> +             * Supporting 8.0GT/s requires that we advertise Data Link Layer
> +             * Active on all downstream ports supporting hotplug or speeds
> +             * great than 5GT/s

typo

> +             */
> +            if (type == PCI_EXP_TYPE_DOWNSTREAM) {
> +                pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
> +                                           PCI_EXP_LNKCAP_DLLLARC);
> +                pci_word_test_and_set_mask(exp_cap + PCI_EXP_LNKSTA,
> +                                           PCI_EXP_LNKSTA_DLLLA);
> +            }
> +        }
> +
> +        pcie_negotiate_link(dev);
> +    }
>  
>      pci_set_long(exp_cap + PCI_EXP_DEVCAP2,
>                   PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);
> @@ -87,7 +216,8 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
>      return pos;
>  }
>  
> -int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
> +int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset, uint16_t width,
> +                           uint8_t speed)
>  {
>      uint8_t type = PCI_EXP_TYPE_ENDPOINT;
>  
> @@ -100,7 +230,7 @@ int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
>          type = PCI_EXP_TYPE_RC_END;
>      }
>  
> -    return pcie_cap_init(dev, offset, type, 0);
> +    return pcie_cap_init(dev, offset, type, 0, width, speed);
>  }
>  
>  void pcie_cap_exit(PCIDevice *dev)
> diff --git a/hw/pci/pcie.h b/hw/pci/pcie.h
> index c010007..051dd76 100644
> --- a/hw/pci/pcie.h
> +++ b/hw/pci/pcie.h
> @@ -94,9 +94,12 @@ struct PCIExpressDevice {
>  };
>  
>  /* PCI express capability helper functions */
> -int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port);
> -int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset);
> +int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port,
> +                  uint16_t width, uint8_t speed);
> +int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset, uint16_t width,
> +                           uint8_t speed);
>  void pcie_cap_exit(PCIDevice *dev);
> +void pcie_negotiate_link(PCIDevice *dev);
>  uint8_t pcie_cap_get_type(const PCIDevice *dev);
>  void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector);
>  uint8_t pcie_cap_flags_get_vector(PCIDevice *dev);
> diff --git a/hw/pci/pcie_regs.h b/hw/pci/pcie_regs.h
> index 4d123d9..4078451 100644
> --- a/hw/pci/pcie_regs.h
> +++ b/hw/pci/pcie_regs.h
> @@ -34,13 +34,23 @@
>  /* PCI_EXP_LINK{CAP, STA} */
>  /* link speed */
>  #define PCI_EXP_LNK_LS_25               1
> +#define PCI_EXP_LNK_LS_50               2
> +#define PCI_EXP_LNK_LS_80               3
>  
>  #define PCI_EXP_LNK_MLW_SHIFT           (ffs(PCI_EXP_LNKCAP_MLW) - 1)
>  #define PCI_EXP_LNK_MLW_1               (1 << PCI_EXP_LNK_MLW_SHIFT)
> +#define PCI_EXP_LNK_MLW_2               (2 << PCI_EXP_LNK_MLW_SHIFT)
> +#define PCI_EXP_LNK_MLW_4               (4 << PCI_EXP_LNK_MLW_SHIFT)
> +#define PCI_EXP_LNK_MLW_8               (8 << PCI_EXP_LNK_MLW_SHIFT)
> +#define PCI_EXP_LNK_MLW_12              (12 << PCI_EXP_LNK_MLW_SHIFT)
> +#define PCI_EXP_LNK_MLW_16              (16 << PCI_EXP_LNK_MLW_SHIFT)
> +#define PCI_EXP_LNK_MLW_32              (32 << PCI_EXP_LNK_MLW_SHIFT)
>  
>  /* PCI_EXP_LINKCAP */
>  #define PCI_EXP_LNKCAP_ASPMS_SHIFT      (ffs(PCI_EXP_LNKCAP_ASPMS) - 1)
> -#define PCI_EXP_LNKCAP_ASPMS_0S         (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> +#define PCI_EXP_LNKCAP_ASPMS_L0S        (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> +#define PCI_EXP_LNKCAP_ASPMS_L1         (2 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> +#define PCI_EXP_LNKCAP_ASPMS_L0SL1      (3 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
>  
>  #define PCI_EXP_LNKCAP_PN_SHIFT         (ffs(PCI_EXP_LNKCAP_PN) - 1)
>  
> @@ -72,6 +82,11 @@
>  
>  #define PCI_EXP_DEVCTL2_EETLPPB         0x80
>  
> +#define PCI_EXP_LNKCAP2         44      /* Link Capabilities 2 */
> +#define PCI_EXP_LNK2_LS_25              (1 << 1)
> +#define PCI_EXP_LNK2_LS_50              (1 << 2)
> +#define PCI_EXP_LNK2_LS_80              (1 << 3)
> +
>  /* ARI */
>  #define PCI_ARI_VER                     1
>  #define PCI_ARI_SIZEOF                  8
> diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c
> index 5aa342b..5f57807 100644
> --- a/hw/usb/hcd-xhci.c
> +++ b/hw/usb/hcd-xhci.c
> @@ -3332,7 +3332,8 @@ static int usb_xhci_initfn(struct PCIDevice *dev)
>                       PCI_BASE_ADDRESS_SPACE_MEMORY|PCI_BASE_ADDRESS_MEM_TYPE_64,
>                       &xhci->mem);
>  
> -    ret = pcie_endpoint_cap_init(&xhci->pci_dev, 0xa0);
> +    ret = pcie_endpoint_cap_init(&xhci->pci_dev, 0xa0, PCI_EXP_LNK_MLW_1,
> +                                 PCI_EXP_LNK_LS_25);
>      assert(ret >= 0);
>  
>      if (xhci->flags & (1 << XHCI_FLAG_USE_MSI)) {
> diff --git a/hw/xio3130_downstream.c b/hw/xio3130_downstream.c
> index b868f56..086ada9 100644
> --- a/hw/xio3130_downstream.c
> +++ b/hw/xio3130_downstream.c
> @@ -79,8 +79,10 @@ static int xio3130_downstream_initfn(PCIDevice *d)
>      if (rc < 0) {
>          goto err_bridge;
>      }
> +
> +    /* Real hardware only supports x1, 2.5GT/s, but we're not real hw */

add "So set it to 8.0 GT/s because ... "

>      rc = pcie_cap_init(d, XIO3130_EXP_OFFSET, PCI_EXP_TYPE_DOWNSTREAM,
> -                       p->port);
> +                       p->port, PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
>      if (rc < 0) {
>          goto err_msi;
>      }
> diff --git a/hw/xio3130_upstream.c b/hw/xio3130_upstream.c
> index cd5d97d..4b6820b 100644
> --- a/hw/xio3130_upstream.c
> +++ b/hw/xio3130_upstream.c
> @@ -75,8 +75,10 @@ static int xio3130_upstream_initfn(PCIDevice *d)
>      if (rc < 0) {
>          goto err_bridge;
>      }
> +
> +    /* Real hardware only supports x1, 2.5GT/s, but we're not real hw */
>      rc = pcie_cap_init(d, XIO3130_EXP_OFFSET, PCI_EXP_TYPE_UPSTREAM,
> -                       p->port);
> +                       p->port, PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
>      if (rc < 0) {
>          goto err_msi;
>      }
Alex Williamson March 22, 2013, 3:25 p.m. UTC | #2
On Thu, 2013-03-21 at 18:56 +0200, Michael S. Tsirkin wrote:
> On Tue, Mar 19, 2013 at 04:24:47PM -0600, Alex Williamson wrote:
> > Enable PCIe devices to negotiate links.  This upgrades our root ports
> > and switches to advertising x16, 8.0GT/s and negotiates the current
> > link status to the best available width and speed.  Note that we also
> > skip setting link fields for Root Complex Integrated Endpoints as
> > indicated by the PCIe spec.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> > This depends on the previous pcie_endpoint_cap_init patch.
> > 
> >  hw/ioh3420.c            |    5 +-
> >  hw/pci/pcie.c           |  150 ++++++++++++++++++++++++++++++++++++++++++++---
> >  hw/pci/pcie.h           |    7 ++
> >  hw/pci/pcie_regs.h      |   17 +++++
> >  hw/usb/hcd-xhci.c       |    3 +
> >  hw/xio3130_downstream.c |    4 +
> >  hw/xio3130_upstream.c   |    4 +
> >  7 files changed, 173 insertions(+), 17 deletions(-)
> > 
> > diff --git a/hw/ioh3420.c b/hw/ioh3420.c
> > index 5cff61e..0aaec5b 100644
> > --- a/hw/ioh3420.c
> > +++ b/hw/ioh3420.c
> > @@ -115,7 +115,10 @@ static int ioh3420_initfn(PCIDevice *d)
> >      if (rc < 0) {
> >          goto err_bridge;
> >      }
> > -    rc = pcie_cap_init(d, IOH_EP_EXP_OFFSET, PCI_EXP_TYPE_ROOT_PORT, p->port);
> > +
> > +    /* Real hardware only supports up to x4, 2.5GT/s, but we're not real hw */
> > +    rc = pcie_cap_init(d, IOH_EP_EXP_OFFSET, PCI_EXP_TYPE_ROOT_PORT, p->port,
> > +                       PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
> >      if (rc < 0) {
> >          goto err_msi;
> >
> I'd like to see some documentation about why this is needed/a good idea.
> You could argue that some guest might be surprised if the card width
> does not match reality but it could work in reverse too ...
> Do you see some drivers complaining? Linux prints warnings sometimes -
> is this what worries you?
> Let's document the motivation here not only what's going on.

Ok, I can add motivation.  This is where I really wish we had a generic
switch that wouldn't risk having existing real world expectations.  The
base motivation though is to not create artificial bottlenecks.  If I
assign a graphics card to a guest, I want the link to negotiate to the
same bandwidth it has on the host.  I'm not entirely sure how to do that
yet, whether I should cap the device capabilities to it's current status
or whether I should force a negotiation at the current speed.  The
former may confuse drivers if they expect certain device capabilities,
the latter may cause drivers to attempt to renegotiate higher speeds.
The goal though is to have a virtual platform that advertises sufficient
speed on all ports to attach any real or virtual device.

Perhaps I should stick to hardware limitations for xio3130 & io3420 and
distill these drivers down to generic ones with x32 ports and 8GT/s.

> > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > index 62bd0b8..c07d3cc 100644
> > --- a/hw/pci/pcie.c
> > +++ b/hw/pci/pcie.c
> > @@ -37,11 +37,98 @@
> >  #define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
> >      PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
> >  
> > +static uint16_t pcie_link_max_width(PCIDevice *dev)
> > +{
> > +    uint8_t *exp_cap;
> > +    uint32_t lnkcap;
> > +
> > +    exp_cap = dev->config + dev->exp.exp_cap;
> > +    lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> > +
> > +    return lnkcap & PCI_EXP_LNKCAP_MLW;
> > +}
> > +
> > +static uint8_t pcie_link_speed_mask(PCIDevice *dev)
> > +{
> > +    uint8_t *exp_cap;
> > +    uint32_t lnkcap, lnkcap2;
> > +    uint8_t speeds, mask;
> > +
> > +    exp_cap = dev->config + dev->exp.exp_cap;
> > +    lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> > +    lnkcap2 = pci_get_long(exp_cap + PCI_EXP_LNKCAP2);
> > +
> > +    mask = (1 << (lnkcap & PCI_EXP_LNKCAP_SLS)) - 1;
> > +
> > +    /*
> > +     * If LNKCAP2 reports supported link speeds, then LNKCAP indexes
> > +     * the highest supported speed.  Mask out the rest and return.
> > +     */
> > +    speeds = (lnkcap2 & 0xfe) >> 1;
> 
> what's 0xfe?

Will add macro

> > +    if (speeds) {
> > +        return speeds & mask;
> > +    }
> > +
> > +    /*
> > +     * Otherwise LNKCAP returns the maximum speed and the device supports
> > +     * all speeds below it.  This is really only valid for 2.5 & 5GT/s
> > +     */
> > +    return mask;
> > +}
> > +
> > +void pcie_negotiate_link(PCIDevice *dev)
> > +{
> > +    PCIDevice *parent;
> > +    uint16_t flags, width;
> > +    uint8_t type, speed;
> > +
> > +    /* Skip non-express buses and Root Complex buses. */
> > +    if (!pci_bus_is_express(dev->bus) || pci_bus_is_root(dev->bus)) {
> > +        return;
> > +    }
> > +
> > +    /*
> > +     * Downstream ports don't negotiate with upstream ports, their link
> > +     * is negotiated by whatever is attached downstream to them.  The
> > +     * same is true of root ports, but root ports are always attached to
> > +     * the root complex, so fall out above.
> > +     */
> > +    flags = pci_get_word(dev->config + dev->exp.exp_cap + PCI_EXP_FLAGS);
> > +    type = (flags & PCI_EXP_FLAGS_TYPE) >> PCI_EXP_FLAGS_TYPE_SHIFT;
> > +    if (type == PCI_EXP_TYPE_DOWNSTREAM) {
> > +        return;
> > +    }
> > +
> > +    parent = dev->bus->parent_dev;
> > +
> > +    assert(pci_is_express(dev) && dev->exp.exp_cap &&
> > +           pci_is_express(parent) && parent->exp.exp_cap);
> > +
> > +    /* Devices support all widths below max, so use the MIN max */
> > +    width = MIN(pcie_link_max_width(dev), pcie_link_max_width(parent));
> 
> 
> So I see this in spec:
> 7.8.19.Link Control 2 Register (Offset 30h)
> 
> 
>     9
>     Hardware Autonomous Width Disable – When Set, this bit
>     disables hardware from changing the Link width for reasons
>     other than attempting to correct unreliable Link operation by
>     reducing Link width.
>     For a Multi-Function device associated with an Upstream Port,
>     the bit in Function 0 is of type RW, and only Function 0 controls
>     the component’s Link behavior. In all other Functions of that
>     device, this bit is of type RsvdP.
>     Components that do not implement the ability autonomously to
>     change Link width are permitted to hardwire this bit to 0b.
>     Default value of this bit is 0b.
>     RW/RsvdP
>     (see
>     description)
> 
> So far we did not implement this according to rules:
> we always used the 1x width so not ability to change
> width.
> 
> Now that we do, we need to support the override?
> 
> Did not look but something like this could exist for speed too?

We should definitely only have function 0 perform the link negotiation
with the upstream port, subsequent functions can just copy the result
from function 0.  Am I still missing something or is that sufficient?

> > +    /* Choose the highest speed supported by both devices */
> > +    speed = ffs(pow2floor(pcie_link_speed_mask(dev) &
> > +                          pcie_link_speed_mask(parent)));
> 
> What's all this trickery about? Not same as fls by chance?

Yes, I couldn't find fls, I'll see if the compiler can.  This rounds
down to the highest power of 2 then finds the first (only) set bit.

> > +
> > +    pci_word_test_and_clear_mask(dev->config + dev->exp.exp_cap +
> > +                                 PCI_EXP_LNKSTA,
> > +                                 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
> > +    pci_word_test_and_set_mask(dev->config + dev->exp.exp_cap + PCI_EXP_LNKSTA,
> > +                               width | speed);
> 
> This looks strange. Should not be pci_set_word_by_mask,
> since speed is not a mask itself?

Yep, it looks like set_word_by_mask would replace clear_mask, set_mask.

> > +
> > +    pci_word_test_and_clear_mask(parent->config + parent->exp.exp_cap +
> > +                                 PCI_EXP_LNKSTA,
> > +                                 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
> > +    pci_word_test_and_set_mask(parent->config + parent->exp.exp_cap +
> > +                               PCI_EXP_LNKSTA,
> > +                               width | speed);
> > +}
> >  
> >  /***************************************************************************
> >   * pci express capability helper functions
> >   */
> > -int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
> > +int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port,
> > +                  uint16_t width, uint8_t speed)
> >  {
> >      int pos;
> >      uint8_t *exp_cap;
> > @@ -71,14 +158,56 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
> >       */
> >      pci_set_long(exp_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER);
> >  
> > -    pci_set_long(exp_cap + PCI_EXP_LNKCAP,
> > -                 (port << PCI_EXP_LNKCAP_PN_SHIFT) |
> > -                 PCI_EXP_LNKCAP_ASPMS_0S |
> > -                 PCI_EXP_LNK_MLW_1 |
> > -                 PCI_EXP_LNK_LS_25);
> > +    /* Root Complex devices don't report link fields */
> > +    if (type != PCI_EXP_TYPE_RC_END) {
> > +        /* User specifies the link port # */
> > +        pci_set_long(exp_cap + PCI_EXP_LNKCAP, port << PCI_EXP_LNKCAP_PN_SHIFT);
> > +
> > +        pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, width | speed);
> > +
> > +        /* Advertise L0s ASPM support */
> > +        pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
> > +                                   PCI_EXP_LNKCAP_ASPMS_L0S);
> >  
> > -    pci_set_word(exp_cap + PCI_EXP_LNKSTA,
> > -                 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25);
> > +        /*
> > +         * Link bandwidth notification is required for all root ports and
> > +         * downstream ports supporting links wider than x1.
> > +         */
> > +        if (width > PCI_EXP_LNK_MLW_1 && (type == PCI_EXP_TYPE_ROOT_PORT ||
> > +            type == PCI_EXP_TYPE_DOWNSTREAM)) {
> > +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
> > +                                       PCI_EXP_LNKCAP_LBNC);
> > +        }
> > +
> > +        /* 8.0GT/s adds some requirements */
> > +        if (speed >= PCI_EXP_LNK_LS_80) {
> > +            /*
> > +             * 2.5 & 5.0GT/s can be fully described by LNKCAP, but 8.0GT/s
> > +             * is actually a reference to the highest bit supported in this
> > +             * register.  We assume the device supports all link speeds.
> > +             */
> > +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
> > +                                       PCI_EXP_LNK2_LS_25);
> > +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
> > +                                       PCI_EXP_LNK2_LS_50);
> > +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
> > +                                       PCI_EXP_LNK2_LS_80);
> 
> Let's just do a | here.

Ok

> > +
> > +            /*
> > +             * Supporting 8.0GT/s requires that we advertise Data Link Layer
> > +             * Active on all downstream ports supporting hotplug or speeds
> > +             * great than 5GT/s
> 
> typo
> 
> > +             */
> > +            if (type == PCI_EXP_TYPE_DOWNSTREAM) {
> > +                pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
> > +                                           PCI_EXP_LNKCAP_DLLLARC);
> > +                pci_word_test_and_set_mask(exp_cap + PCI_EXP_LNKSTA,
> > +                                           PCI_EXP_LNKSTA_DLLLA);
> > +            }
> > +        }
> > +
> > +        pcie_negotiate_link(dev);
> > +    }
> >  
> >      pci_set_long(exp_cap + PCI_EXP_DEVCAP2,
> >                   PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);
> > @@ -87,7 +216,8 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
> >      return pos;
> >  }
> >  
> > -int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
> > +int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset, uint16_t width,
> > +                           uint8_t speed)
> >  {
> >      uint8_t type = PCI_EXP_TYPE_ENDPOINT;
> >  
> > @@ -100,7 +230,7 @@ int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
> >          type = PCI_EXP_TYPE_RC_END;
> >      }
> >  
> > -    return pcie_cap_init(dev, offset, type, 0);
> > +    return pcie_cap_init(dev, offset, type, 0, width, speed);
> >  }
> >  
> >  void pcie_cap_exit(PCIDevice *dev)
> > diff --git a/hw/pci/pcie.h b/hw/pci/pcie.h
> > index c010007..051dd76 100644
> > --- a/hw/pci/pcie.h
> > +++ b/hw/pci/pcie.h
> > @@ -94,9 +94,12 @@ struct PCIExpressDevice {
> >  };
> >  
> >  /* PCI express capability helper functions */
> > -int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port);
> > -int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset);
> > +int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port,
> > +                  uint16_t width, uint8_t speed);
> > +int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset, uint16_t width,
> > +                           uint8_t speed);
> >  void pcie_cap_exit(PCIDevice *dev);
> > +void pcie_negotiate_link(PCIDevice *dev);
> >  uint8_t pcie_cap_get_type(const PCIDevice *dev);
> >  void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector);
> >  uint8_t pcie_cap_flags_get_vector(PCIDevice *dev);
> > diff --git a/hw/pci/pcie_regs.h b/hw/pci/pcie_regs.h
> > index 4d123d9..4078451 100644
> > --- a/hw/pci/pcie_regs.h
> > +++ b/hw/pci/pcie_regs.h
> > @@ -34,13 +34,23 @@
> >  /* PCI_EXP_LINK{CAP, STA} */
> >  /* link speed */
> >  #define PCI_EXP_LNK_LS_25               1
> > +#define PCI_EXP_LNK_LS_50               2
> > +#define PCI_EXP_LNK_LS_80               3
> >  
> >  #define PCI_EXP_LNK_MLW_SHIFT           (ffs(PCI_EXP_LNKCAP_MLW) - 1)
> >  #define PCI_EXP_LNK_MLW_1               (1 << PCI_EXP_LNK_MLW_SHIFT)
> > +#define PCI_EXP_LNK_MLW_2               (2 << PCI_EXP_LNK_MLW_SHIFT)
> > +#define PCI_EXP_LNK_MLW_4               (4 << PCI_EXP_LNK_MLW_SHIFT)
> > +#define PCI_EXP_LNK_MLW_8               (8 << PCI_EXP_LNK_MLW_SHIFT)
> > +#define PCI_EXP_LNK_MLW_12              (12 << PCI_EXP_LNK_MLW_SHIFT)
> > +#define PCI_EXP_LNK_MLW_16              (16 << PCI_EXP_LNK_MLW_SHIFT)
> > +#define PCI_EXP_LNK_MLW_32              (32 << PCI_EXP_LNK_MLW_SHIFT)
> >  
> >  /* PCI_EXP_LINKCAP */
> >  #define PCI_EXP_LNKCAP_ASPMS_SHIFT      (ffs(PCI_EXP_LNKCAP_ASPMS) - 1)
> > -#define PCI_EXP_LNKCAP_ASPMS_0S         (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> > +#define PCI_EXP_LNKCAP_ASPMS_L0S        (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> > +#define PCI_EXP_LNKCAP_ASPMS_L1         (2 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> > +#define PCI_EXP_LNKCAP_ASPMS_L0SL1      (3 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> >  
> >  #define PCI_EXP_LNKCAP_PN_SHIFT         (ffs(PCI_EXP_LNKCAP_PN) - 1)
> >  
> > @@ -72,6 +82,11 @@
> >  
> >  #define PCI_EXP_DEVCTL2_EETLPPB         0x80
> >  
> > +#define PCI_EXP_LNKCAP2         44      /* Link Capabilities 2 */
> > +#define PCI_EXP_LNK2_LS_25              (1 << 1)
> > +#define PCI_EXP_LNK2_LS_50              (1 << 2)
> > +#define PCI_EXP_LNK2_LS_80              (1 << 3)
> > +
> >  /* ARI */
> >  #define PCI_ARI_VER                     1
> >  #define PCI_ARI_SIZEOF                  8
> > diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c
> > index 5aa342b..5f57807 100644
> > --- a/hw/usb/hcd-xhci.c
> > +++ b/hw/usb/hcd-xhci.c
> > @@ -3332,7 +3332,8 @@ static int usb_xhci_initfn(struct PCIDevice *dev)
> >                       PCI_BASE_ADDRESS_SPACE_MEMORY|PCI_BASE_ADDRESS_MEM_TYPE_64,
> >                       &xhci->mem);
> >  
> > -    ret = pcie_endpoint_cap_init(&xhci->pci_dev, 0xa0);
> > +    ret = pcie_endpoint_cap_init(&xhci->pci_dev, 0xa0, PCI_EXP_LNK_MLW_1,
> > +                                 PCI_EXP_LNK_LS_25);
> >      assert(ret >= 0);
> >  
> >      if (xhci->flags & (1 << XHCI_FLAG_USE_MSI)) {
> > diff --git a/hw/xio3130_downstream.c b/hw/xio3130_downstream.c
> > index b868f56..086ada9 100644
> > --- a/hw/xio3130_downstream.c
> > +++ b/hw/xio3130_downstream.c
> > @@ -79,8 +79,10 @@ static int xio3130_downstream_initfn(PCIDevice *d)
> >      if (rc < 0) {
> >          goto err_bridge;
> >      }
> > +
> > +    /* Real hardware only supports x1, 2.5GT/s, but we're not real hw */
> 
> add "So set it to 8.0 GT/s because ... "

Point taken.  Thanks,

Alex

> >      rc = pcie_cap_init(d, XIO3130_EXP_OFFSET, PCI_EXP_TYPE_DOWNSTREAM,
> > -                       p->port);
> > +                       p->port, PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
> >      if (rc < 0) {
> >          goto err_msi;
> >      }
> > diff --git a/hw/xio3130_upstream.c b/hw/xio3130_upstream.c
> > index cd5d97d..4b6820b 100644
> > --- a/hw/xio3130_upstream.c
> > +++ b/hw/xio3130_upstream.c
> > @@ -75,8 +75,10 @@ static int xio3130_upstream_initfn(PCIDevice *d)
> >      if (rc < 0) {
> >          goto err_bridge;
> >      }
> > +
> > +    /* Real hardware only supports x1, 2.5GT/s, but we're not real hw */
> >      rc = pcie_cap_init(d, XIO3130_EXP_OFFSET, PCI_EXP_TYPE_UPSTREAM,
> > -                       p->port);
> > +                       p->port, PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
> >      if (rc < 0) {
> >          goto err_msi;
> >      }
Michael S. Tsirkin March 24, 2013, 9:14 a.m. UTC | #3
On Fri, Mar 22, 2013 at 09:25:13AM -0600, Alex Williamson wrote:
> On Thu, 2013-03-21 at 18:56 +0200, Michael S. Tsirkin wrote:
> > On Tue, Mar 19, 2013 at 04:24:47PM -0600, Alex Williamson wrote:
> > > Enable PCIe devices to negotiate links.  This upgrades our root ports
> > > and switches to advertising x16, 8.0GT/s and negotiates the current
> > > link status to the best available width and speed.  Note that we also
> > > skip setting link fields for Root Complex Integrated Endpoints as
> > > indicated by the PCIe spec.
> > > 
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > ---
> > > 
> > > This depends on the previous pcie_endpoint_cap_init patch.
> > > 
> > >  hw/ioh3420.c            |    5 +-
> > >  hw/pci/pcie.c           |  150 ++++++++++++++++++++++++++++++++++++++++++++---
> > >  hw/pci/pcie.h           |    7 ++
> > >  hw/pci/pcie_regs.h      |   17 +++++
> > >  hw/usb/hcd-xhci.c       |    3 +
> > >  hw/xio3130_downstream.c |    4 +
> > >  hw/xio3130_upstream.c   |    4 +
> > >  7 files changed, 173 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/hw/ioh3420.c b/hw/ioh3420.c
> > > index 5cff61e..0aaec5b 100644
> > > --- a/hw/ioh3420.c
> > > +++ b/hw/ioh3420.c
> > > @@ -115,7 +115,10 @@ static int ioh3420_initfn(PCIDevice *d)
> > >      if (rc < 0) {
> > >          goto err_bridge;
> > >      }
> > > -    rc = pcie_cap_init(d, IOH_EP_EXP_OFFSET, PCI_EXP_TYPE_ROOT_PORT, p->port);
> > > +
> > > +    /* Real hardware only supports up to x4, 2.5GT/s, but we're not real hw */
> > > +    rc = pcie_cap_init(d, IOH_EP_EXP_OFFSET, PCI_EXP_TYPE_ROOT_PORT, p->port,
> > > +                       PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
> > >      if (rc < 0) {
> > >          goto err_msi;
> > >
> > I'd like to see some documentation about why this is needed/a good idea.
> > You could argue that some guest might be surprised if the card width
> > does not match reality but it could work in reverse too ...
> > Do you see some drivers complaining? Linux prints warnings sometimes -
> > is this what worries you?
> > Let's document the motivation here not only what's going on.
> 
> Ok, I can add motivation.  This is where I really wish we had a generic
> switch that wouldn't risk having existing real world expectations.  The
> base motivation though is to not create artificial bottlenecks.  If I
> assign a graphics card to a guest, I want the link to negotiate to the
> same bandwidth it has on the host.  I'm not entirely sure how to do that
> yet, whether I should cap the device capabilities to it's current status
> or whether I should force a negotiation at the current speed.  The
> former may confuse drivers if they expect certain device capabilities,
> the latter may cause drivers to attempt to renegotiate higher speeds.
> The goal though is to have a virtual platform that advertises sufficient
> speed on all ports to attach any real or virtual device.
> 
> Perhaps I should stick to hardware limitations for xio3130 & io3420 and
> distill these drivers down to generic ones with x32 ports and 8GT/s.

Hmm this doesn't actually answer the question ...  Why does it matter
what is the width negotiated by the guest?

> > > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > > index 62bd0b8..c07d3cc 100644
> > > --- a/hw/pci/pcie.c
> > > +++ b/hw/pci/pcie.c
> > > @@ -37,11 +37,98 @@
> > >  #define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
> > >      PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
> > >  
> > > +static uint16_t pcie_link_max_width(PCIDevice *dev)
> > > +{
> > > +    uint8_t *exp_cap;
> > > +    uint32_t lnkcap;
> > > +
> > > +    exp_cap = dev->config + dev->exp.exp_cap;
> > > +    lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> > > +
> > > +    return lnkcap & PCI_EXP_LNKCAP_MLW;
> > > +}
> > > +
> > > +static uint8_t pcie_link_speed_mask(PCIDevice *dev)
> > > +{
> > > +    uint8_t *exp_cap;
> > > +    uint32_t lnkcap, lnkcap2;
> > > +    uint8_t speeds, mask;
> > > +
> > > +    exp_cap = dev->config + dev->exp.exp_cap;
> > > +    lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> > > +    lnkcap2 = pci_get_long(exp_cap + PCI_EXP_LNKCAP2);
> > > +
> > > +    mask = (1 << (lnkcap & PCI_EXP_LNKCAP_SLS)) - 1;
> > > +
> > > +    /*
> > > +     * If LNKCAP2 reports supported link speeds, then LNKCAP indexes
> > > +     * the highest supported speed.  Mask out the rest and return.
> > > +     */
> > > +    speeds = (lnkcap2 & 0xfe) >> 1;
> > 
> > what's 0xfe?
> 
> Will add macro
> 
> > > +    if (speeds) {
> > > +        return speeds & mask;
> > > +    }
> > > +
> > > +    /*
> > > +     * Otherwise LNKCAP returns the maximum speed and the device supports
> > > +     * all speeds below it.  This is really only valid for 2.5 & 5GT/s
> > > +     */
> > > +    return mask;
> > > +}
> > > +
> > > +void pcie_negotiate_link(PCIDevice *dev)
> > > +{
> > > +    PCIDevice *parent;
> > > +    uint16_t flags, width;
> > > +    uint8_t type, speed;
> > > +
> > > +    /* Skip non-express buses and Root Complex buses. */
> > > +    if (!pci_bus_is_express(dev->bus) || pci_bus_is_root(dev->bus)) {
> > > +        return;
> > > +    }
> > > +
> > > +    /*
> > > +     * Downstream ports don't negotiate with upstream ports, their link
> > > +     * is negotiated by whatever is attached downstream to them.  The
> > > +     * same is true of root ports, but root ports are always attached to
> > > +     * the root complex, so fall out above.
> > > +     */
> > > +    flags = pci_get_word(dev->config + dev->exp.exp_cap + PCI_EXP_FLAGS);
> > > +    type = (flags & PCI_EXP_FLAGS_TYPE) >> PCI_EXP_FLAGS_TYPE_SHIFT;
> > > +    if (type == PCI_EXP_TYPE_DOWNSTREAM) {
> > > +        return;
> > > +    }
> > > +
> > > +    parent = dev->bus->parent_dev;
> > > +
> > > +    assert(pci_is_express(dev) && dev->exp.exp_cap &&
> > > +           pci_is_express(parent) && parent->exp.exp_cap);
> > > +
> > > +    /* Devices support all widths below max, so use the MIN max */
> > > +    width = MIN(pcie_link_max_width(dev), pcie_link_max_width(parent));
> > 
> > 
> > So I see this in spec:
> > 7.8.19.Link Control 2 Register (Offset 30h)
> > 
> > 
> >     9
> >     Hardware Autonomous Width Disable – When Set, this bit
> >     disables hardware from changing the Link width for reasons
> >     other than attempting to correct unreliable Link operation by
> >     reducing Link width.
> >     For a Multi-Function device associated with an Upstream Port,
> >     the bit in Function 0 is of type RW, and only Function 0 controls
> >     the component’s Link behavior. In all other Functions of that
> >     device, this bit is of type RsvdP.
> >     Components that do not implement the ability autonomously to
> >     change Link width are permitted to hardwire this bit to 0b.
> >     Default value of this bit is 0b.
> >     RW/RsvdP
> >     (see
> >     description)
> > 
> > So far we did not implement this according to rules:
> > we always used the 1x width so not ability to change
> > width.
> > 
> > Now that we do, we need to support the override?
> > 
> > Did not look but something like this could exist for speed too?
> 
> We should definitely only have function 0 perform the link negotiation
> with the upstream port, subsequent functions can just copy the result
> from function 0.  Am I still missing something or is that sufficient?

We would also need to implement the bit above.
If we ever implement error reporting, we might
see devices reduce the width automatically
and need to implement that.

All of this is bypassed if we simply report x1 to guest ...

> > > +    /* Choose the highest speed supported by both devices */
> > > +    speed = ffs(pow2floor(pcie_link_speed_mask(dev) &
> > > +                          pcie_link_speed_mask(parent)));
> > 
> > What's all this trickery about? Not same as fls by chance?
> 
> Yes, I couldn't find fls, I'll see if the compiler can.  This rounds
> down to the highest power of 2 then finds the first (only) set bit.

It's in include/qemu-common.h

> > > +
> > > +    pci_word_test_and_clear_mask(dev->config + dev->exp.exp_cap +
> > > +                                 PCI_EXP_LNKSTA,
> > > +                                 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
> > > +    pci_word_test_and_set_mask(dev->config + dev->exp.exp_cap + PCI_EXP_LNKSTA,
> > > +                               width | speed);
> > 
> > This looks strange. Should not be pci_set_word_by_mask,
> > since speed is not a mask itself?
> 
> Yep, it looks like set_word_by_mask would replace clear_mask, set_mask.
> 
> > > +
> > > +    pci_word_test_and_clear_mask(parent->config + parent->exp.exp_cap +
> > > +                                 PCI_EXP_LNKSTA,
> > > +                                 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
> > > +    pci_word_test_and_set_mask(parent->config + parent->exp.exp_cap +
> > > +                               PCI_EXP_LNKSTA,
> > > +                               width | speed);
> > > +}
> > >  
> > >  /***************************************************************************
> > >   * pci express capability helper functions
> > >   */
> > > -int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
> > > +int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port,
> > > +                  uint16_t width, uint8_t speed)
> > >  {
> > >      int pos;
> > >      uint8_t *exp_cap;
> > > @@ -71,14 +158,56 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
> > >       */
> > >      pci_set_long(exp_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER);
> > >  
> > > -    pci_set_long(exp_cap + PCI_EXP_LNKCAP,
> > > -                 (port << PCI_EXP_LNKCAP_PN_SHIFT) |
> > > -                 PCI_EXP_LNKCAP_ASPMS_0S |
> > > -                 PCI_EXP_LNK_MLW_1 |
> > > -                 PCI_EXP_LNK_LS_25);
> > > +    /* Root Complex devices don't report link fields */
> > > +    if (type != PCI_EXP_TYPE_RC_END) {
> > > +        /* User specifies the link port # */
> > > +        pci_set_long(exp_cap + PCI_EXP_LNKCAP, port << PCI_EXP_LNKCAP_PN_SHIFT);
> > > +
> > > +        pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, width | speed);
> > > +
> > > +        /* Advertise L0s ASPM support */
> > > +        pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
> > > +                                   PCI_EXP_LNKCAP_ASPMS_L0S);
> > >  
> > > -    pci_set_word(exp_cap + PCI_EXP_LNKSTA,
> > > -                 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25);
> > > +        /*
> > > +         * Link bandwidth notification is required for all root ports and
> > > +         * downstream ports supporting links wider than x1.
> > > +         */
> > > +        if (width > PCI_EXP_LNK_MLW_1 && (type == PCI_EXP_TYPE_ROOT_PORT ||
> > > +            type == PCI_EXP_TYPE_DOWNSTREAM)) {
> > > +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
> > > +                                       PCI_EXP_LNKCAP_LBNC);
> > > +        }
> > > +
> > > +        /* 8.0GT/s adds some requirements */
> > > +        if (speed >= PCI_EXP_LNK_LS_80) {
> > > +            /*
> > > +             * 2.5 & 5.0GT/s can be fully described by LNKCAP, but 8.0GT/s
> > > +             * is actually a reference to the highest bit supported in this
> > > +             * register.  We assume the device supports all link speeds.
> > > +             */
> > > +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
> > > +                                       PCI_EXP_LNK2_LS_25);
> > > +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
> > > +                                       PCI_EXP_LNK2_LS_50);
> > > +            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
> > > +                                       PCI_EXP_LNK2_LS_80);
> > 
> > Let's just do a | here.
> 
> Ok
> 
> > > +
> > > +            /*
> > > +             * Supporting 8.0GT/s requires that we advertise Data Link Layer
> > > +             * Active on all downstream ports supporting hotplug or speeds
> > > +             * great than 5GT/s
> > 
> > typo
> > 
> > > +             */
> > > +            if (type == PCI_EXP_TYPE_DOWNSTREAM) {
> > > +                pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
> > > +                                           PCI_EXP_LNKCAP_DLLLARC);
> > > +                pci_word_test_and_set_mask(exp_cap + PCI_EXP_LNKSTA,
> > > +                                           PCI_EXP_LNKSTA_DLLLA);
> > > +            }
> > > +        }
> > > +
> > > +        pcie_negotiate_link(dev);
> > > +    }
> > >  
> > >      pci_set_long(exp_cap + PCI_EXP_DEVCAP2,
> > >                   PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);
> > > @@ -87,7 +216,8 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
> > >      return pos;
> > >  }
> > >  
> > > -int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
> > > +int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset, uint16_t width,
> > > +                           uint8_t speed)
> > >  {
> > >      uint8_t type = PCI_EXP_TYPE_ENDPOINT;
> > >  
> > > @@ -100,7 +230,7 @@ int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
> > >          type = PCI_EXP_TYPE_RC_END;
> > >      }
> > >  
> > > -    return pcie_cap_init(dev, offset, type, 0);
> > > +    return pcie_cap_init(dev, offset, type, 0, width, speed);
> > >  }
> > >  
> > >  void pcie_cap_exit(PCIDevice *dev)
> > > diff --git a/hw/pci/pcie.h b/hw/pci/pcie.h
> > > index c010007..051dd76 100644
> > > --- a/hw/pci/pcie.h
> > > +++ b/hw/pci/pcie.h
> > > @@ -94,9 +94,12 @@ struct PCIExpressDevice {
> > >  };
> > >  
> > >  /* PCI express capability helper functions */
> > > -int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port);
> > > -int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset);
> > > +int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port,
> > > +                  uint16_t width, uint8_t speed);
> > > +int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset, uint16_t width,
> > > +                           uint8_t speed);
> > >  void pcie_cap_exit(PCIDevice *dev);
> > > +void pcie_negotiate_link(PCIDevice *dev);
> > >  uint8_t pcie_cap_get_type(const PCIDevice *dev);
> > >  void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector);
> > >  uint8_t pcie_cap_flags_get_vector(PCIDevice *dev);
> > > diff --git a/hw/pci/pcie_regs.h b/hw/pci/pcie_regs.h
> > > index 4d123d9..4078451 100644
> > > --- a/hw/pci/pcie_regs.h
> > > +++ b/hw/pci/pcie_regs.h
> > > @@ -34,13 +34,23 @@
> > >  /* PCI_EXP_LINK{CAP, STA} */
> > >  /* link speed */
> > >  #define PCI_EXP_LNK_LS_25               1
> > > +#define PCI_EXP_LNK_LS_50               2
> > > +#define PCI_EXP_LNK_LS_80               3
> > >  
> > >  #define PCI_EXP_LNK_MLW_SHIFT           (ffs(PCI_EXP_LNKCAP_MLW) - 1)
> > >  #define PCI_EXP_LNK_MLW_1               (1 << PCI_EXP_LNK_MLW_SHIFT)
> > > +#define PCI_EXP_LNK_MLW_2               (2 << PCI_EXP_LNK_MLW_SHIFT)
> > > +#define PCI_EXP_LNK_MLW_4               (4 << PCI_EXP_LNK_MLW_SHIFT)
> > > +#define PCI_EXP_LNK_MLW_8               (8 << PCI_EXP_LNK_MLW_SHIFT)
> > > +#define PCI_EXP_LNK_MLW_12              (12 << PCI_EXP_LNK_MLW_SHIFT)
> > > +#define PCI_EXP_LNK_MLW_16              (16 << PCI_EXP_LNK_MLW_SHIFT)
> > > +#define PCI_EXP_LNK_MLW_32              (32 << PCI_EXP_LNK_MLW_SHIFT)
> > >  
> > >  /* PCI_EXP_LINKCAP */
> > >  #define PCI_EXP_LNKCAP_ASPMS_SHIFT      (ffs(PCI_EXP_LNKCAP_ASPMS) - 1)
> > > -#define PCI_EXP_LNKCAP_ASPMS_0S         (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> > > +#define PCI_EXP_LNKCAP_ASPMS_L0S        (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> > > +#define PCI_EXP_LNKCAP_ASPMS_L1         (2 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> > > +#define PCI_EXP_LNKCAP_ASPMS_L0SL1      (3 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
> > >  
> > >  #define PCI_EXP_LNKCAP_PN_SHIFT         (ffs(PCI_EXP_LNKCAP_PN) - 1)
> > >  
> > > @@ -72,6 +82,11 @@
> > >  
> > >  #define PCI_EXP_DEVCTL2_EETLPPB         0x80
> > >  
> > > +#define PCI_EXP_LNKCAP2         44      /* Link Capabilities 2 */
> > > +#define PCI_EXP_LNK2_LS_25              (1 << 1)
> > > +#define PCI_EXP_LNK2_LS_50              (1 << 2)
> > > +#define PCI_EXP_LNK2_LS_80              (1 << 3)
> > > +
> > >  /* ARI */
> > >  #define PCI_ARI_VER                     1
> > >  #define PCI_ARI_SIZEOF                  8
> > > diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c
> > > index 5aa342b..5f57807 100644
> > > --- a/hw/usb/hcd-xhci.c
> > > +++ b/hw/usb/hcd-xhci.c
> > > @@ -3332,7 +3332,8 @@ static int usb_xhci_initfn(struct PCIDevice *dev)
> > >                       PCI_BASE_ADDRESS_SPACE_MEMORY|PCI_BASE_ADDRESS_MEM_TYPE_64,
> > >                       &xhci->mem);
> > >  
> > > -    ret = pcie_endpoint_cap_init(&xhci->pci_dev, 0xa0);
> > > +    ret = pcie_endpoint_cap_init(&xhci->pci_dev, 0xa0, PCI_EXP_LNK_MLW_1,
> > > +                                 PCI_EXP_LNK_LS_25);
> > >      assert(ret >= 0);
> > >  
> > >      if (xhci->flags & (1 << XHCI_FLAG_USE_MSI)) {
> > > diff --git a/hw/xio3130_downstream.c b/hw/xio3130_downstream.c
> > > index b868f56..086ada9 100644
> > > --- a/hw/xio3130_downstream.c
> > > +++ b/hw/xio3130_downstream.c
> > > @@ -79,8 +79,10 @@ static int xio3130_downstream_initfn(PCIDevice *d)
> > >      if (rc < 0) {
> > >          goto err_bridge;
> > >      }
> > > +
> > > +    /* Real hardware only supports x1, 2.5GT/s, but we're not real hw */
> > 
> > add "So set it to 8.0 GT/s because ... "
> 
> Point taken.  Thanks,
> 
> Alex
> 
> > >      rc = pcie_cap_init(d, XIO3130_EXP_OFFSET, PCI_EXP_TYPE_DOWNSTREAM,
> > > -                       p->port);
> > > +                       p->port, PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
> > >      if (rc < 0) {
> > >          goto err_msi;
> > >      }
> > > diff --git a/hw/xio3130_upstream.c b/hw/xio3130_upstream.c
> > > index cd5d97d..4b6820b 100644
> > > --- a/hw/xio3130_upstream.c
> > > +++ b/hw/xio3130_upstream.c
> > > @@ -75,8 +75,10 @@ static int xio3130_upstream_initfn(PCIDevice *d)
> > >      if (rc < 0) {
> > >          goto err_bridge;
> > >      }
> > > +
> > > +    /* Real hardware only supports x1, 2.5GT/s, but we're not real hw */
> > >      rc = pcie_cap_init(d, XIO3130_EXP_OFFSET, PCI_EXP_TYPE_UPSTREAM,
> > > -                       p->port);
> > > +                       p->port, PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
> > >      if (rc < 0) {
> > >          goto err_msi;
> > >      }
> 
>
Alex Williamson March 25, 2013, 5:59 p.m. UTC | #4
On Sun, 2013-03-24 at 11:14 +0200, Michael S. Tsirkin wrote:
> On Fri, Mar 22, 2013 at 09:25:13AM -0600, Alex Williamson wrote:
> > On Thu, 2013-03-21 at 18:56 +0200, Michael S. Tsirkin wrote:
> > > On Tue, Mar 19, 2013 at 04:24:47PM -0600, Alex Williamson wrote:
> > > > Enable PCIe devices to negotiate links.  This upgrades our root ports
> > > > and switches to advertising x16, 8.0GT/s and negotiates the current
> > > > link status to the best available width and speed.  Note that we also
> > > > skip setting link fields for Root Complex Integrated Endpoints as
> > > > indicated by the PCIe spec.
> > > > 
> > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > ---
> > > > 
> > > > This depends on the previous pcie_endpoint_cap_init patch.
> > > > 
> > > >  hw/ioh3420.c            |    5 +-
> > > >  hw/pci/pcie.c           |  150 ++++++++++++++++++++++++++++++++++++++++++++---
> > > >  hw/pci/pcie.h           |    7 ++
> > > >  hw/pci/pcie_regs.h      |   17 +++++
> > > >  hw/usb/hcd-xhci.c       |    3 +
> > > >  hw/xio3130_downstream.c |    4 +
> > > >  hw/xio3130_upstream.c   |    4 +
> > > >  7 files changed, 173 insertions(+), 17 deletions(-)
> > > > 
> > > > diff --git a/hw/ioh3420.c b/hw/ioh3420.c
> > > > index 5cff61e..0aaec5b 100644
> > > > --- a/hw/ioh3420.c
> > > > +++ b/hw/ioh3420.c
> > > > @@ -115,7 +115,10 @@ static int ioh3420_initfn(PCIDevice *d)
> > > >      if (rc < 0) {
> > > >          goto err_bridge;
> > > >      }
> > > > -    rc = pcie_cap_init(d, IOH_EP_EXP_OFFSET, PCI_EXP_TYPE_ROOT_PORT, p->port);
> > > > +
> > > > +    /* Real hardware only supports up to x4, 2.5GT/s, but we're not real hw */
> > > > +    rc = pcie_cap_init(d, IOH_EP_EXP_OFFSET, PCI_EXP_TYPE_ROOT_PORT, p->port,
> > > > +                       PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
> > > >      if (rc < 0) {
> > > >          goto err_msi;
> > > >
> > > I'd like to see some documentation about why this is needed/a good idea.
> > > You could argue that some guest might be surprised if the card width
> > > does not match reality but it could work in reverse too ...
> > > Do you see some drivers complaining? Linux prints warnings sometimes -
> > > is this what worries you?
> > > Let's document the motivation here not only what's going on.
> > 
> > Ok, I can add motivation.  This is where I really wish we had a generic
> > switch that wouldn't risk having existing real world expectations.  The
> > base motivation though is to not create artificial bottlenecks.  If I
> > assign a graphics card to a guest, I want the link to negotiate to the
> > same bandwidth it has on the host.  I'm not entirely sure how to do that
> > yet, whether I should cap the device capabilities to it's current status
> > or whether I should force a negotiation at the current speed.  The
> > former may confuse drivers if they expect certain device capabilities,
> > the latter may cause drivers to attempt to renegotiate higher speeds.
> > The goal though is to have a virtual platform that advertises sufficient
> > speed on all ports to attach any real or virtual device.
> > 
> > Perhaps I should stick to hardware limitations for xio3130 & io3420 and
> > distill these drivers down to generic ones with x32 ports and 8GT/s.
> 
> Hmm this doesn't actually answer the question ...  Why does it matter
> what is the width negotiated by the guest?

Why does the Windows virtio-net driver report running at 10Gbps?  We
want speeds that make sense and don't impose artificial bottlenecks.
Additionally, we don't know what kinds of problems could result from an
inconsistent topology.  For example, if an assigned device reports a
link status of x16, 5GT/s and the root port reports status and
capability of x1, 2.5GT/s, what is the guest supposed to believe?  By
allowing root ports and switches to report the superset of all possible
devices we can virtually downshift to correctly report any device.
Should we wait to see if these problems materialize, or fix them
proactively?

I do concede that doing this on devices emulating physical hardware can
possibly lead to other incompatibilities as a chip specific driver for
the root port or switch might know these capabilities aren't physically
possible.  So I can limit the default to match physical hardware.

[snip]
> > > So I see this in spec:
> > > 7.8.19.Link Control 2 Register (Offset 30h)
> > > 
> > > 
> > >     9
> > >     Hardware Autonomous Width Disable – When Set, this bit
> > >     disables hardware from changing the Link width for reasons
> > >     other than attempting to correct unreliable Link operation by
> > >     reducing Link width.
> > >     For a Multi-Function device associated with an Upstream Port,
> > >     the bit in Function 0 is of type RW, and only Function 0 controls
> > >     the component’s Link behavior. In all other Functions of that
> > >     device, this bit is of type RsvdP.
> > >     Components that do not implement the ability autonomously to
> > >     change Link width are permitted to hardwire this bit to 0b.
> > >     Default value of this bit is 0b.
> > >     RW/RsvdP
> > >     (see
> > >     description)
> > > 
> > > So far we did not implement this according to rules:
> > > we always used the 1x width so not ability to change
> > > width.
> > > 
> > > Now that we do, we need to support the override?
> > > 
> > > Did not look but something like this could exist for speed too?
> > 
> > We should definitely only have function 0 perform the link negotiation
> > with the upstream port, subsequent functions can just copy the result
> > from function 0.  Am I still missing something or is that sufficient?
> 
> We would also need to implement the bit above.

Why?  Components are allowed to not implement it, which is what we do by
leaving it set it 0b.  For device assignment maybe we'll want to let a
write of that bit pass to hardware.

> If we ever implement error reporting, we might
> see devices reduce the width automatically
> and need to implement that.

And yes, assigned devices may change link speed and I expect as we
increase AER support we'll be able to notify for things like that.
Emulated devices have no reason to change link speed, so what I'm trying
for here is to get a proper initial link negotiation.

> All of this is bypassed if we simply report x1 to guest ...

So the argument is that x1 is easy?  I've never known that argument to
be very powerful ;)

> > > > +    /* Choose the highest speed supported by both devices */
> > > > +    speed = ffs(pow2floor(pcie_link_speed_mask(dev) &
> > > > +                          pcie_link_speed_mask(parent)));
> > > 
> > > What's all this trickery about? Not same as fls by chance?
> > 
> > Yes, I couldn't find fls, I'll see if the compiler can.  This rounds
> > down to the highest power of 2 then finds the first (only) set bit.
> 
> It's in include/qemu-common.h

Thanks, found it: qemu_fls.  Thanks,

Alex
diff mbox

Patch

diff --git a/hw/ioh3420.c b/hw/ioh3420.c
index 5cff61e..0aaec5b 100644
--- a/hw/ioh3420.c
+++ b/hw/ioh3420.c
@@ -115,7 +115,10 @@  static int ioh3420_initfn(PCIDevice *d)
     if (rc < 0) {
         goto err_bridge;
     }
-    rc = pcie_cap_init(d, IOH_EP_EXP_OFFSET, PCI_EXP_TYPE_ROOT_PORT, p->port);
+
+    /* Real hardware only supports up to x4, 2.5GT/s, but we're not real hw */
+    rc = pcie_cap_init(d, IOH_EP_EXP_OFFSET, PCI_EXP_TYPE_ROOT_PORT, p->port,
+                       PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
     if (rc < 0) {
         goto err_msi;
     }
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 62bd0b8..c07d3cc 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -37,11 +37,98 @@ 
 #define PCIE_DEV_PRINTF(dev, fmt, ...)                                  \
     PCIE_DPRINTF("%s:%x "fmt, (dev)->name, (dev)->devfn, ## __VA_ARGS__)
 
+static uint16_t pcie_link_max_width(PCIDevice *dev)
+{
+    uint8_t *exp_cap;
+    uint32_t lnkcap;
+
+    exp_cap = dev->config + dev->exp.exp_cap;
+    lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
+
+    return lnkcap & PCI_EXP_LNKCAP_MLW;
+}
+
+static uint8_t pcie_link_speed_mask(PCIDevice *dev)
+{
+    uint8_t *exp_cap;
+    uint32_t lnkcap, lnkcap2;
+    uint8_t speeds, mask;
+
+    exp_cap = dev->config + dev->exp.exp_cap;
+    lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
+    lnkcap2 = pci_get_long(exp_cap + PCI_EXP_LNKCAP2);
+
+    mask = (1 << (lnkcap & PCI_EXP_LNKCAP_SLS)) - 1;
+
+    /*
+     * If LNKCAP2 reports supported link speeds, then LNKCAP indexes
+     * the highest supported speed.  Mask out the rest and return.
+     */
+    speeds = (lnkcap2 & 0xfe) >> 1;
+    if (speeds) {
+        return speeds & mask;
+    }
+
+    /*
+     * Otherwise LNKCAP returns the maximum speed and the device supports
+     * all speeds below it.  This is really only valid for 2.5 & 5GT/s
+     */
+    return mask;
+}
+
+void pcie_negotiate_link(PCIDevice *dev)
+{
+    PCIDevice *parent;
+    uint16_t flags, width;
+    uint8_t type, speed;
+
+    /* Skip non-express buses and Root Complex buses. */
+    if (!pci_bus_is_express(dev->bus) || pci_bus_is_root(dev->bus)) {
+        return;
+    }
+
+    /*
+     * Downstream ports don't negotiate with upstream ports, their link
+     * is negotiated by whatever is attached downstream to them.  The
+     * same is true of root ports, but root ports are always attached to
+     * the root complex, so fall out above.
+     */
+    flags = pci_get_word(dev->config + dev->exp.exp_cap + PCI_EXP_FLAGS);
+    type = (flags & PCI_EXP_FLAGS_TYPE) >> PCI_EXP_FLAGS_TYPE_SHIFT;
+    if (type == PCI_EXP_TYPE_DOWNSTREAM) {
+        return;
+    }
+
+    parent = dev->bus->parent_dev;
+
+    assert(pci_is_express(dev) && dev->exp.exp_cap &&
+           pci_is_express(parent) && parent->exp.exp_cap);
+
+    /* Devices support all widths below max, so use the MIN max */
+    width = MIN(pcie_link_max_width(dev), pcie_link_max_width(parent));
+    /* Choose the highest speed supported by both devices */
+    speed = ffs(pow2floor(pcie_link_speed_mask(dev) &
+                          pcie_link_speed_mask(parent)));
+
+    pci_word_test_and_clear_mask(dev->config + dev->exp.exp_cap +
+                                 PCI_EXP_LNKSTA,
+                                 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
+    pci_word_test_and_set_mask(dev->config + dev->exp.exp_cap + PCI_EXP_LNKSTA,
+                               width | speed);
+
+    pci_word_test_and_clear_mask(parent->config + parent->exp.exp_cap +
+                                 PCI_EXP_LNKSTA,
+                                 PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
+    pci_word_test_and_set_mask(parent->config + parent->exp.exp_cap +
+                               PCI_EXP_LNKSTA,
+                               width | speed);
+}
 
 /***************************************************************************
  * pci express capability helper functions
  */
-int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
+int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port,
+                  uint16_t width, uint8_t speed)
 {
     int pos;
     uint8_t *exp_cap;
@@ -71,14 +158,56 @@  int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
      */
     pci_set_long(exp_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER);
 
-    pci_set_long(exp_cap + PCI_EXP_LNKCAP,
-                 (port << PCI_EXP_LNKCAP_PN_SHIFT) |
-                 PCI_EXP_LNKCAP_ASPMS_0S |
-                 PCI_EXP_LNK_MLW_1 |
-                 PCI_EXP_LNK_LS_25);
+    /* Root Complex devices don't report link fields */
+    if (type != PCI_EXP_TYPE_RC_END) {
+        /* User specifies the link port # */
+        pci_set_long(exp_cap + PCI_EXP_LNKCAP, port << PCI_EXP_LNKCAP_PN_SHIFT);
+
+        pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, width | speed);
+
+        /* Advertise L0s ASPM support */
+        pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
+                                   PCI_EXP_LNKCAP_ASPMS_L0S);
 
-    pci_set_word(exp_cap + PCI_EXP_LNKSTA,
-                 PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25);
+        /*
+         * Link bandwidth notification is required for all root ports and
+         * downstream ports supporting links wider than x1.
+         */
+        if (width > PCI_EXP_LNK_MLW_1 && (type == PCI_EXP_TYPE_ROOT_PORT ||
+            type == PCI_EXP_TYPE_DOWNSTREAM)) {
+            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
+                                       PCI_EXP_LNKCAP_LBNC);
+        }
+
+        /* 8.0GT/s adds some requirements */
+        if (speed >= PCI_EXP_LNK_LS_80) {
+            /*
+             * 2.5 & 5.0GT/s can be fully described by LNKCAP, but 8.0GT/s
+             * is actually a reference to the highest bit supported in this
+             * register.  We assume the device supports all link speeds.
+             */
+            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
+                                       PCI_EXP_LNK2_LS_25);
+            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
+                                       PCI_EXP_LNK2_LS_50);
+            pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP2,
+                                       PCI_EXP_LNK2_LS_80);
+
+            /*
+             * Supporting 8.0GT/s requires that we advertise Data Link Layer
+             * Active on all downstream ports supporting hotplug or speeds
+             * great than 5GT/s
+             */
+            if (type == PCI_EXP_TYPE_DOWNSTREAM) {
+                pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP,
+                                           PCI_EXP_LNKCAP_DLLLARC);
+                pci_word_test_and_set_mask(exp_cap + PCI_EXP_LNKSTA,
+                                           PCI_EXP_LNKSTA_DLLLA);
+            }
+        }
+
+        pcie_negotiate_link(dev);
+    }
 
     pci_set_long(exp_cap + PCI_EXP_DEVCAP2,
                  PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);
@@ -87,7 +216,8 @@  int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port)
     return pos;
 }
 
-int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
+int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset, uint16_t width,
+                           uint8_t speed)
 {
     uint8_t type = PCI_EXP_TYPE_ENDPOINT;
 
@@ -100,7 +230,7 @@  int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset)
         type = PCI_EXP_TYPE_RC_END;
     }
 
-    return pcie_cap_init(dev, offset, type, 0);
+    return pcie_cap_init(dev, offset, type, 0, width, speed);
 }
 
 void pcie_cap_exit(PCIDevice *dev)
diff --git a/hw/pci/pcie.h b/hw/pci/pcie.h
index c010007..051dd76 100644
--- a/hw/pci/pcie.h
+++ b/hw/pci/pcie.h
@@ -94,9 +94,12 @@  struct PCIExpressDevice {
 };
 
 /* PCI express capability helper functions */
-int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port);
-int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset);
+int pcie_cap_init(PCIDevice *dev, uint8_t offset, uint8_t type, uint8_t port,
+                  uint16_t width, uint8_t speed);
+int pcie_endpoint_cap_init(PCIDevice *dev, uint8_t offset, uint16_t width,
+                           uint8_t speed);
 void pcie_cap_exit(PCIDevice *dev);
+void pcie_negotiate_link(PCIDevice *dev);
 uint8_t pcie_cap_get_type(const PCIDevice *dev);
 void pcie_cap_flags_set_vector(PCIDevice *dev, uint8_t vector);
 uint8_t pcie_cap_flags_get_vector(PCIDevice *dev);
diff --git a/hw/pci/pcie_regs.h b/hw/pci/pcie_regs.h
index 4d123d9..4078451 100644
--- a/hw/pci/pcie_regs.h
+++ b/hw/pci/pcie_regs.h
@@ -34,13 +34,23 @@ 
 /* PCI_EXP_LINK{CAP, STA} */
 /* link speed */
 #define PCI_EXP_LNK_LS_25               1
+#define PCI_EXP_LNK_LS_50               2
+#define PCI_EXP_LNK_LS_80               3
 
 #define PCI_EXP_LNK_MLW_SHIFT           (ffs(PCI_EXP_LNKCAP_MLW) - 1)
 #define PCI_EXP_LNK_MLW_1               (1 << PCI_EXP_LNK_MLW_SHIFT)
+#define PCI_EXP_LNK_MLW_2               (2 << PCI_EXP_LNK_MLW_SHIFT)
+#define PCI_EXP_LNK_MLW_4               (4 << PCI_EXP_LNK_MLW_SHIFT)
+#define PCI_EXP_LNK_MLW_8               (8 << PCI_EXP_LNK_MLW_SHIFT)
+#define PCI_EXP_LNK_MLW_12              (12 << PCI_EXP_LNK_MLW_SHIFT)
+#define PCI_EXP_LNK_MLW_16              (16 << PCI_EXP_LNK_MLW_SHIFT)
+#define PCI_EXP_LNK_MLW_32              (32 << PCI_EXP_LNK_MLW_SHIFT)
 
 /* PCI_EXP_LINKCAP */
 #define PCI_EXP_LNKCAP_ASPMS_SHIFT      (ffs(PCI_EXP_LNKCAP_ASPMS) - 1)
-#define PCI_EXP_LNKCAP_ASPMS_0S         (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
+#define PCI_EXP_LNKCAP_ASPMS_L0S        (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
+#define PCI_EXP_LNKCAP_ASPMS_L1         (2 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
+#define PCI_EXP_LNKCAP_ASPMS_L0SL1      (3 << PCI_EXP_LNKCAP_ASPMS_SHIFT)
 
 #define PCI_EXP_LNKCAP_PN_SHIFT         (ffs(PCI_EXP_LNKCAP_PN) - 1)
 
@@ -72,6 +82,11 @@ 
 
 #define PCI_EXP_DEVCTL2_EETLPPB         0x80
 
+#define PCI_EXP_LNKCAP2         44      /* Link Capabilities 2 */
+#define PCI_EXP_LNK2_LS_25              (1 << 1)
+#define PCI_EXP_LNK2_LS_50              (1 << 2)
+#define PCI_EXP_LNK2_LS_80              (1 << 3)
+
 /* ARI */
 #define PCI_ARI_VER                     1
 #define PCI_ARI_SIZEOF                  8
diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c
index 5aa342b..5f57807 100644
--- a/hw/usb/hcd-xhci.c
+++ b/hw/usb/hcd-xhci.c
@@ -3332,7 +3332,8 @@  static int usb_xhci_initfn(struct PCIDevice *dev)
                      PCI_BASE_ADDRESS_SPACE_MEMORY|PCI_BASE_ADDRESS_MEM_TYPE_64,
                      &xhci->mem);
 
-    ret = pcie_endpoint_cap_init(&xhci->pci_dev, 0xa0);
+    ret = pcie_endpoint_cap_init(&xhci->pci_dev, 0xa0, PCI_EXP_LNK_MLW_1,
+                                 PCI_EXP_LNK_LS_25);
     assert(ret >= 0);
 
     if (xhci->flags & (1 << XHCI_FLAG_USE_MSI)) {
diff --git a/hw/xio3130_downstream.c b/hw/xio3130_downstream.c
index b868f56..086ada9 100644
--- a/hw/xio3130_downstream.c
+++ b/hw/xio3130_downstream.c
@@ -79,8 +79,10 @@  static int xio3130_downstream_initfn(PCIDevice *d)
     if (rc < 0) {
         goto err_bridge;
     }
+
+    /* Real hardware only supports x1, 2.5GT/s, but we're not real hw */
     rc = pcie_cap_init(d, XIO3130_EXP_OFFSET, PCI_EXP_TYPE_DOWNSTREAM,
-                       p->port);
+                       p->port, PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
     if (rc < 0) {
         goto err_msi;
     }
diff --git a/hw/xio3130_upstream.c b/hw/xio3130_upstream.c
index cd5d97d..4b6820b 100644
--- a/hw/xio3130_upstream.c
+++ b/hw/xio3130_upstream.c
@@ -75,8 +75,10 @@  static int xio3130_upstream_initfn(PCIDevice *d)
     if (rc < 0) {
         goto err_bridge;
     }
+
+    /* Real hardware only supports x1, 2.5GT/s, but we're not real hw */
     rc = pcie_cap_init(d, XIO3130_EXP_OFFSET, PCI_EXP_TYPE_UPSTREAM,
-                       p->port);
+                       p->port, PCI_EXP_LNK_MLW_16, PCI_EXP_LNK_LS_80);
     if (rc < 0) {
         goto err_msi;
     }