Patchwork [RFC,8/8] device-assignment: pass through and stub more PCI caps

login
register
mail settings
Submitter Alex Williamson
Date Nov. 12, 2010, 2:56 a.m.
Message ID <20101112025634.31423.56735.stgit@s20.home>
Download mbox | patch
Permalink /patch/70922/
State New
Headers show

Comments

Alex Williamson - Nov. 12, 2010, 2:56 a.m.
Some drivers depend on finding capabilities like power management,
PCI express/X, vital product data, or vendor specific fields.  Now
that we have better capability support, we can pass more of these
tables through to the guest.  Note that VPD and VNDR are direct pass
through capabilies, the rest are mostly empty shells with a few
writable bits where necessary.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 hw/device-assignment.c |  160 +++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 149 insertions(+), 11 deletions(-)
Michael S. Tsirkin - Nov. 12, 2010, 5:36 a.m.
On Thu, Nov 11, 2010 at 07:56:46PM -0700, Alex Williamson wrote:
> Some drivers depend on finding capabilities like power management,
> PCI express/X, vital product data, or vendor specific fields.  Now
> that we have better capability support, we can pass more of these
> tables through to the guest.  Note that VPD and VNDR are direct pass
> through capabilies, the rest are mostly empty shells with a few
> writable bits where necessary.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  hw/device-assignment.c |  160 +++++++++++++++++++++++++++++++++++++++++++++---
>  1 files changed, 149 insertions(+), 11 deletions(-)
> 
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 179c7dc..1b228ad 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -366,6 +366,27 @@ static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
>      return (uint8_t)assigned_dev_pci_read(d, pos, 1);
>  }
>  
> +static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
> +{
> +    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
> +    ssize_t ret;
> +    int fd = pci_dev->real_device.config_fd;
> +
> +again:
> +    ret = pwrite(fd, &val, len, pos);
> +    if (ret != len) {
> +	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> +	    goto again;


do {} while() ?

> +
> +	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
> +		__func__, ret, errno);
> +
> +	exit(1);
> +    }
> +
> +    return;
> +}
> +
>  static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap)
>  {
>      int id;
> @@ -1244,37 +1265,75 @@ static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
>  #endif
>  #endif
>  
> +static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
> +                                                    uint8_t cap_id,
> +                                                    uint32_t address, int len)
> +{
> +    uint8_t cap;
> +
> +    switch (cap_id) {
> +
> +    case PCI_CAP_ID_VPD:
> +        cap = pci_find_capability(pci_dev, cap_id);
> +        if (address - cap >= PCI_CAP_FLAGS) {
> +            return assigned_dev_pci_read(pci_dev, address, len);
> +        }
> +        break;
> +
> +    case PCI_CAP_ID_VNDR:
> +        cap = pci_find_capability(pci_dev, cap_id);
> +        if (address - cap > PCI_CAP_FLAGS) {
> +            return assigned_dev_pci_read(pci_dev, address, len);
> +        }
> +        break;
> +    }
> +
> +    return pci_default_cap_read_config(pci_dev, cap_id, address, len);
> +}
> +
>  static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
>                                                   uint8_t cap_id,
>                                                   uint32_t address,
>                                                   uint32_t val, int len)
>  {
> +    uint8_t cap;
> +
>      pci_default_cap_write_config(pci_dev, cap_id, address, val, len);
>  
>      switch (cap_id) {
>  #ifdef KVM_CAP_IRQ_ROUTING
>      case PCI_CAP_ID_MSI:
>  #ifdef KVM_CAP_DEVICE_MSI
> -        {
> -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> -            if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> -                assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> -            }
> +        cap = pci_find_capability(pci_dev, cap_id);
> +        if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> +            assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
>          }
>  #endif
>          break;
>  
>      case PCI_CAP_ID_MSIX:
>  #ifdef KVM_CAP_DEVICE_MSIX
> -        {
> -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> -            if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> -                assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> -            }
> +        cap = pci_find_capability(pci_dev, cap_id);
> +        if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> +            assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
>          }
>  #endif
>          break;
>  #endif
> +
> +    case PCI_CAP_ID_VPD:
> +        cap = pci_find_capability(pci_dev, cap_id);
> +        if (address - cap >= PCI_CAP_FLAGS) {
> +            assigned_dev_pci_write(pci_dev, address, val, len);
> +        }
> +        break;
> +
> +    case PCI_CAP_ID_VNDR:
> +        cap = pci_find_capability(pci_dev, cap_id);
> +        if (address - cap > PCI_CAP_FLAGS) {
> +            assigned_dev_pci_write(pci_dev, address, val, len);
> +        }
> +        break;

I have a feeling we should use overlap functions instead of
address math. What do you think?
Also - put cap offsets in assigned device structure to avoid
find calls?

>      }
>      return;
>  }
> @@ -1340,6 +1399,84 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
>  #endif
>  #endif
>  
> +    /* Minimal PM support, make the state bits writable so the guest
> +     * thinks it's doing something. */
> +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM))) {
> +        uint16_t pmc, pmcsr;
> +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PM, pos,
> +                                     PCI_PM_SIZEOF);
> +
> +        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
> +        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
> +        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
> +
> +        pmcsr = pci_get_word(pci_dev->config + pos + PCI_PM_CTRL);
> +        pmcsr &= (PCI_PM_CTRL_STATE_MASK);
> +        pmcsr |= PCI_PM_CTRL_NO_SOFT_RST;
> +        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL, pmcsr);
> +
> +        pci_set_word(pci_dev->wmask + pos + PCI_PM_CTRL,
> +                     PCI_PM_CTRL_STATE_MASK);
> +    }

we don't pass anything to device. So - can this be put in pci_pm.c
so that emulated devices can use this too?

> +
> +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP))) {
> +        uint16_t devctl, lnkcap, lnksta;
> +
> +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_EXP, pos, 0x40);
> +
> +        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
> +        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
> +                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
> +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
> +        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
> +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
> +
> +        lnkcap = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKCAP);
> +        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
> +                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
> +                   PCI_EXP_LNKCAP_L1EL);
> +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
> +
> +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_LNKCAP,
> +                     PCI_EXP_LNKCTL_ASPMC | PCI_EXP_LNKCTL_RCB |
> +                     PCI_EXP_LNKCTL_CCC | PCI_EXP_LNKCTL_ES |
> +                     PCI_EXP_LNKCTL_CLKREQ_EN | PCI_EXP_LNKCTL_HAWD);
> +        
> +        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
> +        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
> +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
> +
> +        pci_set_word(pci_dev->wmask + pos + 0x28, 0x1f);
> +        pci_set_word(pci_dev->wmask + pos + 0x30, 0xfbf);

This seems to overlap functionally with the express work upstream.
Can code from there be reused?  I also wonder whether is affects the
guest OS if it finds an express device on a non-express bridge.

> +    }
> +
> +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD))) {
> +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VPD, pos, 0x8);
> +    }
> +
> +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR))) {
> +        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
> +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VNDR, pos, len);
> +    }
> +
> +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX))) {
> +        uint16_t cmd;
> +        uint32_t status;
> +
> +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PCIX, pos, 0x8);
> +
> +        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
> +        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
> +                PCI_X_CMD_MAX_SPLIT);
> +        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
> +
> +        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
> +        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
> +        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
> +        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
> +                    PCI_X_STATUS_SPL_ERR);
> +        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
> +    }

This will be handy for non-assignment case so
I'd like to see this moved out of device-assignment.c:
we could create pcix.c or just add to pci.c.

>      return 0;
>  }
>  
> @@ -1466,7 +1603,8 @@ static int assigned_initfn(struct PCIDevice *pci_dev)
>      dev->h_busnr = dev->host.bus;
>      dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func);
>  
> -    pci_register_capability_handlers(pci_dev, NULL,
> +    pci_register_capability_handlers(pci_dev,
> +                                     assigned_device_pci_cap_read_config,
>                                       assigned_device_pci_cap_write_config);

Maybe these could go away?

>  
>      if (assigned_device_pci_cap_init(pci_dev) < 0)
Alex Williamson - Nov. 12, 2010, 6:30 a.m.
On Fri, 2010-11-12 at 07:36 +0200, Michael S. Tsirkin wrote:
> On Thu, Nov 11, 2010 at 07:56:46PM -0700, Alex Williamson wrote:
> > Some drivers depend on finding capabilities like power management,
> > PCI express/X, vital product data, or vendor specific fields.  Now
> > that we have better capability support, we can pass more of these
> > tables through to the guest.  Note that VPD and VNDR are direct pass
> > through capabilies, the rest are mostly empty shells with a few
> > writable bits where necessary.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> >  hw/device-assignment.c |  160 +++++++++++++++++++++++++++++++++++++++++++++---
> >  1 files changed, 149 insertions(+), 11 deletions(-)
> > 
> > diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> > index 179c7dc..1b228ad 100644
> > --- a/hw/device-assignment.c
> > +++ b/hw/device-assignment.c
> > @@ -366,6 +366,27 @@ static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
> >      return (uint8_t)assigned_dev_pci_read(d, pos, 1);
> >  }
> >  
> > +static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
> > +{
> > +    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
> > +    ssize_t ret;
> > +    int fd = pci_dev->real_device.config_fd;
> > +
> > +again:
> > +    ret = pwrite(fd, &val, len, pos);
> > +    if (ret != len) {
> > +	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> > +	    goto again;
> 
> 
> do {} while() ?

Sure, this is just a copy of another place that does something similar.
They should either be merged or both converted in a separate patch.

> > +
> > +	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
> > +		__func__, ret, errno);
> > +
> > +	exit(1);
> > +    }
> > +
> > +    return;
> > +}
> > +
> >  static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap)
> >  {
> >      int id;
> > @@ -1244,37 +1265,75 @@ static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
> >  #endif
> >  #endif
> >  
> > +static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
> > +                                                    uint8_t cap_id,
> > +                                                    uint32_t address, int len)
> > +{
> > +    uint8_t cap;
> > +
> > +    switch (cap_id) {
> > +
> > +    case PCI_CAP_ID_VPD:
> > +        cap = pci_find_capability(pci_dev, cap_id);
> > +        if (address - cap >= PCI_CAP_FLAGS) {
> > +            return assigned_dev_pci_read(pci_dev, address, len);
> > +        }
> > +        break;
> > +
> > +    case PCI_CAP_ID_VNDR:
> > +        cap = pci_find_capability(pci_dev, cap_id);
> > +        if (address - cap > PCI_CAP_FLAGS) {
> > +            return assigned_dev_pci_read(pci_dev, address, len);
> > +        }
> > +        break;
> > +    }
> > +
> > +    return pci_default_cap_read_config(pci_dev, cap_id, address, len);
> > +}
> > +
> >  static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
> >                                                   uint8_t cap_id,
> >                                                   uint32_t address,
> >                                                   uint32_t val, int len)
> >  {
> > +    uint8_t cap;
> > +
> >      pci_default_cap_write_config(pci_dev, cap_id, address, val, len);
> >  
> >      switch (cap_id) {
> >  #ifdef KVM_CAP_IRQ_ROUTING
> >      case PCI_CAP_ID_MSI:
> >  #ifdef KVM_CAP_DEVICE_MSI
> > -        {
> > -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> > -            if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> > -                assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> > -            }
> > +        cap = pci_find_capability(pci_dev, cap_id);
> > +        if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> > +            assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> >          }
> >  #endif
> >          break;
> >  
> >      case PCI_CAP_ID_MSIX:
> >  #ifdef KVM_CAP_DEVICE_MSIX
> > -        {
> > -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> > -            if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> > -                assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> > -            }
> > +        cap = pci_find_capability(pci_dev, cap_id);
> > +        if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> > +            assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> >          }
> >  #endif
> >          break;
> >  #endif
> > +
> > +    case PCI_CAP_ID_VPD:
> > +        cap = pci_find_capability(pci_dev, cap_id);
> > +        if (address - cap >= PCI_CAP_FLAGS) {
> > +            assigned_dev_pci_write(pci_dev, address, val, len);
> > +        }
> > +        break;
> > +
> > +    case PCI_CAP_ID_VNDR:
> > +        cap = pci_find_capability(pci_dev, cap_id);
> > +        if (address - cap > PCI_CAP_FLAGS) {
> > +            assigned_dev_pci_write(pci_dev, address, val, len);
> > +        }
> > +        break;
> 
> I have a feeling we should use overlap functions instead of
> address math. What do you think?

if (!ranges_overlap(address - cap, len, 0, PCI_CAP_FLAGS)) ?

Sure, that'd be a nice cleanup.

> Also - put cap offsets in assigned device structure to avoid
> find calls?

I suppose there aren't enough capability IDs that it'd take much space
to do so, but it doesn't sound like a unique to device assignment issue.
Maybe that should live on PCIDevice with an access function.

> >      }
> >      return;
> >  }
> > @@ -1340,6 +1399,84 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
> >  #endif
> >  #endif
> >  
> > +    /* Minimal PM support, make the state bits writable so the guest
> > +     * thinks it's doing something. */
> > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM))) {
> > +        uint16_t pmc, pmcsr;
> > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PM, pos,
> > +                                     PCI_PM_SIZEOF);
> > +
> > +        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
> > +        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
> > +        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
> > +
> > +        pmcsr = pci_get_word(pci_dev->config + pos + PCI_PM_CTRL);
> > +        pmcsr &= (PCI_PM_CTRL_STATE_MASK);
> > +        pmcsr |= PCI_PM_CTRL_NO_SOFT_RST;
> > +        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL, pmcsr);
> > +
> > +        pci_set_word(pci_dev->wmask + pos + PCI_PM_CTRL,
> > +                     PCI_PM_CTRL_STATE_MASK);
> > +    }
> 
> we don't pass anything to device. So - can this be put in pci_pm.c
> so that emulated devices can use this too?

That's part of why this one is an RFC, should we allow the guest to do
some level of power management to the physical device?  Xen seems to
allow D-state manipulation by the guest.

> > +
> > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP))) {
> > +        uint16_t devctl, lnkcap, lnksta;
> > +
> > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_EXP, pos, 0x40);
> > +
> > +        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
> > +        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
> > +                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
> > +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
> > +        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
> > +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
> > +
> > +        lnkcap = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKCAP);
> > +        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
> > +                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
> > +                   PCI_EXP_LNKCAP_L1EL);
> > +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
> > +
> > +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_LNKCAP,
> > +                     PCI_EXP_LNKCTL_ASPMC | PCI_EXP_LNKCTL_RCB |
> > +                     PCI_EXP_LNKCTL_CCC | PCI_EXP_LNKCTL_ES |
> > +                     PCI_EXP_LNKCTL_CLKREQ_EN | PCI_EXP_LNKCTL_HAWD);
> > +        
> > +        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
> > +        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
> > +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
> > +
> > +        pci_set_word(pci_dev->wmask + pos + 0x28, 0x1f);
> > +        pci_set_word(pci_dev->wmask + pos + 0x30, 0xfbf);
> 
> This seems to overlap functionally with the express work upstream.
> Can code from there be reused?  I also wonder whether is affects the
> guest OS if it finds an express device on a non-express bridge.

Yes, perhaps it can be merged.  I'd like to start with figuring out what
we need for device assignment, and merging where it makes sense later
than stall out trying to solve the whole problem upfront.

It could certainly be confusing for a driver to find an express device
under a PIIX chipset, but I think typically the driver is just looking
for link speed info for pretty printks.

> > +    }
> > +
> > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD))) {
> > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VPD, pos, 0x8);
> > +    }
> > +
> > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR))) {
> > +        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
> > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VNDR, pos, len);
> > +    }
> > +
> > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX))) {
> > +        uint16_t cmd;
> > +        uint32_t status;
> > +
> > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PCIX, pos, 0x8);
> > +
> > +        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
> > +        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
> > +                PCI_X_CMD_MAX_SPLIT);
> > +        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
> > +
> > +        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
> > +        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
> > +        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
> > +        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
> > +                    PCI_X_STATUS_SPL_ERR);
> > +        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
> > +    }
> 
> This will be handy for non-assignment case so
> I'd like to see this moved out of device-assignment.c:
> we could create pcix.c or just add to pci.c.

Yeah, I'm not sure there's enough we can poke at in a PCIX capability to
warrant it's own file.  Here for the same reason as the others, what do
we want to expose, what's emulated, what's poke-able.  We also have the
benefit here that we get default from hardware.  If it stays this small,
I'd just assume leave it here than try to generalize an interface when
we're the only user.

> >      return 0;
> >  }
> >  
> > @@ -1466,7 +1603,8 @@ static int assigned_initfn(struct PCIDevice *pci_dev)
> >      dev->h_busnr = dev->host.bus;
> >      dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func);
> >  
> > -    pci_register_capability_handlers(pci_dev, NULL,
> > +    pci_register_capability_handlers(pci_dev,
> > +                                     assigned_device_pci_cap_read_config,
> >                                       assigned_device_pci_cap_write_config);
> 
> Maybe these could go away?

Sure, pass a capability ID to the read/write_config functions and I'd
support this going way.  I don't think that necessarily needs to be tied
to this series though.  Thanks,

Alex

> >  
> >      if (assigned_device_pci_cap_init(pci_dev) < 0)
Michael S. Tsirkin - Nov. 12, 2010, 9:11 a.m.
On Thu, Nov 11, 2010 at 11:30:07PM -0700, Alex Williamson wrote:
> On Fri, 2010-11-12 at 07:36 +0200, Michael S. Tsirkin wrote:
> > On Thu, Nov 11, 2010 at 07:56:46PM -0700, Alex Williamson wrote:
> > > Some drivers depend on finding capabilities like power management,
> > > PCI express/X, vital product data, or vendor specific fields.  Now
> > > that we have better capability support, we can pass more of these
> > > tables through to the guest.  Note that VPD and VNDR are direct pass
> > > through capabilies, the rest are mostly empty shells with a few
> > > writable bits where necessary.
> > > 
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > ---
> > > 
> > >  hw/device-assignment.c |  160 +++++++++++++++++++++++++++++++++++++++++++++---
> > >  1 files changed, 149 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> > > index 179c7dc..1b228ad 100644
> > > --- a/hw/device-assignment.c
> > > +++ b/hw/device-assignment.c
> > > @@ -366,6 +366,27 @@ static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
> > >      return (uint8_t)assigned_dev_pci_read(d, pos, 1);
> > >  }
> > >  
> > > +static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
> > > +{
> > > +    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
> > > +    ssize_t ret;
> > > +    int fd = pci_dev->real_device.config_fd;
> > > +
> > > +again:
> > > +    ret = pwrite(fd, &val, len, pos);
> > > +    if (ret != len) {
> > > +	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> > > +	    goto again;
> > 
> > 
> > do {} while() ?
> 
> Sure, this is just a copy of another place that does something similar.
> They should either be merged or both converted in a separate patch.
> 
> > > +
> > > +	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
> > > +		__func__, ret, errno);
> > > +
> > > +	exit(1);
> > > +    }
> > > +
> > > +    return;
> > > +}
> > > +
> > >  static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap)
> > >  {
> > >      int id;
> > > @@ -1244,37 +1265,75 @@ static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
> > >  #endif
> > >  #endif
> > >  
> > > +static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
> > > +                                                    uint8_t cap_id,
> > > +                                                    uint32_t address, int len)
> > > +{
> > > +    uint8_t cap;
> > > +
> > > +    switch (cap_id) {
> > > +
> > > +    case PCI_CAP_ID_VPD:
> > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > +        if (address - cap >= PCI_CAP_FLAGS) {
> > > +            return assigned_dev_pci_read(pci_dev, address, len);
> > > +        }
> > > +        break;
> > > +
> > > +    case PCI_CAP_ID_VNDR:
> > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > +        if (address - cap > PCI_CAP_FLAGS) {
> > > +            return assigned_dev_pci_read(pci_dev, address, len);
> > > +        }
> > > +        break;
> > > +    }
> > > +
> > > +    return pci_default_cap_read_config(pci_dev, cap_id, address, len);
> > > +}
> > > +
> > >  static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
> > >                                                   uint8_t cap_id,
> > >                                                   uint32_t address,
> > >                                                   uint32_t val, int len)
> > >  {
> > > +    uint8_t cap;
> > > +
> > >      pci_default_cap_write_config(pci_dev, cap_id, address, val, len);
> > >  
> > >      switch (cap_id) {
> > >  #ifdef KVM_CAP_IRQ_ROUTING
> > >      case PCI_CAP_ID_MSI:
> > >  #ifdef KVM_CAP_DEVICE_MSI
> > > -        {
> > > -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> > > -            if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> > > -                assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> > > -            }
> > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > +        if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> > > +            assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> > >          }
> > >  #endif
> > >          break;
> > >  
> > >      case PCI_CAP_ID_MSIX:
> > >  #ifdef KVM_CAP_DEVICE_MSIX
> > > -        {
> > > -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> > > -            if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> > > -                assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> > > -            }
> > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > +        if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> > > +            assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> > >          }
> > >  #endif
> > >          break;
> > >  #endif
> > > +
> > > +    case PCI_CAP_ID_VPD:
> > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > +        if (address - cap >= PCI_CAP_FLAGS) {
> > > +            assigned_dev_pci_write(pci_dev, address, val, len);
> > > +        }
> > > +        break;
> > > +
> > > +    case PCI_CAP_ID_VNDR:
> > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > +        if (address - cap > PCI_CAP_FLAGS) {
> > > +            assigned_dev_pci_write(pci_dev, address, val, len);
> > > +        }
> > > +        break;
> > 
> > I have a feeling we should use overlap functions instead of
> > address math. What do you think?
> 
> if (!ranges_overlap(address - cap, len, 0, PCI_CAP_FLAGS)) ?

	ranges_overlap(address, len, cap, PCI_CAP_FLAGS)

> Sure, that'd be a nice cleanup.
> 
> > Also - put cap offsets in assigned device structure to avoid
> > find calls?
> 
> I suppose there aren't enough capability IDs that it'd take much space
> to do so, but it doesn't sound like a unique to device assignment issue.
> Maybe that should live on PCIDevice with an access function.

Sure, I put all caps that we actually emulate in PCIDevice.
So that would apply to express, pcix, etc.
Sticking offsets to caps that core doesn't emulate in PCIDevice
seems a bit strange. That's why each device has its own device state.

> > >      }
> > >      return;
> > >  }
> > > @@ -1340,6 +1399,84 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
> > >  #endif
> > >  #endif
> > >  
> > > +    /* Minimal PM support, make the state bits writable so the guest
> > > +     * thinks it's doing something. */
> > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM))) {
> > > +        uint16_t pmc, pmcsr;
> > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PM, pos,
> > > +                                     PCI_PM_SIZEOF);
> > > +
> > > +        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
> > > +        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
> > > +        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
> > > +
> > > +        pmcsr = pci_get_word(pci_dev->config + pos + PCI_PM_CTRL);
> > > +        pmcsr &= (PCI_PM_CTRL_STATE_MASK);
> > > +        pmcsr |= PCI_PM_CTRL_NO_SOFT_RST;
> > > +        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL, pmcsr);
> > > +
> > > +        pci_set_word(pci_dev->wmask + pos + PCI_PM_CTRL,
> > > +                     PCI_PM_CTRL_STATE_MASK);
> > > +    }
> > 
> > we don't pass anything to device. So - can this be put in pci_pm.c
> > so that emulated devices can use this too?
> 
> That's part of why this one is an RFC, should we allow the guest to do
> some level of power management to the physical device?  Xen seems to
> allow D-state manipulation by the guest.

Hmm, ok. I still hope we can do the emulated one and then
assigned device would call the relevant function and pass
info to the backend.

> > > +
> > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP))) {
> > > +        uint16_t devctl, lnkcap, lnksta;
> > > +
> > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_EXP, pos, 0x40);
> > > +
> > > +        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
> > > +        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
> > > +                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
> > > +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
> > > +        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
> > > +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
> > > +
> > > +        lnkcap = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKCAP);
> > > +        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
> > > +                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
> > > +                   PCI_EXP_LNKCAP_L1EL);
> > > +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
> > > +
> > > +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_LNKCAP,
> > > +                     PCI_EXP_LNKCTL_ASPMC | PCI_EXP_LNKCTL_RCB |
> > > +                     PCI_EXP_LNKCTL_CCC | PCI_EXP_LNKCTL_ES |
> > > +                     PCI_EXP_LNKCTL_CLKREQ_EN | PCI_EXP_LNKCTL_HAWD);
> > > +        
> > > +        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
> > > +        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
> > > +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
> > > +
> > > +        pci_set_word(pci_dev->wmask + pos + 0x28, 0x1f);
> > > +        pci_set_word(pci_dev->wmask + pos + 0x30, 0xfbf);
> > 
> > This seems to overlap functionally with the express work upstream.
> > Can code from there be reused?  I also wonder whether is affects the
> > guest OS if it finds an express device on a non-express bridge.
> 
> Yes, perhaps it can be merged.  I'd like to start with figuring out what
> we need for device assignment, and merging where it makes sense later
> than stall out trying to solve the whole problem upfront.
> 
> It could certainly be confusing for a driver to find an express device
> under a PIIX chipset, but I think typically the driver is just looking
> for link speed info for pretty printks.
> > > +    }
> > > +
> > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD))) {
> > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VPD, pos, 0x8);
> > > +    }
> > > +
> > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR))) {
> > > +        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
> > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VNDR, pos, len);
> > > +    }
> > > +
> > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX))) {
> > > +        uint16_t cmd;
> > > +        uint32_t status;
> > > +
> > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PCIX, pos, 0x8);
> > > +
> > > +        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
> > > +        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
> > > +                PCI_X_CMD_MAX_SPLIT);
> > > +        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
> > > +
> > > +        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
> > > +        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
> > > +        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
> > > +        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
> > > +                    PCI_X_STATUS_SPL_ERR);
> > > +        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
> > > +    }
> > 
> > This will be handy for non-assignment case so
> > I'd like to see this moved out of device-assignment.c:
> > we could create pcix.c or just add to pci.c.
> 
> Yeah, I'm not sure there's enough we can poke at in a PCIX capability to
> warrant it's own file.  Here for the same reason as the others, what do
> we want to expose, what's emulated, what's poke-able.  We also have the
> benefit here that we get default from hardware.  If it stays this small,
> I'd just assume leave it here than try to generalize an interface when
> we're the only user.

Do you have a pcix device btw? Do we even care?

> > >      return 0;
> > >  }
> > >  
> > > @@ -1466,7 +1603,8 @@ static int assigned_initfn(struct PCIDevice *pci_dev)
> > >      dev->h_busnr = dev->host.bus;
> > >      dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func);
> > >  
> > > -    pci_register_capability_handlers(pci_dev, NULL,
> > > +    pci_register_capability_handlers(pci_dev,
> > > +                                     assigned_device_pci_cap_read_config,
> > >                                       assigned_device_pci_cap_write_config);
> > 
> > Maybe these could go away?
> 
> Sure, pass a capability ID to the read/write_config functions and I'd
> support this going way.  I don't think that necessarily needs to be tied
> to this series though.  Thanks,
> 
> Alex

As in, can be a separate cleanup later. Yes.

> > >  
> > >      if (assigned_device_pci_cap_init(pci_dev) < 0)
> 
>
Alex Williamson - Nov. 12, 2010, 3:42 p.m.
On Fri, 2010-11-12 at 11:11 +0200, Michael S. Tsirkin wrote:
> On Thu, Nov 11, 2010 at 11:30:07PM -0700, Alex Williamson wrote:
> > On Fri, 2010-11-12 at 07:36 +0200, Michael S. Tsirkin wrote:
> > > On Thu, Nov 11, 2010 at 07:56:46PM -0700, Alex Williamson wrote:
> > > > Some drivers depend on finding capabilities like power management,
> > > > PCI express/X, vital product data, or vendor specific fields.  Now
> > > > that we have better capability support, we can pass more of these
> > > > tables through to the guest.  Note that VPD and VNDR are direct pass
> > > > through capabilies, the rest are mostly empty shells with a few
> > > > writable bits where necessary.
> > > > 
> > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > ---
> > > > 
> > > >  hw/device-assignment.c |  160 +++++++++++++++++++++++++++++++++++++++++++++---
> > > >  1 files changed, 149 insertions(+), 11 deletions(-)
> > > > 
> > > > diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> > > > index 179c7dc..1b228ad 100644
> > > > --- a/hw/device-assignment.c
> > > > +++ b/hw/device-assignment.c
> > > > @@ -366,6 +366,27 @@ static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
> > > >      return (uint8_t)assigned_dev_pci_read(d, pos, 1);
> > > >  }
> > > >  
> > > > +static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
> > > > +{
> > > > +    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
> > > > +    ssize_t ret;
> > > > +    int fd = pci_dev->real_device.config_fd;
> > > > +
> > > > +again:
> > > > +    ret = pwrite(fd, &val, len, pos);
> > > > +    if (ret != len) {
> > > > +	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> > > > +	    goto again;
> > > 
> > > 
> > > do {} while() ?
> > 
> > Sure, this is just a copy of another place that does something similar.
> > They should either be merged or both converted in a separate patch.
> > 
> > > > +
> > > > +	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
> > > > +		__func__, ret, errno);
> > > > +
> > > > +	exit(1);
> > > > +    }
> > > > +
> > > > +    return;
> > > > +}
> > > > +
> > > >  static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap)
> > > >  {
> > > >      int id;
> > > > @@ -1244,37 +1265,75 @@ static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
> > > >  #endif
> > > >  #endif
> > > >  
> > > > +static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
> > > > +                                                    uint8_t cap_id,
> > > > +                                                    uint32_t address, int len)
> > > > +{
> > > > +    uint8_t cap;
> > > > +
> > > > +    switch (cap_id) {
> > > > +
> > > > +    case PCI_CAP_ID_VPD:
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (address - cap >= PCI_CAP_FLAGS) {
> > > > +            return assigned_dev_pci_read(pci_dev, address, len);
> > > > +        }
> > > > +        break;
> > > > +
> > > > +    case PCI_CAP_ID_VNDR:
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (address - cap > PCI_CAP_FLAGS) {
> > > > +            return assigned_dev_pci_read(pci_dev, address, len);
> > > > +        }
> > > > +        break;
> > > > +    }
> > > > +
> > > > +    return pci_default_cap_read_config(pci_dev, cap_id, address, len);
> > > > +}
> > > > +
> > > >  static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
> > > >                                                   uint8_t cap_id,
> > > >                                                   uint32_t address,
> > > >                                                   uint32_t val, int len)
> > > >  {
> > > > +    uint8_t cap;
> > > > +
> > > >      pci_default_cap_write_config(pci_dev, cap_id, address, val, len);
> > > >  
> > > >      switch (cap_id) {
> > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > >      case PCI_CAP_ID_MSI:
> > > >  #ifdef KVM_CAP_DEVICE_MSI
> > > > -        {
> > > > -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> > > > -            if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> > > > -                assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> > > > -            }
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> > > > +            assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> > > >          }
> > > >  #endif
> > > >          break;
> > > >  
> > > >      case PCI_CAP_ID_MSIX:
> > > >  #ifdef KVM_CAP_DEVICE_MSIX
> > > > -        {
> > > > -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> > > > -            if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> > > > -                assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> > > > -            }
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> > > > +            assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> > > >          }
> > > >  #endif
> > > >          break;
> > > >  #endif
> > > > +
> > > > +    case PCI_CAP_ID_VPD:
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (address - cap >= PCI_CAP_FLAGS) {
> > > > +            assigned_dev_pci_write(pci_dev, address, val, len);
> > > > +        }
> > > > +        break;
> > > > +
> > > > +    case PCI_CAP_ID_VNDR:
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (address - cap > PCI_CAP_FLAGS) {
> > > > +            assigned_dev_pci_write(pci_dev, address, val, len);
> > > > +        }
> > > > +        break;
> > > 
> > > I have a feeling we should use overlap functions instead of
> > > address math. What do you think?
> > 
> > if (!ranges_overlap(address - cap, len, 0, PCI_CAP_FLAGS)) ?
> 
> 	ranges_overlap(address, len, cap, PCI_CAP_FLAGS)
> 
> > Sure, that'd be a nice cleanup.
> > 
> > > Also - put cap offsets in assigned device structure to avoid
> > > find calls?
> > 
> > I suppose there aren't enough capability IDs that it'd take much space
> > to do so, but it doesn't sound like a unique to device assignment issue.
> > Maybe that should live on PCIDevice with an access function.
> 
> Sure, I put all caps that we actually emulate in PCIDevice.
> So that would apply to express, pcix, etc.
> Sticking offsets to caps that core doesn't emulate in PCIDevice
> seems a bit strange. That's why each device has its own device state.

The counter argument is that instead of sprinkling cap_msi, cap_msix,
cap_pcie, cap_foo into PCIDevice as support gets added, it would add a
lot of consistency to have a uint8_t caps[PCI_CAP_ID_MAX], then
pci_find_capability simply becomes return pdev->caps[cap_id], and we can
make more use of it.

> > > >      }
> > > >      return;
> > > >  }
> > > > @@ -1340,6 +1399,84 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
> > > >  #endif
> > > >  #endif
> > > >  
> > > > +    /* Minimal PM support, make the state bits writable so the guest
> > > > +     * thinks it's doing something. */
> > > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM))) {
> > > > +        uint16_t pmc, pmcsr;
> > > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PM, pos,
> > > > +                                     PCI_PM_SIZEOF);
> > > > +
> > > > +        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
> > > > +        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
> > > > +        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
> > > > +
> > > > +        pmcsr = pci_get_word(pci_dev->config + pos + PCI_PM_CTRL);
> > > > +        pmcsr &= (PCI_PM_CTRL_STATE_MASK);
> > > > +        pmcsr |= PCI_PM_CTRL_NO_SOFT_RST;
> > > > +        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL, pmcsr);
> > > > +
> > > > +        pci_set_word(pci_dev->wmask + pos + PCI_PM_CTRL,
> > > > +                     PCI_PM_CTRL_STATE_MASK);
> > > > +    }
> > > 
> > > we don't pass anything to device. So - can this be put in pci_pm.c
> > > so that emulated devices can use this too?
> > 
> > That's part of why this one is an RFC, should we allow the guest to do
> > some level of power management to the physical device?  Xen seems to
> > allow D-state manipulation by the guest.
> 
> Hmm, ok. I still hope we can do the emulated one and then
> assigned device would call the relevant function and pass
> info to the backend.

I hope so too, but I don't want generalizing an interface to gate
supporting the capability for assigned devices.

> > > > +
> > > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP))) {
> > > > +        uint16_t devctl, lnkcap, lnksta;
> > > > +
> > > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_EXP, pos, 0x40);
> > > > +
> > > > +        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
> > > > +        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
> > > > +                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
> > > > +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
> > > > +        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
> > > > +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
> > > > +
> > > > +        lnkcap = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKCAP);
> > > > +        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
> > > > +                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
> > > > +                   PCI_EXP_LNKCAP_L1EL);
> > > > +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
> > > > +
> > > > +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_LNKCAP,
> > > > +                     PCI_EXP_LNKCTL_ASPMC | PCI_EXP_LNKCTL_RCB |
> > > > +                     PCI_EXP_LNKCTL_CCC | PCI_EXP_LNKCTL_ES |
> > > > +                     PCI_EXP_LNKCTL_CLKREQ_EN | PCI_EXP_LNKCTL_HAWD);
> > > > +        
> > > > +        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
> > > > +        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
> > > > +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
> > > > +
> > > > +        pci_set_word(pci_dev->wmask + pos + 0x28, 0x1f);
> > > > +        pci_set_word(pci_dev->wmask + pos + 0x30, 0xfbf);
> > > 
> > > This seems to overlap functionally with the express work upstream.
> > > Can code from there be reused?  I also wonder whether is affects the
> > > guest OS if it finds an express device on a non-express bridge.
> > 
> > Yes, perhaps it can be merged.  I'd like to start with figuring out what
> > we need for device assignment, and merging where it makes sense later
> > than stall out trying to solve the whole problem upfront.
> > 
> > It could certainly be confusing for a driver to find an express device
> > under a PIIX chipset, but I think typically the driver is just looking
> > for link speed info for pretty printks.
> > > > +    }
> > > > +
> > > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD))) {
> > > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VPD, pos, 0x8);
> > > > +    }
> > > > +
> > > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR))) {
> > > > +        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
> > > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VNDR, pos, len);
> > > > +    }
> > > > +
> > > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX))) {
> > > > +        uint16_t cmd;
> > > > +        uint32_t status;
> > > > +
> > > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PCIX, pos, 0x8);
> > > > +
> > > > +        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
> > > > +        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
> > > > +                PCI_X_CMD_MAX_SPLIT);
> > > > +        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
> > > > +
> > > > +        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
> > > > +        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
> > > > +        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
> > > > +        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
> > > > +                    PCI_X_STATUS_SPL_ERR);
> > > > +        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
> > > > +    }
> > > 
> > > This will be handy for non-assignment case so
> > > I'd like to see this moved out of device-assignment.c:
> > > we could create pcix.c or just add to pci.c.
> > 
> > Yeah, I'm not sure there's enough we can poke at in a PCIX capability to
> > warrant it's own file.  Here for the same reason as the others, what do
> > we want to expose, what's emulated, what's poke-able.  We also have the
> > benefit here that we get default from hardware.  If it stays this small,
> > I'd just assume leave it here than try to generalize an interface when
> > we're the only user.
> 
> Do you have a pcix device btw? Do we even care?

I don't have one, but I know of VT-d capable systems that support PCI-e
to PCI-X bridges in their topology.

Alex

> > > >      return 0;
> > > >  }
> > > >  
> > > > @@ -1466,7 +1603,8 @@ static int assigned_initfn(struct PCIDevice *pci_dev)
> > > >      dev->h_busnr = dev->host.bus;
> > > >      dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func);
> > > >  
> > > > -    pci_register_capability_handlers(pci_dev, NULL,
> > > > +    pci_register_capability_handlers(pci_dev,
> > > > +                                     assigned_device_pci_cap_read_config,
> > > >                                       assigned_device_pci_cap_write_config);
> > > 
> > > Maybe these could go away?
> > 
> > Sure, pass a capability ID to the read/write_config functions and I'd
> > support this going way.  I don't think that necessarily needs to be tied
> > to this series though.  Thanks,
> > 
> > Alex
> 
> As in, can be a separate cleanup later. Yes.
> 
> > > >  
> > > >      if (assigned_device_pci_cap_init(pci_dev) < 0)
> > 
> >
Michael S. Tsirkin - Nov. 16, 2010, 4:08 p.m.
On Fri, Nov 12, 2010 at 08:42:38AM -0700, Alex Williamson wrote:
> On Fri, 2010-11-12 at 11:11 +0200, Michael S. Tsirkin wrote:
> > On Thu, Nov 11, 2010 at 11:30:07PM -0700, Alex Williamson wrote:
> > > On Fri, 2010-11-12 at 07:36 +0200, Michael S. Tsirkin wrote:
> > > > On Thu, Nov 11, 2010 at 07:56:46PM -0700, Alex Williamson wrote:
> > > > > Some drivers depend on finding capabilities like power management,
> > > > > PCI express/X, vital product data, or vendor specific fields.  Now
> > > > > that we have better capability support, we can pass more of these
> > > > > tables through to the guest.  Note that VPD and VNDR are direct pass
> > > > > through capabilies, the rest are mostly empty shells with a few
> > > > > writable bits where necessary.
> > > > > 
> > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > ---
> > > > > 
> > > > >  hw/device-assignment.c |  160 +++++++++++++++++++++++++++++++++++++++++++++---
> > > > >  1 files changed, 149 insertions(+), 11 deletions(-)
> > > > > 
> > > > > diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> > > > > index 179c7dc..1b228ad 100644
> > > > > --- a/hw/device-assignment.c
> > > > > +++ b/hw/device-assignment.c
> > > > > @@ -366,6 +366,27 @@ static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
> > > > >      return (uint8_t)assigned_dev_pci_read(d, pos, 1);
> > > > >  }
> > > > >  
> > > > > +static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
> > > > > +{
> > > > > +    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
> > > > > +    ssize_t ret;
> > > > > +    int fd = pci_dev->real_device.config_fd;
> > > > > +
> > > > > +again:
> > > > > +    ret = pwrite(fd, &val, len, pos);
> > > > > +    if (ret != len) {
> > > > > +	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> > > > > +	    goto again;
> > > > 
> > > > 
> > > > do {} while() ?
> > > 
> > > Sure, this is just a copy of another place that does something similar.
> > > They should either be merged or both converted in a separate patch.
> > > 
> > > > > +
> > > > > +	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
> > > > > +		__func__, ret, errno);
> > > > > +
> > > > > +	exit(1);
> > > > > +    }
> > > > > +
> > > > > +    return;
> > > > > +}
> > > > > +
> > > > >  static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap)
> > > > >  {
> > > > >      int id;
> > > > > @@ -1244,37 +1265,75 @@ static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
> > > > >  #endif
> > > > >  #endif
> > > > >  
> > > > > +static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
> > > > > +                                                    uint8_t cap_id,
> > > > > +                                                    uint32_t address, int len)
> > > > > +{
> > > > > +    uint8_t cap;
> > > > > +
> > > > > +    switch (cap_id) {
> > > > > +
> > > > > +    case PCI_CAP_ID_VPD:
> > > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > > +        if (address - cap >= PCI_CAP_FLAGS) {
> > > > > +            return assigned_dev_pci_read(pci_dev, address, len);
> > > > > +        }
> > > > > +        break;
> > > > > +
> > > > > +    case PCI_CAP_ID_VNDR:
> > > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > > +        if (address - cap > PCI_CAP_FLAGS) {
> > > > > +            return assigned_dev_pci_read(pci_dev, address, len);
> > > > > +        }
> > > > > +        break;
> > > > > +    }
> > > > > +
> > > > > +    return pci_default_cap_read_config(pci_dev, cap_id, address, len);
> > > > > +}
> > > > > +
> > > > >  static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
> > > > >                                                   uint8_t cap_id,
> > > > >                                                   uint32_t address,
> > > > >                                                   uint32_t val, int len)
> > > > >  {
> > > > > +    uint8_t cap;
> > > > > +
> > > > >      pci_default_cap_write_config(pci_dev, cap_id, address, val, len);
> > > > >  
> > > > >      switch (cap_id) {
> > > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > > >      case PCI_CAP_ID_MSI:
> > > > >  #ifdef KVM_CAP_DEVICE_MSI
> > > > > -        {
> > > > > -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> > > > > -            if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> > > > > -                assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> > > > > -            }
> > > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > > +        if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> > > > > +            assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> > > > >          }
> > > > >  #endif
> > > > >          break;
> > > > >  
> > > > >      case PCI_CAP_ID_MSIX:
> > > > >  #ifdef KVM_CAP_DEVICE_MSIX
> > > > > -        {
> > > > > -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> > > > > -            if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> > > > > -                assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> > > > > -            }
> > > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > > +        if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> > > > > +            assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> > > > >          }
> > > > >  #endif
> > > > >          break;
> > > > >  #endif
> > > > > +
> > > > > +    case PCI_CAP_ID_VPD:
> > > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > > +        if (address - cap >= PCI_CAP_FLAGS) {
> > > > > +            assigned_dev_pci_write(pci_dev, address, val, len);
> > > > > +        }
> > > > > +        break;
> > > > > +
> > > > > +    case PCI_CAP_ID_VNDR:
> > > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > > +        if (address - cap > PCI_CAP_FLAGS) {
> > > > > +            assigned_dev_pci_write(pci_dev, address, val, len);
> > > > > +        }
> > > > > +        break;
> > > > 
> > > > I have a feeling we should use overlap functions instead of
> > > > address math. What do you think?
> > > 
> > > if (!ranges_overlap(address - cap, len, 0, PCI_CAP_FLAGS)) ?
> > 
> > 	ranges_overlap(address, len, cap, PCI_CAP_FLAGS)
> > 
> > > Sure, that'd be a nice cleanup.
> > > 
> > > > Also - put cap offsets in assigned device structure to avoid
> > > > find calls?
> > > 
> > > I suppose there aren't enough capability IDs that it'd take much space
> > > to do so, but it doesn't sound like a unique to device assignment issue.
> > > Maybe that should live on PCIDevice with an access function.
> > 
> > Sure, I put all caps that we actually emulate in PCIDevice.
> > So that would apply to express, pcix, etc.
> > Sticking offsets to caps that core doesn't emulate in PCIDevice
> > seems a bit strange. That's why each device has its own device state.
> 
> The counter argument is that instead of sprinkling cap_msi, cap_msix,
> cap_pcie, cap_foo into PCIDevice as support gets added, it would add a
> lot of consistency to have a uint8_t caps[PCI_CAP_ID_MAX], then
> pci_find_capability simply becomes return pdev->caps[cap_id], and we can
> make more use of it.

Consider that express has 16 bit IDs too. That might make it a problem
if we try to use them as indexes.

Patch

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 179c7dc..1b228ad 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -366,6 +366,27 @@  static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
     return (uint8_t)assigned_dev_pci_read(d, pos, 1);
 }
 
+static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
+{
+    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
+    ssize_t ret;
+    int fd = pci_dev->real_device.config_fd;
+
+again:
+    ret = pwrite(fd, &val, len, pos);
+    if (ret != len) {
+	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+	    goto again;
+
+	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
+		__func__, ret, errno);
+
+	exit(1);
+    }
+
+    return;
+}
+
 static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap)
 {
     int id;
@@ -1244,37 +1265,75 @@  static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
 #endif
 #endif
 
+static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
+                                                    uint8_t cap_id,
+                                                    uint32_t address, int len)
+{
+    uint8_t cap;
+
+    switch (cap_id) {
+
+    case PCI_CAP_ID_VPD:
+        cap = pci_find_capability(pci_dev, cap_id);
+        if (address - cap >= PCI_CAP_FLAGS) {
+            return assigned_dev_pci_read(pci_dev, address, len);
+        }
+        break;
+
+    case PCI_CAP_ID_VNDR:
+        cap = pci_find_capability(pci_dev, cap_id);
+        if (address - cap > PCI_CAP_FLAGS) {
+            return assigned_dev_pci_read(pci_dev, address, len);
+        }
+        break;
+    }
+
+    return pci_default_cap_read_config(pci_dev, cap_id, address, len);
+}
+
 static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
                                                  uint8_t cap_id,
                                                  uint32_t address,
                                                  uint32_t val, int len)
 {
+    uint8_t cap;
+
     pci_default_cap_write_config(pci_dev, cap_id, address, val, len);
 
     switch (cap_id) {
 #ifdef KVM_CAP_IRQ_ROUTING
     case PCI_CAP_ID_MSI:
 #ifdef KVM_CAP_DEVICE_MSI
-        {
-            uint8_t cap = pci_find_capability(pci_dev, cap_id);
-            if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
-                assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
-            }
+        cap = pci_find_capability(pci_dev, cap_id);
+        if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
+            assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
         }
 #endif
         break;
 
     case PCI_CAP_ID_MSIX:
 #ifdef KVM_CAP_DEVICE_MSIX
-        {
-            uint8_t cap = pci_find_capability(pci_dev, cap_id);
-            if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
-                assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
-            }
+        cap = pci_find_capability(pci_dev, cap_id);
+        if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
+            assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
         }
 #endif
         break;
 #endif
+
+    case PCI_CAP_ID_VPD:
+        cap = pci_find_capability(pci_dev, cap_id);
+        if (address - cap >= PCI_CAP_FLAGS) {
+            assigned_dev_pci_write(pci_dev, address, val, len);
+        }
+        break;
+
+    case PCI_CAP_ID_VNDR:
+        cap = pci_find_capability(pci_dev, cap_id);
+        if (address - cap > PCI_CAP_FLAGS) {
+            assigned_dev_pci_write(pci_dev, address, val, len);
+        }
+        break;
     }
     return;
 }
@@ -1340,6 +1399,84 @@  static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
 #endif
 #endif
 
+    /* Minimal PM support, make the state bits writable so the guest
+     * thinks it's doing something. */
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM))) {
+        uint16_t pmc, pmcsr;
+        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PM, pos,
+                                     PCI_PM_SIZEOF);
+
+        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
+        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
+        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
+
+        pmcsr = pci_get_word(pci_dev->config + pos + PCI_PM_CTRL);
+        pmcsr &= (PCI_PM_CTRL_STATE_MASK);
+        pmcsr |= PCI_PM_CTRL_NO_SOFT_RST;
+        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL, pmcsr);
+
+        pci_set_word(pci_dev->wmask + pos + PCI_PM_CTRL,
+                     PCI_PM_CTRL_STATE_MASK);
+    }
+
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP))) {
+        uint16_t devctl, lnkcap, lnksta;
+
+        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_EXP, pos, 0x40);
+
+        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
+        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
+                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
+        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
+        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
+        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
+
+        lnkcap = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKCAP);
+        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
+                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
+                   PCI_EXP_LNKCAP_L1EL);
+        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
+
+        pci_set_word(pci_dev->wmask + pos + PCI_EXP_LNKCAP,
+                     PCI_EXP_LNKCTL_ASPMC | PCI_EXP_LNKCTL_RCB |
+                     PCI_EXP_LNKCTL_CCC | PCI_EXP_LNKCTL_ES |
+                     PCI_EXP_LNKCTL_CLKREQ_EN | PCI_EXP_LNKCTL_HAWD);
+        
+        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
+        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
+        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
+
+        pci_set_word(pci_dev->wmask + pos + 0x28, 0x1f);
+        pci_set_word(pci_dev->wmask + pos + 0x30, 0xfbf);
+    }
+
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD))) {
+        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VPD, pos, 0x8);
+    }
+
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR))) {
+        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
+        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VNDR, pos, len);
+    }
+
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX))) {
+        uint16_t cmd;
+        uint32_t status;
+
+        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PCIX, pos, 0x8);
+
+        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
+        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
+                PCI_X_CMD_MAX_SPLIT);
+        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
+
+        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
+        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
+        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
+        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
+                    PCI_X_STATUS_SPL_ERR);
+        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
+    }
     return 0;
 }
 
@@ -1466,7 +1603,8 @@  static int assigned_initfn(struct PCIDevice *pci_dev)
     dev->h_busnr = dev->host.bus;
     dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func);
 
-    pci_register_capability_handlers(pci_dev, NULL,
+    pci_register_capability_handlers(pci_dev,
+                                     assigned_device_pci_cap_read_config,
                                      assigned_device_pci_cap_write_config);
 
     if (assigned_device_pci_cap_init(pci_dev) < 0)