diff mbox

[2/4] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

Message ID 1409296629-9078-3-git-send-email-knut.omang@oracle.com
State New
Headers show

Commit Message

Knut Omang Aug. 29, 2014, 7:17 a.m. UTC
This patch provides the building blocks for creating an SR/IOV
PCIe Extended Capability header and creating and removing
SR/IOV Virtual Functions.

Signed-off-by: Knut Omang <knut.omang@oracle.com>
---
 hw/pci/pci.c          | 107 +++++++++++++++++++-------
 hw/pci/pcie.c         | 205 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/hw/pci/pci.h  |   6 +-
 include/hw/pci/pcie.h |  26 +++++++
 4 files changed, 311 insertions(+), 33 deletions(-)

Comments

Michael S. Tsirkin Sept. 1, 2014, 9:39 a.m. UTC | #1
On Fri, Aug 29, 2014 at 09:17:07AM +0200, Knut Omang wrote:
> This patch provides the building blocks for creating an SR/IOV
> PCIe Extended Capability header and creating and removing
> SR/IOV Virtual Functions.
> 
> Signed-off-by: Knut Omang <knut.omang@oracle.com>
> ---
>  hw/pci/pci.c          | 107 +++++++++++++++++++-------
>  hw/pci/pcie.c         | 205 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  include/hw/pci/pci.h  |   6 +-
>  include/hw/pci/pcie.h |  26 +++++++
>  4 files changed, 311 insertions(+), 33 deletions(-)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index daeaeac..071ab81 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -35,7 +35,6 @@
>  #include "hw/pci/msi.h"
>  #include "hw/pci/msix.h"
>  #include "exec/address-spaces.h"
> -#include "hw/hotplug.h"
>  
>  //#define DEBUG_PCI
>  #ifdef DEBUG_PCI
> @@ -126,6 +125,9 @@ static int pci_bar(PCIDevice *d, int reg)
>  {
>      uint8_t type;
>  
> +    /* PCIe virtual functions do not have their own BARs */
> +    assert(!d->exp.is_vf);
> +
>      if (reg != PCI_ROM_SLOT)
>          return PCI_BASE_ADDRESS_0 + reg * 4;
>  
> @@ -184,22 +186,13 @@ void pci_device_deassert_intx(PCIDevice *dev)
>      }
>  }
>  
> -static void pci_do_device_reset(PCIDevice *dev)
> +static void pci_reset_regions(PCIDevice *dev)
>  {
>      int r;
> +    if (dev->exp.is_vf) {
> +        return;
> +    }
>  
> -    pci_device_deassert_intx(dev);
> -    assert(dev->irq_state == 0);
> -
> -    /* Clear all writable bits */
> -    pci_word_test_and_clear_mask(dev->config + PCI_COMMAND,
> -                                 pci_get_word(dev->wmask + PCI_COMMAND) |
> -                                 pci_get_word(dev->w1cmask + PCI_COMMAND));
> -    pci_word_test_and_clear_mask(dev->config + PCI_STATUS,
> -                                 pci_get_word(dev->wmask + PCI_STATUS) |
> -                                 pci_get_word(dev->w1cmask + PCI_STATUS));
> -    dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
> -    dev->config[PCI_INTERRUPT_LINE] = 0x0;
>      for (r = 0; r < PCI_NUM_REGIONS; ++r) {
>          PCIIORegion *region = &dev->io_regions[r];
>          if (!region->size) {
> @@ -213,6 +206,27 @@ static void pci_do_device_reset(PCIDevice *dev)
>              pci_set_long(dev->config + pci_bar(dev, r), region->type);
>          }
>      }
> +}
> +
> +static void pci_do_device_reset(PCIDevice *dev)
> +{
> +    qdev_reset_all(&dev->qdev);
> +
> +    dev->irq_state = 0;
> +    pci_update_irq_status(dev);
> +    pci_device_deassert_intx(dev);
> +    assert(dev->irq_state == 0);
> +
> +    /* Clear all writable bits */
> +    pci_word_test_and_clear_mask(dev->config + PCI_COMMAND,
> +                                 pci_get_word(dev->wmask + PCI_COMMAND) |
> +                                 pci_get_word(dev->w1cmask + PCI_COMMAND));
> +    pci_word_test_and_clear_mask(dev->config + PCI_STATUS,
> +                                 pci_get_word(dev->wmask + PCI_STATUS) |
> +                                 pci_get_word(dev->w1cmask + PCI_STATUS));
> +    dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
> +    dev->config[PCI_INTERRUPT_LINE] = 0x0;
> +    pci_reset_regions(dev);
>      pci_update_mappings(dev);
>  
>      msi_reset(dev);
> @@ -734,6 +748,14 @@ static int pci_init_multifunction(PCIBus *bus, PCIDevice *dev)
>          dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
>      }
>  
> +    /* With SR/IOV and ARI, subsequent function 0's are only
> +     * another VF which the physical function is placed in the initial
> +     * function (0.0)

Couldn't parse this. Could you please reword?

> +     */
> +    if (dev->exp.pf && dev->exp.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> +        return 0;
> +    }
> +
>      /*
>       * multifunction bit is interpreted in two ways as follows.
>       *   - all functions must set the bit to 1.
> @@ -920,6 +942,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
>      uint64_t wmask;
>      pcibus_t size = memory_region_size(memory);
>  
> +    assert(!pci_dev->exp.is_vf); /* VFs must use pcie_register_vf_bar */
>      assert(region_num >= 0);
>      assert(region_num < PCI_NUM_REGIONS);
>      if (size & (size-1)) {
> @@ -1018,18 +1041,51 @@ pcibus_t pci_get_bar_addr(PCIDevice *pci_dev, int region_num)
>      return pci_dev->io_regions[region_num].addr;
>  }
>  
> -static pcibus_t pci_bar_address(PCIDevice *d,
> -				int reg, uint8_t type, pcibus_t size)
> +
> +static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
> +                                        uint8_t type, pcibus_t size)
> +{
> +    pcibus_t new_addr;
> +    if (!d->exp.is_vf) {
> +        int bar = pci_bar(d, reg);
> +        if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +            new_addr = pci_get_quad(d->config + bar);
> +        } else {
> +            new_addr = pci_get_long(d->config + bar);
> +        }
> +    } else {
> +        int bar = d->exp.pf->exp.sriov_cap + PCI_SRIOV_BAR + reg * 4;
> +        uint32_t vf_num = d->devfn - (d->exp.pf->devfn + 1);
> +        if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +            new_addr = pci_get_quad(d->exp.pf->config + bar);
> +        } else {
> +            new_addr = pci_get_long(d->exp.pf->config + bar);
> +        }
> +        new_addr += vf_num * size;
> +
> +        PCI_DPRINTF("%02d:%02d.%d: (vf %d) found config reg %d, "
> +                    "size 0x%lx addr 0x%lx\n",
> +                    pci_bus_num(d->bus), d->devfn >> 3, d->devfn & 7,
> +                    vf_num, reg, size, new_addr);

I don't think this is needed, assuming it is, don't we want this
for all functions, bot just vfs?

> +    }
> +    if (reg != PCI_ROM_SLOT) {
> +        /* Preserve the rom enable bit */
> +        new_addr &= ~(size - 1);
> +    }
> +    return new_addr;
> +}
> +
> +pcibus_t pci_bar_address(PCIDevice *d,
> +                         int reg, uint8_t type, pcibus_t size)
>  {
>      pcibus_t new_addr, last_addr;
> -    int bar = pci_bar(d, reg);
>      uint16_t cmd = pci_get_word(d->config + PCI_COMMAND);
>  
>      if (type & PCI_BASE_ADDRESS_SPACE_IO) {
>          if (!(cmd & PCI_COMMAND_IO)) {
>              return PCI_BAR_UNMAPPED;
>          }
> -        new_addr = pci_get_long(d->config + bar) & ~(size - 1);
> +        new_addr = pci_config_get_bar_addr(d, reg, type, size);

Hmm this is IO, VFs don't have IO, do they? Change really necessary?

>          last_addr = new_addr + size - 1;
>          /* Check if 32 bit BAR wraps around explicitly.
>           * TODO: make priorities correct and remove this work around.
> @@ -1043,11 +1099,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
>      if (!(cmd & PCI_COMMAND_MEMORY)) {
>          return PCI_BAR_UNMAPPED;
>      }
> -    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> -        new_addr = pci_get_quad(d->config + bar);
> -    } else {
> -        new_addr = pci_get_long(d->config + bar);
> -    }
> +    new_addr = pci_config_get_bar_addr(d, reg, type, size);
>      /* the ROM slot has a specific enable bit */
>      if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) {
>          return PCI_BAR_UNMAPPED;
> @@ -1146,9 +1198,10 @@ uint32_t pci_default_read_config(PCIDevice *d,
>      return le32_to_cpu(val);
>  }
>  
> -void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
> +void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l)
>  {
>      int i, was_irq_disabled = pci_irq_disabled(d);
> +    uint32_t val = val_in;
>  
>      for (i = 0; i < l; val >>= 8, ++i) {
>          uint8_t wmask = d->wmask[addr + i];
> @@ -1170,8 +1223,9 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
>                                      & PCI_COMMAND_MASTER);
>      }
>  
> -    msi_write_config(d, addr, val, l);
> -    msix_write_config(d, addr, val, l);
> +    msi_write_config(d, addr, val_in, l);
> +    msix_write_config(d, addr, val_in, l);
> +    pcie_sriov_config_write(d, addr, val_in, l);
>  }


Not sure I get this chunk.
A code comment might be helpful.


>  
>  /***********************************************************/
> @@ -1776,7 +1830,6 @@ static int pci_qdev_init(DeviceState *qdev)
>          is_default_rom = true;
>      }
>      pci_add_option_rom(pci_dev, is_default_rom);
> -
>      return 0;
>  }
>  

generally, don't make unrelated changes, review is
hard enough.

> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index 6cb6e0c..b7af693 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -254,7 +254,7 @@ void pcie_cap_slot_hotplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
>       * Right now, only a device of function = 0 is allowed to be
>       * hot plugged/unplugged.
>       */
> -    assert(PCI_FUNC(pci_dev->devfn) == 0);
> +    assert(PCI_FUNC(pci_dev->devfn) == 0 || pci_dev->exp.is_vf);
>  
>      pci_word_test_and_set_mask(exp_cap + PCI_EXP_SLTSTA,
>                                 PCI_EXP_SLTSTA_PDS);
> @@ -266,10 +266,11 @@ void pcie_cap_slot_hot_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
>                                   Error **errp)
>  {
>      uint8_t *exp_cap;
> +    PCIDevice *pdev = PCI_DEVICE(hotplug_dev);
>  
> -    pcie_cap_slot_hotplug_common(PCI_DEVICE(hotplug_dev), dev, &exp_cap, errp);
> +    pcie_cap_slot_hotplug_common(pdev, dev, &exp_cap, errp);
>  
> -    pcie_cap_slot_push_attention_button(PCI_DEVICE(hotplug_dev));
> +    pcie_cap_slot_push_attention_button(pdev);
>  }
>  
>  /* pci express slot for pci express root/downstream port
> @@ -409,7 +410,7 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
>      }
>  
>      /*
> -     * If the slot is polulated, power indicator is off and power
> +     * If the slot is populated, power indicator is off and power
>       * controller is off, it is safe to detach the devices.
>       */
>      if ((sltsta & PCI_EXP_SLTSTA_PDS) && (val & PCI_EXP_SLTCTL_PCC) &&
> @@ -633,3 +634,199 @@ void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn)
>                          offset, PCI_ARI_SIZEOF);
>      pci_set_long(dev->config + offset + PCI_ARI_CAP, (nextfn & 0xff) << 8);
>  }
> +
> +
> +

don't add > 1 empty line in a row.

> +/* SR/IOV */

this comment is really useless, function name makes it clear.

> +void pcie_sriov_init(PCIDevice *dev, uint16_t offset,
> +                     const char *vfname, uint16_t vf_dev_id,
> +                     uint16_t init_vfs, uint16_t total_vfs)
> +{
> +    uint8_t *cfg = dev->config + offset;
> +    uint8_t *wmask;

empty line here after variables.

> +    pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
> +                        offset, PCI_EXT_CAP_SRIOV_SIZEOF);
> +    dev->exp.sriov_cap = offset;
> +    dev->exp.num_vfs = 0;
> +    dev->exp.vfname = g_strdup(vfname);

I don't see a free for this field anywhere.

> +    dev->exp.vf = NULL;
> +
> +    /* set some sensible defaults - devices can override later */

do they in fact override?

> +    pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, 0x1);
> +    pci_set_word(cfg + PCI_SRIOV_VF_STRIDE, 0x1);
> +    pci_set_word(cfg + PCI_SRIOV_SUP_PGSIZE, 0x553);

what's this magic constant?
needs a comment.

> +    pci_set_word(cfg + PCI_SRIOV_SYS_PGSIZE, 0x1);
> +
> +    /* Set up device ID and initial/total number of VFs available */
> +    pci_set_word(cfg + PCI_SRIOV_VF_DID, vf_dev_id);
> +    pci_set_word(cfg + PCI_SRIOV_INITIAL_VF, init_vfs);
> +    pci_set_word(cfg + PCI_SRIOV_TOTAL_VF, total_vfs);
> +
> +    /* Write enable control bits */
> +    wmask = dev->wmask + offset;
> +    pci_set_word(wmask + PCI_SRIOV_CTRL,
> +                 PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE | PCI_SRIOV_CTRL_ARI);
> +    pci_set_word(wmask + PCI_SRIOV_NUM_VF, 0xffff);
> +
> +    qdev_prop_set_bit(&dev->qdev, "multifunction", true);
> +}
> +
> +


2 empty lines again

> +void pcie_sriov_exit(PCIDevice *dev)
> +{
> +    PCIE_DPRINTF("\n");
> +    pcie_sriov_reset_vfs(dev);
> +}
> +
> +void pcie_sriov_init_bar(PCIDevice *dev, int region_num,
> +                         uint8_t type, dma_addr_t size)
> +{
> +    uint32_t addr;
> +    uint64_t wmask;
> +    uint16_t sriov_cap = dev->exp.sriov_cap;
> +
> +    assert(sriov_cap > 0);
> +    assert(region_num >= 0);
> +    assert(region_num < PCI_NUM_REGIONS);
> +    assert(region_num != PCI_ROM_SLOT);
> +
> +    wmask = ~(size - 1);
> +    addr = sriov_cap + PCI_SRIOV_BAR + region_num * 4;
> +
> +    pci_set_long(dev->config + addr, type);
> +    if (!(type & PCI_BASE_ADDRESS_SPACE_IO) &&
> +        type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +        pci_set_quad(dev->wmask + addr, wmask);
> +        pci_set_quad(dev->cmask + addr, ~0ULL);
> +    } else {
> +        pci_set_long(dev->wmask + addr, wmask & 0xffffffff);
> +        pci_set_long(dev->cmask + addr, 0xffffffff);
> +    }
> +    dev->exp.vf_bar_type[region_num] = type;
> +}
> +
> +void pcie_register_vf_bar(PCIDevice *dev, int region_num,
> +                          MemoryRegion *memory)
> +{
> +    PCIIORegion *r;
> +    uint8_t type;
> +    pcibus_t size = memory_region_size(memory);
> +
> +    assert(dev->exp.is_vf); /* PFs must use pci_register_bar */
> +    assert(region_num >= 0);
> +    assert(region_num < PCI_NUM_REGIONS);
> +    type = dev->exp.pf->exp.vf_bar_type[region_num];
> +
> +    if (size & (size-1)) {
> +        fprintf(stderr, "ERROR: PCI region size must be pow2 "

power of 2

eschew abbreviation

> +                    "type=0x%x, size=0x%"FMT_PCIBUS"\n", type, size);
> +        exit(1);
> +    }
> +
> +    r = &dev->io_regions[region_num];
> +    r->memory = memory;
> +    r->address_space =
> +        type & PCI_BASE_ADDRESS_SPACE_IO
> +        ? dev->bus->address_space_io
> +        : dev->bus->address_space_mem;
> +    r->size = size;
> +    r->type = type;
> +
> +    r->addr = pci_bar_address(dev, region_num, r->type, r->size);
> +    if (r->addr != PCI_BAR_UNMAPPED) {
> +        memory_region_add_subregion_overlap(r->address_space,
> +                                            r->addr, r->memory, 1);
> +    }
> +}
> +
> +
> +static PCIDevice *pcie_create_vf(PCIDevice *pf, int devfn, const char *name)
> +{
> +    int ret;
> +    PCIDevice *dev = pci_create(pf->bus, devfn, name);
> +    dev->exp.is_vf = true;
> +    dev->exp.pf = pf;
> +
> +    ret = qdev_init(&dev->qdev);
> +    if (ret)
> +        return NULL;
> +
> +    /* set vid/did according to sr/iov spec - they are not used */
> +    pci_config_set_vendor_id(dev->config, 0xffff);
> +    pci_config_set_device_id(dev->config, 0xffff);
> +    return dev;
> +}
> +
> +
> +void pcie_sriov_create_vfs(PCIDevice *dev)
> +{
> +    uint16_t num_vfs;
> +    uint16_t i;
> +    int32_t devfn = dev->devfn + 1;
> +    uint16_t sriov_cap = dev->exp.sriov_cap;
> +
> +    assert(sriov_cap > 0);
> +    num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
> +
> +    dev->exp.vf = g_malloc(sizeof(PCIDevice*) * num_vfs);

Don't see a free to match this malloc call.

> +    assert(dev->exp.vf);
> +
> +    PCIE_DEV_PRINTF(dev, "creating %d vf devs\n", num_vfs);
> +    for (i = 0; i < num_vfs; i++) {
> +        dev->exp.vf[i] = pcie_create_vf(dev, devfn++, dev->exp.vfname);
> +        if (!dev->exp.vf[i]) {
> +            PCIE_DEV_PRINTF(dev, "Failed to create VF %d\n", i);
> +            num_vfs = i;
> +            break;
> +        }
> +    }
> +    dev->exp.num_vfs = num_vfs;
> +}
> +
> +
> +void pcie_sriov_reset_vfs(PCIDevice *dev)
> +{
> +    Error *local_err = NULL;
> +    uint16_t num_vfs = dev->exp.num_vfs;
> +    uint16_t i;
> +    PCIE_DEV_PRINTF(dev, "Resetting %d vf devs\n", num_vfs);
> +    for (i = 0; i < num_vfs; i++) {
> +        qdev_unplug(&dev->exp.vf[i]->qdev, &local_err);
> +        if (local_err) {
> +            fprintf(stderr, "Failed to unplug: %s\n",
> +                    error_get_pretty(local_err));
> +            error_free(local_err);
> +        }
> +    }
> +    dev->exp.num_vfs = 0;
> +}
> +
> +
> +void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, uint32_t val, int len)
> +{
> +    uint32_t off;
> +    uint16_t sriov_cap = dev->exp.sriov_cap;
> +
> +    if (!sriov_cap || address < sriov_cap) {
> +        return;
> +    }
> +    off = address - sriov_cap;
> +    if (off >= PCI_EXT_CAP_SRIOV_SIZEOF) {
> +        return;
> +    }
> +
> +    PCIE_DEV_PRINTF(dev, "cap at 0x%x sriov offset 0x%x val 0x%x len %d\n",
> +                 sriov_cap, off, val, len);
> +
> +    if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
> +        if (dev->exp.num_vfs) {
> +            if (!(val & PCI_SRIOV_CTRL_VFE)) {
> +                pcie_sriov_reset_vfs(dev);
> +            }
> +        } else {
> +            if (val & PCI_SRIOV_CTRL_VFE) {
> +                pcie_sriov_create_vfs(dev);
> +            }
> +        }
> +    }
> +}
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index c352c7b..d36c68d 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -11,8 +11,6 @@
>  /* PCI includes legacy ISA access.  */
>  #include "hw/isa/isa.h"
>  
> -#include "hw/pci/pcie.h"
> -
>  /* PCI bus */
>  
>  #define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
> @@ -127,6 +125,7 @@ enum {
>  #define QEMU_PCI_VGA_IO_HI_SIZE 0x20
>  
>  #include "hw/pci/pci_regs.h"
> +#include "hw/pci/pcie.h"
>  
>  /* PCI HEADER_TYPE */
>  #define  PCI_HEADER_TYPE_MULTI_FUNCTION 0x80

why move it here?

> @@ -413,6 +412,9 @@ typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int);
>  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
>  void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque);
>  
> +pcibus_t pci_bar_address(PCIDevice *d,
> +                         int reg, uint8_t type, pcibus_t size);
> +
>  static inline void
>  pci_set_byte(uint8_t *config, uint8_t val)
>  {
> diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
> index d139d58..b34d831 100644
> --- a/include/hw/pci/pcie.h
> +++ b/include/hw/pci/pcie.h
> @@ -74,6 +74,15 @@ struct PCIExpressDevice {
>      /* AER */
>      uint16_t aer_cap;
>      PCIEAERLog aer_log;
> +
> +    /* SR/IOV */
> +    uint16_t sriov_cap;
> +    uint16_t num_vfs;           /* Number of virtual functions created */
> +    bool is_vf;                 /* Set if this device is a virtual function */

Checking pf pointer not enough?

> +    const char *vfname;         /* Reference to the device type used for the VFs */
> +    PCIDevice **vf;             /* Pointer to an array of num_vfs VF devices */
> +    PCIDevice *pf;              /* Pointer back to owner physical function */
> +    uint8_t vf_bar_type[PCI_NUM_REGIONS];  /* Store type for each VF bar */

There are two types of fields here: some are per-vf,
some are sriov fields in pf.
Pls make this clearer:
	- create structures for each group
	- name fields sriov_pf and sriov_vf accordingly.

>  };
>  
>  #define COMPAT_PROP_PCP "power_controller_present"
> @@ -115,6 +124,23 @@ void pcie_add_capability(PCIDevice *dev,
>                           uint16_t offset, uint16_t size);
>  
>  void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
> +void pcie_sriov_init(PCIDevice *dev, uint16_t offset,
> +                     const char *vfname, uint16_t vf_dev_id,
> +                     uint16_t init_vfs, uint16_t total_vfs);
> +void pcie_sriov_exit(PCIDevice *dev);
> +

Assumption is, this applies to pf only?

> +/* Set up a VF bar in the SR/IOV bar area */
> +void pcie_sriov_init_bar(PCIDevice *s, int region_num,
> +                         uint8_t type, dma_addr_t size);
> +
> +/* Instantiate a bar for a VF */
> +void pcie_register_vf_bar(PCIDevice *pci_dev, int region_num,
> +                          MemoryRegion *memory);
> +

Again, this isn't very clear. prefix everything with
pcie_sriov_pf_ and pcie_sriov_vf_ to make it clear
what each applies to.

> +void pcie_sriov_create_vfs(PCIDevice *dev);


Do we need to export this one?

> +void pcie_sriov_reset_vfs(PCIDevice *dev);

How about pcie_sriov_reset? Callers don't need to know
what you do internally I think.

> +void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
> +                             uint32_t val, int len);
>  
>  extern const VMStateDescription vmstate_pcie_device;


I'm not sure which functions apply to pfs,vfs, and which to both
pfs and vfs. function names should make this clear.
>  
> -- 
> 1.9.0
Knut Omang Sept. 1, 2014, 6:34 p.m. UTC | #2
On Mon, 2014-09-01 at 12:39 +0300, Michael S. Tsirkin wrote:
> On Fri, Aug 29, 2014 at 09:17:07AM +0200, Knut Omang wrote:
> > This patch provides the building blocks for creating an SR/IOV
> > PCIe Extended Capability header and creating and removing
> > SR/IOV Virtual Functions.
> > 
> > Signed-off-by: Knut Omang <knut.omang@oracle.com>
> > ---
> >  hw/pci/pci.c          | 107 +++++++++++++++++++-------
> >  hw/pci/pcie.c         | 205 +++++++++++++++++++++++++++++++++++++++++++++++++-
> >  include/hw/pci/pci.h  |   6 +-
> >  include/hw/pci/pcie.h |  26 +++++++
> >  4 files changed, 311 insertions(+), 33 deletions(-)
> > 
> > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > index daeaeac..071ab81 100644
> > --- a/hw/pci/pci.c
> > +++ b/hw/pci/pci.c
> > @@ -35,7 +35,6 @@
> >  #include "hw/pci/msi.h"
> >  #include "hw/pci/msix.h"
> >  #include "exec/address-spaces.h"
> > -#include "hw/hotplug.h"
> >  
> >  //#define DEBUG_PCI
> >  #ifdef DEBUG_PCI
> > @@ -126,6 +125,9 @@ static int pci_bar(PCIDevice *d, int reg)
> >  {
> >      uint8_t type;
> >  
> > +    /* PCIe virtual functions do not have their own BARs */
> > +    assert(!d->exp.is_vf);
> > +
> >      if (reg != PCI_ROM_SLOT)
> >          return PCI_BASE_ADDRESS_0 + reg * 4;
> >  
> > @@ -184,22 +186,13 @@ void pci_device_deassert_intx(PCIDevice *dev)
> >      }
> >  }
> >  
> > -static void pci_do_device_reset(PCIDevice *dev)
> > +static void pci_reset_regions(PCIDevice *dev)
> >  {
> >      int r;
> > +    if (dev->exp.is_vf) {
> > +        return;
> > +    }
> >  
> > -    pci_device_deassert_intx(dev);
> > -    assert(dev->irq_state == 0);
> > -
> > -    /* Clear all writable bits */
> > -    pci_word_test_and_clear_mask(dev->config + PCI_COMMAND,
> > -                                 pci_get_word(dev->wmask + PCI_COMMAND) |
> > -                                 pci_get_word(dev->w1cmask + PCI_COMMAND));
> > -    pci_word_test_and_clear_mask(dev->config + PCI_STATUS,
> > -                                 pci_get_word(dev->wmask + PCI_STATUS) |
> > -                                 pci_get_word(dev->w1cmask + PCI_STATUS));
> > -    dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
> > -    dev->config[PCI_INTERRUPT_LINE] = 0x0;
> >      for (r = 0; r < PCI_NUM_REGIONS; ++r) {
> >          PCIIORegion *region = &dev->io_regions[r];
> >          if (!region->size) {
> > @@ -213,6 +206,27 @@ static void pci_do_device_reset(PCIDevice *dev)
> >              pci_set_long(dev->config + pci_bar(dev, r), region->type);
> >          }
> >      }
> > +}
> > +
> > +static void pci_do_device_reset(PCIDevice *dev)
> > +{
> > +    qdev_reset_all(&dev->qdev);
> > +
> > +    dev->irq_state = 0;
> > +    pci_update_irq_status(dev);
> > +    pci_device_deassert_intx(dev);
> > +    assert(dev->irq_state == 0);
> > +
> > +    /* Clear all writable bits */
> > +    pci_word_test_and_clear_mask(dev->config + PCI_COMMAND,
> > +                                 pci_get_word(dev->wmask + PCI_COMMAND) |
> > +                                 pci_get_word(dev->w1cmask + PCI_COMMAND));
> > +    pci_word_test_and_clear_mask(dev->config + PCI_STATUS,
> > +                                 pci_get_word(dev->wmask + PCI_STATUS) |
> > +                                 pci_get_word(dev->w1cmask + PCI_STATUS));
> > +    dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
> > +    dev->config[PCI_INTERRUPT_LINE] = 0x0;
> > +    pci_reset_regions(dev);
> >      pci_update_mappings(dev);
> >  
> >      msi_reset(dev);
> > @@ -734,6 +748,14 @@ static int pci_init_multifunction(PCIBus *bus, PCIDevice *dev)
> >          dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
> >      }
> >  
> > +    /* With SR/IOV and ARI, subsequent function 0's are only
> > +     * another VF which the physical function is placed in the initial
> > +     * function (0.0)
> 
> Couldn't parse this. Could you please reword?

The main point is that if this is a VF (it references a pf), the below
checks for multifunction on function 0 does not apply, as it is really a
virtual function which happens to get a devfn which yields a 
function number 0 in the legacy pci interpretation.

I realize I would need (at least for the purpose of documentation)
to do some more on the VF_OFFSET and VF_STRIDE,
which I have assumed are both 1 for simplicity, see comment below,

> > +     */
> > +    if (dev->exp.pf && dev->exp.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> > +        return 0;
> > +    }
> > +
> >      /*
> >       * multifunction bit is interpreted in two ways as follows.
> >       *   - all functions must set the bit to 1.
> > @@ -920,6 +942,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
> >      uint64_t wmask;
> >      pcibus_t size = memory_region_size(memory);
> >  
> > +    assert(!pci_dev->exp.is_vf); /* VFs must use pcie_register_vf_bar */
> >      assert(region_num >= 0);
> >      assert(region_num < PCI_NUM_REGIONS);
> >      if (size & (size-1)) {
> > @@ -1018,18 +1041,51 @@ pcibus_t pci_get_bar_addr(PCIDevice *pci_dev, int region_num)
> >      return pci_dev->io_regions[region_num].addr;
> >  }
> >  
> > -static pcibus_t pci_bar_address(PCIDevice *d,
> > -				int reg, uint8_t type, pcibus_t size)
> > +
> > +static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
> > +                                        uint8_t type, pcibus_t size)
> > +{
> > +    pcibus_t new_addr;
> > +    if (!d->exp.is_vf) {
> > +        int bar = pci_bar(d, reg);
> > +        if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > +            new_addr = pci_get_quad(d->config + bar);
> > +        } else {
> > +            new_addr = pci_get_long(d->config + bar);
> > +        }
> > +    } else {
> > +        int bar = d->exp.pf->exp.sriov_cap + PCI_SRIOV_BAR + reg * 4;
> > +        uint32_t vf_num = d->devfn - (d->exp.pf->devfn + 1);
> > +        if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > +            new_addr = pci_get_quad(d->exp.pf->config + bar);
> > +        } else {
> > +            new_addr = pci_get_long(d->exp.pf->config + bar);
> > +        }
> > +        new_addr += vf_num * size;
> > +
> > +        PCI_DPRINTF("%02d:%02d.%d: (vf %d) found config reg %d, "
> > +                    "size 0x%lx addr 0x%lx\n",
> > +                    pci_bus_num(d->bus), d->devfn >> 3, d->devfn & 7,
> > +                    vf_num, reg, size, new_addr);
> 
> I don't think this is needed, assuming it is, don't we want this
> for all functions, bot just vfs?

It was useful for debugging the size and address calculations, 
which are less obvious than in the plain PCI case since they are derived
from the VF bars in the PF, but I can remove it.

> > +    }
> > +    if (reg != PCI_ROM_SLOT) {
> > +        /* Preserve the rom enable bit */
> > +        new_addr &= ~(size - 1);
> > +    }
> > +    return new_addr;
> > +}
> > +
> > +pcibus_t pci_bar_address(PCIDevice *d,
> > +                         int reg, uint8_t type, pcibus_t size)
> >  {
> >      pcibus_t new_addr, last_addr;
> > -    int bar = pci_bar(d, reg);
> >      uint16_t cmd = pci_get_word(d->config + PCI_COMMAND);
> >  
> >      if (type & PCI_BASE_ADDRESS_SPACE_IO) {
> >          if (!(cmd & PCI_COMMAND_IO)) {
> >              return PCI_BAR_UNMAPPED;
> >          }
> > -        new_addr = pci_get_long(d->config + bar) & ~(size - 1);
> > +        new_addr = pci_config_get_bar_addr(d, reg, type, size);
> 
> Hmm this is IO, VFs don't have IO, do they? Change really necessary?

I think it simplifies the code, notice that the outer 
bar variable is not needed elsewhere, so I would have had to 
change something in that if() anyway.

> >          last_addr = new_addr + size - 1;
> >          /* Check if 32 bit BAR wraps around explicitly.
> >           * TODO: make priorities correct and remove this work around.
> > @@ -1043,11 +1099,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> >      if (!(cmd & PCI_COMMAND_MEMORY)) {
> >          return PCI_BAR_UNMAPPED;
> >      }
> > -    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > -        new_addr = pci_get_quad(d->config + bar);
> > -    } else {
> > -        new_addr = pci_get_long(d->config + bar);
> > -    }
> > +    new_addr = pci_config_get_bar_addr(d, reg, type, size);
> >      /* the ROM slot has a specific enable bit */
> >      if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) {
> >          return PCI_BAR_UNMAPPED;
> > @@ -1146,9 +1198,10 @@ uint32_t pci_default_read_config(PCIDevice *d,
> >      return le32_to_cpu(val);
> >  }
> >  
> > -void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
> > +void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l)
> >  {
> >      int i, was_irq_disabled = pci_irq_disabled(d);
> > +    uint32_t val = val_in;
> >  
> >      for (i = 0; i < l; val >>= 8, ++i) {
> >          uint8_t wmask = d->wmask[addr + i];
> > @@ -1170,8 +1223,9 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
> >                                      & PCI_COMMAND_MASTER);
> >      }
> >  
> > -    msi_write_config(d, addr, val, l);
> > -    msix_write_config(d, addr, val, l);
> > +    msi_write_config(d, addr, val_in, l);
> > +    msix_write_config(d, addr, val_in, l);
> > +    pcie_sriov_config_write(d, addr, val_in, l);
> >  }
> 
> 
> Not sure I get this chunk.
> A code comment might be helpful.

I believe I fix a bug here - should perhaps have been a separate patch
(should I factor it out, if you agree?) 
This fix lost my attention as things started to work and I moved forward:

The old code would never propagate any set bits through to the msi/msix
write functions as they get lost in the shift inside for(..) above,
which is why I introduced a new local val and renamed the argument to
val_in.

Otherwise it is just another capability subscribing for config writes,
eg. pcie_sriov_config_write would only have any effect if the config
address matches a valid sriov capability, which is similar usage as for
the msi and msix calls.

> >  
> >  /***********************************************************/
> > @@ -1776,7 +1830,6 @@ static int pci_qdev_init(DeviceState *qdev)
> >          is_default_rom = true;
> >      }
> >      pci_add_option_rom(pci_dev, is_default_rom);
> > -
> >      return 0;
> >  }
> >  
> 
> generally, don't make unrelated changes, review is
> hard enough.

Sorry about that - I tried to look for unintended white space changes
but some seems to have slipped by anyway - the code went 
through several rebases on the way..

> > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > index 6cb6e0c..b7af693 100644
> > --- a/hw/pci/pcie.c
> > +++ b/hw/pci/pcie.c
> > @@ -254,7 +254,7 @@ void pcie_cap_slot_hotplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
> >       * Right now, only a device of function = 0 is allowed to be
> >       * hot plugged/unplugged.
> >       */
> > -    assert(PCI_FUNC(pci_dev->devfn) == 0);
> > +    assert(PCI_FUNC(pci_dev->devfn) == 0 || pci_dev->exp.is_vf);
> >  
> >      pci_word_test_and_set_mask(exp_cap + PCI_EXP_SLTSTA,
> >                                 PCI_EXP_SLTSTA_PDS);
> > @@ -266,10 +266,11 @@ void pcie_cap_slot_hot_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
> >                                   Error **errp)
> >  {
> >      uint8_t *exp_cap;
> > +    PCIDevice *pdev = PCI_DEVICE(hotplug_dev);
> >  
> > -    pcie_cap_slot_hotplug_common(PCI_DEVICE(hotplug_dev), dev, &exp_cap, errp);
> > +    pcie_cap_slot_hotplug_common(pdev, dev, &exp_cap, errp);
> >  
> > -    pcie_cap_slot_push_attention_button(PCI_DEVICE(hotplug_dev));
> > +    pcie_cap_slot_push_attention_button(pdev);
> >  }
> >  
> >  /* pci express slot for pci express root/downstream port
> > @@ -409,7 +410,7 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
> >      }
> >  
> >      /*
> > -     * If the slot is polulated, power indicator is off and power
> > +     * If the slot is populated, power indicator is off and power
> >       * controller is off, it is safe to detach the devices.
> >       */
> >      if ((sltsta & PCI_EXP_SLTSTA_PDS) && (val & PCI_EXP_SLTCTL_PCC) &&
> > @@ -633,3 +634,199 @@ void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn)
> >                          offset, PCI_ARI_SIZEOF);
> >      pci_set_long(dev->config + offset + PCI_ARI_CAP, (nextfn & 0xff) << 8);
> >  }
> > +
> > +
> > +
> 
> don't add > 1 empty line in a row.

Get it - will clean up - same for the rest of the unintended white space
changes..

> > +/* SR/IOV */
> 
> this comment is really useless, function name makes it clear.

I just tried to follow the style in the source file - eg. /* ARI */
above, 

Continuing with your suggestions for better datatypes below, I ended up
factoring the sriov code out into a separate source/header file pair
simular to the aer code instead, hope that's ok.

> > +void pcie_sriov_init(PCIDevice *dev, uint16_t offset,
> > +                     const char *vfname, uint16_t vf_dev_id,
> > +                     uint16_t init_vfs, uint16_t total_vfs)
> > +{
> > +    uint8_t *cfg = dev->config + offset;
> > +    uint8_t *wmask;
> 
> empty line here after variables.

Good, I like that myself too, 
the existing code uses both 0 and 1 lines,

> > +    pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
> > +                        offset, PCI_EXT_CAP_SRIOV_SIZEOF);
> > +    dev->exp.sriov_cap = offset;
> > +    dev->exp.num_vfs = 0;
> > +    dev->exp.vfname = g_strdup(vfname);
> 
> I don't see a free for this field anywhere.

Ouch - will fix.
 
> > +    dev->exp.vf = NULL;
> > +
> > +    /* set some sensible defaults - devices can override later */
> 
> do they in fact override?

Hmm - I realize I need to handle this better, at present it is 
hard coded to work for stride 1, offset 1 - it should be relatively
easy to be more generic.

If a device implementing SR/IOV wants stride > 1 or for instance has
more than 1 PF, it needs to start at offset > 1 or supports 
more pages sizes, it would have to set other values accordingly.

The igb I have experimented with in the lab actually sets stride 2 -
it also sets offset 0x80 that is, it implements two PFs at 
function 0 and 1 and the VFs from each PF ends
up interleaved at every second function, starting at devfn 10.0.
In my igb example code, I deliberately just used the hard coded stride 1, 
offset 1 - will fix that instead of documenting the misfeature..

> > +    pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, 0x1);
> > +    pci_set_word(cfg + PCI_SRIOV_VF_STRIDE, 0x1);
> > +    pci_set_word(cfg + PCI_SRIOV_SUP_PGSIZE, 0x553);
> 
> what's this magic constant?
> needs a comment.

I agree, will add it - the mandatory set of page sizes to support.

> > +    pci_set_word(cfg + PCI_SRIOV_SYS_PGSIZE, 0x1);
> > +
> > +    /* Set up device ID and initial/total number of VFs available */
> > +    pci_set_word(cfg + PCI_SRIOV_VF_DID, vf_dev_id);
> > +    pci_set_word(cfg + PCI_SRIOV_INITIAL_VF, init_vfs);
> > +    pci_set_word(cfg + PCI_SRIOV_TOTAL_VF, total_vfs);
> > +
> > +    /* Write enable control bits */
> > +    wmask = dev->wmask + offset;
> > +    pci_set_word(wmask + PCI_SRIOV_CTRL,
> > +                 PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE | PCI_SRIOV_CTRL_ARI);
> > +    pci_set_word(wmask + PCI_SRIOV_NUM_VF, 0xffff);
> > +
> > +    qdev_prop_set_bit(&dev->qdev, "multifunction", true);
> > +}
> > +
> > +
> 
> 
> 2 empty lines again

sorry, old habits...

> > +void pcie_sriov_exit(PCIDevice *dev)
> > +{
> > +    PCIE_DPRINTF("\n");
> > +    pcie_sriov_reset_vfs(dev);
> > +}
> > +
> > +void pcie_sriov_init_bar(PCIDevice *dev, int region_num,
> > +                         uint8_t type, dma_addr_t size)
> > +{
> > +    uint32_t addr;
> > +    uint64_t wmask;
> > +    uint16_t sriov_cap = dev->exp.sriov_cap;
> > +
> > +    assert(sriov_cap > 0);
> > +    assert(region_num >= 0);
> > +    assert(region_num < PCI_NUM_REGIONS);
> > +    assert(region_num != PCI_ROM_SLOT);
> > +
> > +    wmask = ~(size - 1);
> > +    addr = sriov_cap + PCI_SRIOV_BAR + region_num * 4;
> > +
> > +    pci_set_long(dev->config + addr, type);
> > +    if (!(type & PCI_BASE_ADDRESS_SPACE_IO) &&
> > +        type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > +        pci_set_quad(dev->wmask + addr, wmask);
> > +        pci_set_quad(dev->cmask + addr, ~0ULL);
> > +    } else {
> > +        pci_set_long(dev->wmask + addr, wmask & 0xffffffff);
> > +        pci_set_long(dev->cmask + addr, 0xffffffff);
> > +    }
> > +    dev->exp.vf_bar_type[region_num] = type;
> > +}
> > +
> > +void pcie_register_vf_bar(PCIDevice *dev, int region_num,
> > +                          MemoryRegion *memory)
> > +{
> > +    PCIIORegion *r;
> > +    uint8_t type;
> > +    pcibus_t size = memory_region_size(memory);
> > +
> > +    assert(dev->exp.is_vf); /* PFs must use pci_register_bar */
> > +    assert(region_num >= 0);
> > +    assert(region_num < PCI_NUM_REGIONS);
> > +    type = dev->exp.pf->exp.vf_bar_type[region_num];
> > +
> > +    if (size & (size-1)) {
> > +        fprintf(stderr, "ERROR: PCI region size must be pow2 "
> 
> power of 2
> 
> eschew abbreviation

agree - cut'n paste from pci.c:pci_register_bar() but nevertheless...

> > +                    "type=0x%x, size=0x%"FMT_PCIBUS"\n", type, size);
> > +        exit(1);
> > +    }
> > +
> > +    r = &dev->io_regions[region_num];
> > +    r->memory = memory;
> > +    r->address_space =
> > +        type & PCI_BASE_ADDRESS_SPACE_IO
> > +        ? dev->bus->address_space_io
> > +        : dev->bus->address_space_mem;
> > +    r->size = size;
> > +    r->type = type;
> > +
> > +    r->addr = pci_bar_address(dev, region_num, r->type, r->size);
> > +    if (r->addr != PCI_BAR_UNMAPPED) {
> > +        memory_region_add_subregion_overlap(r->address_space,
> > +                                            r->addr, r->memory, 1);
> > +    }
> > +}
> > +
> > +
> > +static PCIDevice *pcie_create_vf(PCIDevice *pf, int devfn, const char *name)
> > +{
> > +    int ret;
> > +    PCIDevice *dev = pci_create(pf->bus, devfn, name);
> > +    dev->exp.is_vf = true;
> > +    dev->exp.pf = pf;
> > +
> > +    ret = qdev_init(&dev->qdev);
> > +    if (ret)
> > +        return NULL;
> > +
> > +    /* set vid/did according to sr/iov spec - they are not used */
> > +    pci_config_set_vendor_id(dev->config, 0xffff);
> > +    pci_config_set_device_id(dev->config, 0xffff);
> > +    return dev;
> > +}
> > +
> > +
> > +void pcie_sriov_create_vfs(PCIDevice *dev)
> > +{
> > +    uint16_t num_vfs;
> > +    uint16_t i;
> > +    int32_t devfn = dev->devfn + 1;
> > +    uint16_t sriov_cap = dev->exp.sriov_cap;
> > +
> > +    assert(sriov_cap > 0);
> > +    num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
> > +
> > +    dev->exp.vf = g_malloc(sizeof(PCIDevice*) * num_vfs);
> 
> Don't see a free to match this malloc call.

You're quite right - will fix, thanks,

> > +    assert(dev->exp.vf);
> > +
> > +    PCIE_DEV_PRINTF(dev, "creating %d vf devs\n", num_vfs);
> > +    for (i = 0; i < num_vfs; i++) {
> > +        dev->exp.vf[i] = pcie_create_vf(dev, devfn++, dev->exp.vfname);
> > +        if (!dev->exp.vf[i]) {
> > +            PCIE_DEV_PRINTF(dev, "Failed to create VF %d\n", i);
> > +            num_vfs = i;
> > +            break;
> > +        }
> > +    }
> > +    dev->exp.num_vfs = num_vfs;
> > +}
> > +
> > +
> > +void pcie_sriov_reset_vfs(PCIDevice *dev)
> > +{
> > +    Error *local_err = NULL;
> > +    uint16_t num_vfs = dev->exp.num_vfs;
> > +    uint16_t i;
> > +    PCIE_DEV_PRINTF(dev, "Resetting %d vf devs\n", num_vfs);
> > +    for (i = 0; i < num_vfs; i++) {
> > +        qdev_unplug(&dev->exp.vf[i]->qdev, &local_err);
> > +        if (local_err) {
> > +            fprintf(stderr, "Failed to unplug: %s\n",
> > +                    error_get_pretty(local_err));
> > +            error_free(local_err);
> > +        }
> > +    }
> > +    dev->exp.num_vfs = 0;
> > +}
> > +
> > +
> > +void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, uint32_t val, int len)
> > +{
> > +    uint32_t off;
> > +    uint16_t sriov_cap = dev->exp.sriov_cap;
> > +
> > +    if (!sriov_cap || address < sriov_cap) {
> > +        return;
> > +    }
> > +    off = address - sriov_cap;
> > +    if (off >= PCI_EXT_CAP_SRIOV_SIZEOF) {
> > +        return;
> > +    }
> > +
> > +    PCIE_DEV_PRINTF(dev, "cap at 0x%x sriov offset 0x%x val 0x%x len %d\n",
> > +                 sriov_cap, off, val, len);
> > +
> > +    if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
> > +        if (dev->exp.num_vfs) {
> > +            if (!(val & PCI_SRIOV_CTRL_VFE)) {
> > +                pcie_sriov_reset_vfs(dev);
> > +            }
> > +        } else {
> > +            if (val & PCI_SRIOV_CTRL_VFE) {
> > +                pcie_sriov_create_vfs(dev);
> > +            }
> > +        }
> > +    }
> > +}
> > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> > index c352c7b..d36c68d 100644
> > --- a/include/hw/pci/pci.h
> > +++ b/include/hw/pci/pci.h
> > @@ -11,8 +11,6 @@
> >  /* PCI includes legacy ISA access.  */
> >  #include "hw/isa/isa.h"
> >  
> > -#include "hw/pci/pcie.h"
> > -
> >  /* PCI bus */
> >  
> >  #define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
> > @@ -127,6 +125,7 @@ enum {
> >  #define QEMU_PCI_VGA_IO_HI_SIZE 0x20
> >  
> >  #include "hw/pci/pci_regs.h"
> > +#include "hw/pci/pcie.h"
> >  
> >  /* PCI HEADER_TYPE */
> >  #define  PCI_HEADER_TYPE_MULTI_FUNCTION 0x80
> 
> why move it here?

pcie.h now depends on PCI_NUM_REGIONS which are declared in pci.h after the old
pcie.h include.

> > @@ -413,6 +412,9 @@ typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int);
> >  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
> >  void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque);
> >  
> > +pcibus_t pci_bar_address(PCIDevice *d,
> > +                         int reg, uint8_t type, pcibus_t size);
> > +
> >  static inline void
> >  pci_set_byte(uint8_t *config, uint8_t val)
> >  {
> > diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
> > index d139d58..b34d831 100644
> > --- a/include/hw/pci/pcie.h
> > +++ b/include/hw/pci/pcie.h
> > @@ -74,6 +74,15 @@ struct PCIExpressDevice {
> >      /* AER */
> >      uint16_t aer_cap;
> >      PCIEAERLog aer_log;
> > +
> > +    /* SR/IOV */
> > +    uint16_t sriov_cap;
> > +    uint16_t num_vfs;           /* Number of virtual functions created */
> > +    bool is_vf;                 /* Set if this device is a virtual function */
> 
> Checking pf pointer not enough?

Should be - was considering it, but kept it to
be coherent with is_express and is_bridge.
Will eliminate and add+use a pci_is_vf() 
instead,

> > +    const char *vfname;         /* Reference to the device type used for the VFs */
> > +    PCIDevice **vf;             /* Pointer to an array of num_vfs VF devices */
> > +    PCIDevice *pf;              /* Pointer back to owner physical function */
> > +    uint8_t vf_bar_type[PCI_NUM_REGIONS];  /* Store type for each VF bar */
> 
> There are two types of fields here: some are per-vf,
> some are sriov fields in pf.
> Pls make this clearer:
> 	- create structures for each group
> 	- name fields sriov_pf and sriov_vf accordingly.

Was unsure about desired number of levels vs abstraction for 
readability - will do as you suggest,

> >  };
> >  
> >  #define COMPAT_PROP_PCP "power_controller_present"
> > @@ -115,6 +124,23 @@ void pcie_add_capability(PCIDevice *dev,
> >                           uint16_t offset, uint16_t size);
> >  
> >  void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
> > +void pcie_sriov_init(PCIDevice *dev, uint16_t offset,
> > +                     const char *vfname, uint16_t vf_dev_id,
> > +                     uint16_t init_vfs, uint16_t total_vfs);
> > +void pcie_sriov_exit(PCIDevice *dev);
> > +
> 
> Assumption is, this applies to pf only?

I see your concern - similar to the one I had with ARIfwd vs ARI cap.
will make it clearer!

> > +/* Set up a VF bar in the SR/IOV bar area */
> > +void pcie_sriov_init_bar(PCIDevice *s, int region_num,
> > +                         uint8_t type, dma_addr_t size);
> > +
> > +/* Instantiate a bar for a VF */
> > +void pcie_register_vf_bar(PCIDevice *pci_dev, int region_num,
> > +                          MemoryRegion *memory);
> > +
> 
> Again, this isn't very clear. prefix everything with
> pcie_sriov_pf_ and pcie_sriov_vf_ to make it clear
> what each applies to.

Yes

> > +void pcie_sriov_create_vfs(PCIDevice *dev);
> 
> 
> Do we need to export this one?
> 
> > +void pcie_sriov_reset_vfs(PCIDevice *dev);

Hmm, it is needed both as a result of PF reset and during destruction of
a device (which does not necessarily yield a PCI reset) and I call it
from the igb code, 

but your comment makes me realize:

1) I should probably rename the create/reset_vfs functions to
pcie_sriov_pf_register/unregister_vfs() or similar as that would better
reflect what it does.

2) call reset (unregister) unconditionally for SR/IOV devices (PFs) from
pci_unregister_device()
   
> How about pcie_sriov_reset? Callers don't need to know
> what you do internally I think.

Actually, it seems the external API can be made a lot simpler,
working on that,...

> > +void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
> > +                             uint32_t val, int len);
> >  
> >  extern const VMStateDescription vmstate_pcie_device;
> 
> 
> I'm not sure which functions apply to pfs,vfs, and which to both
> pfs and vfs. function names should make this clear.

I get it,..

Thanks a lot for your thorough review,

Knut

> > -- 
> > 1.9.0
diff mbox

Patch

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index daeaeac..071ab81 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -35,7 +35,6 @@ 
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
 #include "exec/address-spaces.h"
-#include "hw/hotplug.h"
 
 //#define DEBUG_PCI
 #ifdef DEBUG_PCI
@@ -126,6 +125,9 @@  static int pci_bar(PCIDevice *d, int reg)
 {
     uint8_t type;
 
+    /* PCIe virtual functions do not have their own BARs */
+    assert(!d->exp.is_vf);
+
     if (reg != PCI_ROM_SLOT)
         return PCI_BASE_ADDRESS_0 + reg * 4;
 
@@ -184,22 +186,13 @@  void pci_device_deassert_intx(PCIDevice *dev)
     }
 }
 
-static void pci_do_device_reset(PCIDevice *dev)
+static void pci_reset_regions(PCIDevice *dev)
 {
     int r;
+    if (dev->exp.is_vf) {
+        return;
+    }
 
-    pci_device_deassert_intx(dev);
-    assert(dev->irq_state == 0);
-
-    /* Clear all writable bits */
-    pci_word_test_and_clear_mask(dev->config + PCI_COMMAND,
-                                 pci_get_word(dev->wmask + PCI_COMMAND) |
-                                 pci_get_word(dev->w1cmask + PCI_COMMAND));
-    pci_word_test_and_clear_mask(dev->config + PCI_STATUS,
-                                 pci_get_word(dev->wmask + PCI_STATUS) |
-                                 pci_get_word(dev->w1cmask + PCI_STATUS));
-    dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
-    dev->config[PCI_INTERRUPT_LINE] = 0x0;
     for (r = 0; r < PCI_NUM_REGIONS; ++r) {
         PCIIORegion *region = &dev->io_regions[r];
         if (!region->size) {
@@ -213,6 +206,27 @@  static void pci_do_device_reset(PCIDevice *dev)
             pci_set_long(dev->config + pci_bar(dev, r), region->type);
         }
     }
+}
+
+static void pci_do_device_reset(PCIDevice *dev)
+{
+    qdev_reset_all(&dev->qdev);
+
+    dev->irq_state = 0;
+    pci_update_irq_status(dev);
+    pci_device_deassert_intx(dev);
+    assert(dev->irq_state == 0);
+
+    /* Clear all writable bits */
+    pci_word_test_and_clear_mask(dev->config + PCI_COMMAND,
+                                 pci_get_word(dev->wmask + PCI_COMMAND) |
+                                 pci_get_word(dev->w1cmask + PCI_COMMAND));
+    pci_word_test_and_clear_mask(dev->config + PCI_STATUS,
+                                 pci_get_word(dev->wmask + PCI_STATUS) |
+                                 pci_get_word(dev->w1cmask + PCI_STATUS));
+    dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
+    dev->config[PCI_INTERRUPT_LINE] = 0x0;
+    pci_reset_regions(dev);
     pci_update_mappings(dev);
 
     msi_reset(dev);
@@ -734,6 +748,14 @@  static int pci_init_multifunction(PCIBus *bus, PCIDevice *dev)
         dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
     }
 
+    /* With SR/IOV and ARI, subsequent function 0's are only
+     * another VF which the physical function is placed in the initial
+     * function (0.0)
+     */
+    if (dev->exp.pf && dev->exp.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+        return 0;
+    }
+
     /*
      * multifunction bit is interpreted in two ways as follows.
      *   - all functions must set the bit to 1.
@@ -920,6 +942,7 @@  void pci_register_bar(PCIDevice *pci_dev, int region_num,
     uint64_t wmask;
     pcibus_t size = memory_region_size(memory);
 
+    assert(!pci_dev->exp.is_vf); /* VFs must use pcie_register_vf_bar */
     assert(region_num >= 0);
     assert(region_num < PCI_NUM_REGIONS);
     if (size & (size-1)) {
@@ -1018,18 +1041,51 @@  pcibus_t pci_get_bar_addr(PCIDevice *pci_dev, int region_num)
     return pci_dev->io_regions[region_num].addr;
 }
 
-static pcibus_t pci_bar_address(PCIDevice *d,
-				int reg, uint8_t type, pcibus_t size)
+
+static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
+                                        uint8_t type, pcibus_t size)
+{
+    pcibus_t new_addr;
+    if (!d->exp.is_vf) {
+        int bar = pci_bar(d, reg);
+        if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+            new_addr = pci_get_quad(d->config + bar);
+        } else {
+            new_addr = pci_get_long(d->config + bar);
+        }
+    } else {
+        int bar = d->exp.pf->exp.sriov_cap + PCI_SRIOV_BAR + reg * 4;
+        uint32_t vf_num = d->devfn - (d->exp.pf->devfn + 1);
+        if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+            new_addr = pci_get_quad(d->exp.pf->config + bar);
+        } else {
+            new_addr = pci_get_long(d->exp.pf->config + bar);
+        }
+        new_addr += vf_num * size;
+
+        PCI_DPRINTF("%02d:%02d.%d: (vf %d) found config reg %d, "
+                    "size 0x%lx addr 0x%lx\n",
+                    pci_bus_num(d->bus), d->devfn >> 3, d->devfn & 7,
+                    vf_num, reg, size, new_addr);
+    }
+    if (reg != PCI_ROM_SLOT) {
+        /* Preserve the rom enable bit */
+        new_addr &= ~(size - 1);
+    }
+    return new_addr;
+}
+
+pcibus_t pci_bar_address(PCIDevice *d,
+                         int reg, uint8_t type, pcibus_t size)
 {
     pcibus_t new_addr, last_addr;
-    int bar = pci_bar(d, reg);
     uint16_t cmd = pci_get_word(d->config + PCI_COMMAND);
 
     if (type & PCI_BASE_ADDRESS_SPACE_IO) {
         if (!(cmd & PCI_COMMAND_IO)) {
             return PCI_BAR_UNMAPPED;
         }
-        new_addr = pci_get_long(d->config + bar) & ~(size - 1);
+        new_addr = pci_config_get_bar_addr(d, reg, type, size);
         last_addr = new_addr + size - 1;
         /* Check if 32 bit BAR wraps around explicitly.
          * TODO: make priorities correct and remove this work around.
@@ -1043,11 +1099,7 @@  static pcibus_t pci_bar_address(PCIDevice *d,
     if (!(cmd & PCI_COMMAND_MEMORY)) {
         return PCI_BAR_UNMAPPED;
     }
-    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
-        new_addr = pci_get_quad(d->config + bar);
-    } else {
-        new_addr = pci_get_long(d->config + bar);
-    }
+    new_addr = pci_config_get_bar_addr(d, reg, type, size);
     /* the ROM slot has a specific enable bit */
     if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) {
         return PCI_BAR_UNMAPPED;
@@ -1146,9 +1198,10 @@  uint32_t pci_default_read_config(PCIDevice *d,
     return le32_to_cpu(val);
 }
 
-void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
+void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l)
 {
     int i, was_irq_disabled = pci_irq_disabled(d);
+    uint32_t val = val_in;
 
     for (i = 0; i < l; val >>= 8, ++i) {
         uint8_t wmask = d->wmask[addr + i];
@@ -1170,8 +1223,9 @@  void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
                                     & PCI_COMMAND_MASTER);
     }
 
-    msi_write_config(d, addr, val, l);
-    msix_write_config(d, addr, val, l);
+    msi_write_config(d, addr, val_in, l);
+    msix_write_config(d, addr, val_in, l);
+    pcie_sriov_config_write(d, addr, val_in, l);
 }
 
 /***********************************************************/
@@ -1776,7 +1830,6 @@  static int pci_qdev_init(DeviceState *qdev)
         is_default_rom = true;
     }
     pci_add_option_rom(pci_dev, is_default_rom);
-
     return 0;
 }
 
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 6cb6e0c..b7af693 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -254,7 +254,7 @@  void pcie_cap_slot_hotplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
      * Right now, only a device of function = 0 is allowed to be
      * hot plugged/unplugged.
      */
-    assert(PCI_FUNC(pci_dev->devfn) == 0);
+    assert(PCI_FUNC(pci_dev->devfn) == 0 || pci_dev->exp.is_vf);
 
     pci_word_test_and_set_mask(exp_cap + PCI_EXP_SLTSTA,
                                PCI_EXP_SLTSTA_PDS);
@@ -266,10 +266,11 @@  void pcie_cap_slot_hot_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev,
                                  Error **errp)
 {
     uint8_t *exp_cap;
+    PCIDevice *pdev = PCI_DEVICE(hotplug_dev);
 
-    pcie_cap_slot_hotplug_common(PCI_DEVICE(hotplug_dev), dev, &exp_cap, errp);
+    pcie_cap_slot_hotplug_common(pdev, dev, &exp_cap, errp);
 
-    pcie_cap_slot_push_attention_button(PCI_DEVICE(hotplug_dev));
+    pcie_cap_slot_push_attention_button(pdev);
 }
 
 /* pci express slot for pci express root/downstream port
@@ -409,7 +410,7 @@  void pcie_cap_slot_write_config(PCIDevice *dev,
     }
 
     /*
-     * If the slot is polulated, power indicator is off and power
+     * If the slot is populated, power indicator is off and power
      * controller is off, it is safe to detach the devices.
      */
     if ((sltsta & PCI_EXP_SLTSTA_PDS) && (val & PCI_EXP_SLTCTL_PCC) &&
@@ -633,3 +634,199 @@  void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn)
                         offset, PCI_ARI_SIZEOF);
     pci_set_long(dev->config + offset + PCI_ARI_CAP, (nextfn & 0xff) << 8);
 }
+
+
+
+/* SR/IOV */
+void pcie_sriov_init(PCIDevice *dev, uint16_t offset,
+                     const char *vfname, uint16_t vf_dev_id,
+                     uint16_t init_vfs, uint16_t total_vfs)
+{
+    uint8_t *cfg = dev->config + offset;
+    uint8_t *wmask;
+    pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
+                        offset, PCI_EXT_CAP_SRIOV_SIZEOF);
+    dev->exp.sriov_cap = offset;
+    dev->exp.num_vfs = 0;
+    dev->exp.vfname = g_strdup(vfname);
+    dev->exp.vf = NULL;
+
+    /* set some sensible defaults - devices can override later */
+    pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, 0x1);
+    pci_set_word(cfg + PCI_SRIOV_VF_STRIDE, 0x1);
+    pci_set_word(cfg + PCI_SRIOV_SUP_PGSIZE, 0x553);
+    pci_set_word(cfg + PCI_SRIOV_SYS_PGSIZE, 0x1);
+
+    /* Set up device ID and initial/total number of VFs available */
+    pci_set_word(cfg + PCI_SRIOV_VF_DID, vf_dev_id);
+    pci_set_word(cfg + PCI_SRIOV_INITIAL_VF, init_vfs);
+    pci_set_word(cfg + PCI_SRIOV_TOTAL_VF, total_vfs);
+
+    /* Write enable control bits */
+    wmask = dev->wmask + offset;
+    pci_set_word(wmask + PCI_SRIOV_CTRL,
+                 PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE | PCI_SRIOV_CTRL_ARI);
+    pci_set_word(wmask + PCI_SRIOV_NUM_VF, 0xffff);
+
+    qdev_prop_set_bit(&dev->qdev, "multifunction", true);
+}
+
+
+void pcie_sriov_exit(PCIDevice *dev)
+{
+    PCIE_DPRINTF("\n");
+    pcie_sriov_reset_vfs(dev);
+}
+
+void pcie_sriov_init_bar(PCIDevice *dev, int region_num,
+                         uint8_t type, dma_addr_t size)
+{
+    uint32_t addr;
+    uint64_t wmask;
+    uint16_t sriov_cap = dev->exp.sriov_cap;
+
+    assert(sriov_cap > 0);
+    assert(region_num >= 0);
+    assert(region_num < PCI_NUM_REGIONS);
+    assert(region_num != PCI_ROM_SLOT);
+
+    wmask = ~(size - 1);
+    addr = sriov_cap + PCI_SRIOV_BAR + region_num * 4;
+
+    pci_set_long(dev->config + addr, type);
+    if (!(type & PCI_BASE_ADDRESS_SPACE_IO) &&
+        type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+        pci_set_quad(dev->wmask + addr, wmask);
+        pci_set_quad(dev->cmask + addr, ~0ULL);
+    } else {
+        pci_set_long(dev->wmask + addr, wmask & 0xffffffff);
+        pci_set_long(dev->cmask + addr, 0xffffffff);
+    }
+    dev->exp.vf_bar_type[region_num] = type;
+}
+
+void pcie_register_vf_bar(PCIDevice *dev, int region_num,
+                          MemoryRegion *memory)
+{
+    PCIIORegion *r;
+    uint8_t type;
+    pcibus_t size = memory_region_size(memory);
+
+    assert(dev->exp.is_vf); /* PFs must use pci_register_bar */
+    assert(region_num >= 0);
+    assert(region_num < PCI_NUM_REGIONS);
+    type = dev->exp.pf->exp.vf_bar_type[region_num];
+
+    if (size & (size-1)) {
+        fprintf(stderr, "ERROR: PCI region size must be pow2 "
+                    "type=0x%x, size=0x%"FMT_PCIBUS"\n", type, size);
+        exit(1);
+    }
+
+    r = &dev->io_regions[region_num];
+    r->memory = memory;
+    r->address_space =
+        type & PCI_BASE_ADDRESS_SPACE_IO
+        ? dev->bus->address_space_io
+        : dev->bus->address_space_mem;
+    r->size = size;
+    r->type = type;
+
+    r->addr = pci_bar_address(dev, region_num, r->type, r->size);
+    if (r->addr != PCI_BAR_UNMAPPED) {
+        memory_region_add_subregion_overlap(r->address_space,
+                                            r->addr, r->memory, 1);
+    }
+}
+
+
+static PCIDevice *pcie_create_vf(PCIDevice *pf, int devfn, const char *name)
+{
+    int ret;
+    PCIDevice *dev = pci_create(pf->bus, devfn, name);
+    dev->exp.is_vf = true;
+    dev->exp.pf = pf;
+
+    ret = qdev_init(&dev->qdev);
+    if (ret)
+        return NULL;
+
+    /* set vid/did according to sr/iov spec - they are not used */
+    pci_config_set_vendor_id(dev->config, 0xffff);
+    pci_config_set_device_id(dev->config, 0xffff);
+    return dev;
+}
+
+
+void pcie_sriov_create_vfs(PCIDevice *dev)
+{
+    uint16_t num_vfs;
+    uint16_t i;
+    int32_t devfn = dev->devfn + 1;
+    uint16_t sriov_cap = dev->exp.sriov_cap;
+
+    assert(sriov_cap > 0);
+    num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
+
+    dev->exp.vf = g_malloc(sizeof(PCIDevice*) * num_vfs);
+    assert(dev->exp.vf);
+
+    PCIE_DEV_PRINTF(dev, "creating %d vf devs\n", num_vfs);
+    for (i = 0; i < num_vfs; i++) {
+        dev->exp.vf[i] = pcie_create_vf(dev, devfn++, dev->exp.vfname);
+        if (!dev->exp.vf[i]) {
+            PCIE_DEV_PRINTF(dev, "Failed to create VF %d\n", i);
+            num_vfs = i;
+            break;
+        }
+    }
+    dev->exp.num_vfs = num_vfs;
+}
+
+
+void pcie_sriov_reset_vfs(PCIDevice *dev)
+{
+    Error *local_err = NULL;
+    uint16_t num_vfs = dev->exp.num_vfs;
+    uint16_t i;
+    PCIE_DEV_PRINTF(dev, "Resetting %d vf devs\n", num_vfs);
+    for (i = 0; i < num_vfs; i++) {
+        qdev_unplug(&dev->exp.vf[i]->qdev, &local_err);
+        if (local_err) {
+            fprintf(stderr, "Failed to unplug: %s\n",
+                    error_get_pretty(local_err));
+            error_free(local_err);
+        }
+    }
+    dev->exp.num_vfs = 0;
+}
+
+
+void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, uint32_t val, int len)
+{
+    uint32_t off;
+    uint16_t sriov_cap = dev->exp.sriov_cap;
+
+    if (!sriov_cap || address < sriov_cap) {
+        return;
+    }
+    off = address - sriov_cap;
+    if (off >= PCI_EXT_CAP_SRIOV_SIZEOF) {
+        return;
+    }
+
+    PCIE_DEV_PRINTF(dev, "cap at 0x%x sriov offset 0x%x val 0x%x len %d\n",
+                 sriov_cap, off, val, len);
+
+    if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
+        if (dev->exp.num_vfs) {
+            if (!(val & PCI_SRIOV_CTRL_VFE)) {
+                pcie_sriov_reset_vfs(dev);
+            }
+        } else {
+            if (val & PCI_SRIOV_CTRL_VFE) {
+                pcie_sriov_create_vfs(dev);
+            }
+        }
+    }
+}
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index c352c7b..d36c68d 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -11,8 +11,6 @@ 
 /* PCI includes legacy ISA access.  */
 #include "hw/isa/isa.h"
 
-#include "hw/pci/pcie.h"
-
 /* PCI bus */
 
 #define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
@@ -127,6 +125,7 @@  enum {
 #define QEMU_PCI_VGA_IO_HI_SIZE 0x20
 
 #include "hw/pci/pci_regs.h"
+#include "hw/pci/pcie.h"
 
 /* PCI HEADER_TYPE */
 #define  PCI_HEADER_TYPE_MULTI_FUNCTION 0x80
@@ -413,6 +412,9 @@  typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int);
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
 void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque);
 
+pcibus_t pci_bar_address(PCIDevice *d,
+                         int reg, uint8_t type, pcibus_t size);
+
 static inline void
 pci_set_byte(uint8_t *config, uint8_t val)
 {
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index d139d58..b34d831 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -74,6 +74,15 @@  struct PCIExpressDevice {
     /* AER */
     uint16_t aer_cap;
     PCIEAERLog aer_log;
+
+    /* SR/IOV */
+    uint16_t sriov_cap;
+    uint16_t num_vfs;           /* Number of virtual functions created */
+    bool is_vf;                 /* Set if this device is a virtual function */
+    const char *vfname;         /* Reference to the device type used for the VFs */
+    PCIDevice **vf;             /* Pointer to an array of num_vfs VF devices */
+    PCIDevice *pf;              /* Pointer back to owner physical function */
+    uint8_t vf_bar_type[PCI_NUM_REGIONS];  /* Store type for each VF bar */
 };
 
 #define COMPAT_PROP_PCP "power_controller_present"
@@ -115,6 +124,23 @@  void pcie_add_capability(PCIDevice *dev,
                          uint16_t offset, uint16_t size);
 
 void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
+void pcie_sriov_init(PCIDevice *dev, uint16_t offset,
+                     const char *vfname, uint16_t vf_dev_id,
+                     uint16_t init_vfs, uint16_t total_vfs);
+void pcie_sriov_exit(PCIDevice *dev);
+
+/* Set up a VF bar in the SR/IOV bar area */
+void pcie_sriov_init_bar(PCIDevice *s, int region_num,
+                         uint8_t type, dma_addr_t size);
+
+/* Instantiate a bar for a VF */
+void pcie_register_vf_bar(PCIDevice *pci_dev, int region_num,
+                          MemoryRegion *memory);
+
+void pcie_sriov_create_vfs(PCIDevice *dev);
+void pcie_sriov_reset_vfs(PCIDevice *dev);
+void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
+                             uint32_t val, int len);
 
 extern const VMStateDescription vmstate_pcie_device;