Patchwork [RFC,44/45] pci-assign: Use generic MSI-X support

login
register
mail settings
Submitter Jan Kiszka
Date Oct. 17, 2011, 9:28 a.m.
Message ID <7f5051323d1383595d2131d7934044162681a703.1318843694.git.jan.kiszka@siemens.com>
Download mbox | patch
Permalink /patch/120182/
State New
Headers show

Comments

Jan Kiszka - Oct. 17, 2011, 9:28 a.m.
Switch MSI-X support of the device assignment core to the generic layer
QEMU offers. As for legacy MSI, we use config notifiers to update IRQ
assignment and routes on guest changes. Quite a bit of code becomes
obsolete in the device assignment core, e.g. the maintenance of the MSI-X
vector masking MMIO page. Note that we have to reorder BAR mapping and
capability initialization in order to pass the BAR container to
msix_init.

Also in this case we still do not support per-vector masking even after
these changes.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 hw/device-assignment.c |  335 +++++++++++++-----------------------------------
 hw/device-assignment.h |   14 +--
 2 files changed, 88 insertions(+), 261 deletions(-)

Patch

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 10b30a3..df554b3 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -24,6 +24,7 @@ 
  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ *  Copyright (C) 2011, Siemens AG, Jan Kiszka (jan.kiszka@siemens.com)
  */
 #include <stdio.h>
 #include <unistd.h>
@@ -41,6 +42,7 @@ 
 #include "range.h"
 #include "sysemu.h"
 #include "msi.h"
+#include "msix.h"
 
 #define MSIX_PAGE_SIZE 0x1000
 
@@ -64,8 +66,6 @@ 
 
 static void assigned_dev_load_option_rom(AssignedDevice *dev);
 
-static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
-
 static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
                                        uint32_t addr, int len, uint32_t *val)
 {
@@ -238,24 +238,11 @@  static void assigned_dev_iomem_setup(PCIDevice *pci_dev, int region_num,
 {
     AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
     AssignedDevRegion *region = &r_dev->v_addrs[region_num];
-    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
 
     if (e_size > 0) {
         memory_region_init(&region->container, "assigned-dev-container",
                            e_size);
         memory_region_add_subregion(&region->container, 0, &region->real_iomem);
-
-        /* deal with MSI-X MMIO page */
-        if (real_region->base_addr <= r_dev->msix_table_addr &&
-                real_region->base_addr + real_region->size >
-                r_dev->msix_table_addr) {
-            int offset = r_dev->msix_table_addr - real_region->base_addr;
-
-            memory_region_add_subregion_overlap(&region->container,
-                                                offset,
-                                                &r_dev->mmio,
-                                                1);
-        }
     }
 }
 
@@ -648,21 +635,20 @@  again:
 
 static QLIST_HEAD(, AssignedDevice) devs = QLIST_HEAD_INITIALIZER(devs);
 
-static void invalidate_msix_vectors(AssignedDevice *dev)
-{
-    int i;
-
-    for (i = 0; i < dev->irq_entries_nr; i++) {
-        kvm_msi_cache_invalidate(&dev->dev.msix_cache[i]);
-    }
-}
-
 static void free_assigned_device(AssignedDevice *dev)
 {
+    uint32_t table_bar_nr, pba_bar_nr;
+    uint8_t *msix_cap;
     int i;
 
-    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
-        assigned_dev_unregister_msix_mmio(dev);
+    if (msix_present(&dev->dev)) {
+        msix_cap = dev->dev.config + dev->dev.msix_cap;
+        table_bar_nr = pci_get_long(msix_cap + PCI_MSIX_TABLE) &
+            PCI_MSIX_FLAGS_BIRMASK;
+        pba_bar_nr = pci_get_long(msix_cap + PCI_MSIX_PBA) &
+            PCI_MSIX_FLAGS_BIRMASK;
+        msix_uninit(&dev->dev, &dev->v_addrs[table_bar_nr].container,
+                    &dev->v_addrs[pba_bar_nr].container);
     }
     for (i = 0; i < dev->real_device.region_number; i++) {
         PCIRegion *pci_region = &dev->real_device.regions[i];
@@ -698,9 +684,6 @@  static void free_assigned_device(AssignedDevice *dev)
     if (dev->real_device.config_fd >= 0) {
         close(dev->real_device.config_fd);
     }
-
-    invalidate_msix_vectors(dev);
-    g_free(dev->dev.msix_cache);
 }
 
 static uint32_t calc_assigned_dev_id(AssignedDevice *dev)
@@ -916,11 +899,13 @@  void assigned_dev_update_irqs(void)
     }
 }
 
+/* used for both MSI and MSI-X */
 static void assigned_dev_update_msi(PCIDevice *pci_dev, bool enabled)
 {
     AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
 
     if (!enabled) {
+        dev->msix_vectors_in_use = 0;
         assign_intx(dev);
     }
 }
@@ -945,113 +930,66 @@  static int assigned_dev_update_msi_vector(PCIDevice *pci_dev,
     return 0;
 }
 
-static int assigned_dev_set_msix_vectors(PCIDevice *pci_dev)
+static int assigned_dev_update_msix_vector(PCIDevice *pci_dev,
+                                           unsigned int vector,
+                                           MSIMessage *msg, bool masked)
 {
-    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
-    uint16_t entries_nr = 0, entries_max_nr;
-    void *msix_page = adev->msix_table_page;
+    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    MSIRoutingCache *cache;
     uint32_t dev_id;
-    MSIMessage msg;
-    int pos, i, r;
-
-    assert(adev->irq_entries_nr == 0);
-
-    pos = pci_find_capability(pci_dev, PCI_CAP_ID_MSIX);
+    unsigned int i;
+    int ret = 0;
 
-    entries_max_nr = pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS);
-    entries_max_nr &= PCI_MSIX_FLAGS_QSIZE;
-    entries_max_nr += 1;
+    if (!masked) {
+        dev_id = calc_assigned_dev_id(dev);
 
-    /* Get the usable entry number for allocating */
-    for (i = 0; i < entries_max_nr; i++) {
         /* Assuming IA-32 MSI message format:
          * Ignore unused entry (invalid vector) */
-        if (pci_get_long(msix_page + i * PCI_MSIX_ENTRY_SIZE +
-                         PCI_MSIX_ENTRY_DATA) == 0) {
-            continue;
+        if (msg->data == 0) {
+            if (pci_dev->msix_cache[vector].type == MSI_ROUTE_NONE) {
+                return ret;
+            }
+            dev->msix_vectors_in_use--;
+            deassign_irq(dev);
+            kvm_msi_cache_invalidate(&pci_dev->msix_cache[vector]);
+        } else {
+            if (pci_dev->msix_cache[vector].type != MSI_ROUTE_NONE) {
+                ret = kvm_device_msix_set_vector(kvm_state, dev_id,
+                                                 vector, msg,
+                                                 &pci_dev->msix_cache[vector]);
+                return ret;
+            }
+            dev->msix_vectors_in_use++;
+            deassign_irq(dev);
         }
-        entries_nr++;
-    }
-    if (entries_nr == 0) {
-        fprintf(stderr, "MSI-X entry number is zero!\n");
-        return -EINVAL;
-    }
 
-    dev_id = calc_assigned_dev_id(adev);
-
-    r = kvm_device_msix_init_vectors(kvm_state, dev_id, entries_nr);
-    if (r < 0) {
-        return r;
-    }
-    pci_dev->msix_cache = g_malloc0(entries_nr * sizeof(MSIRoutingCache));
-    adev->irq_entries_nr = entries_nr;
-
-    for (i = 0; i < entries_max_nr; i++) {
-        if (entries_nr == 0) {
-            break;
-        }
-        msg.data = pci_get_long(msix_page + i * PCI_MSIX_ENTRY_SIZE +
-                                PCI_MSIX_ENTRY_DATA);
-        if (msg.data == 0) {
-            continue;
+        ret = kvm_device_msix_init_vectors(kvm_state, dev_id,
+                                           dev->msix_vectors_in_use);
+        if (ret < 0) {
+            return ret;
         }
-        msg.address = pci_get_quad(msix_page + i * PCI_MSIX_ENTRY_SIZE +
-                                   PCI_MSIX_ENTRY_LOWER_ADDR);
 
-        r = kvm_device_msix_set_vector(kvm_state, dev_id, i, &msg,
-                                       &pci_dev->msix_cache[i]);
-        if (r < 0) {
-            return r;
+        for (i = 0; i < pci_dev->msix_entries_nr; i++) {
+            cache = &pci_dev->msix_cache[i];
+            if (i != vector && cache->type == MSI_ROUTE_NONE) {
+                continue;
+            }
+            ret = kvm_device_msix_set_vector(kvm_state, dev_id, i,
+                                             i == vector ? msg : &cache->msg,
+                                             cache);
+            if (ret < 0) {
+                return ret;
+            }
         }
-        entries_nr--;
-    }
-
-    return 0;
-}
-
-static void assigned_dev_update_msix(PCIDevice *pci_dev)
-{
-    AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
-    uint16_t ctrl_word = pci_get_word(pci_dev->config + pci_dev->msix_cap +
-                                      PCI_MSIX_FLAGS);
-    uint32_t dev_id;
-    int r;
 
-    dev_id = calc_assigned_dev_id(assigned_dev);
-
-    /* Some guests gratuitously disable MSIX even if they're not using it,
-     * try to catch this by only deassigning irqs if the guest is using
-     * MSIX or intends to start. */
-    if ((assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSIX) ||
-        (ctrl_word & PCI_MSIX_FLAGS_ENABLE)) {
-        invalidate_msix_vectors(assigned_dev);
-        g_free(pci_dev->msix_cache);
-        assigned_dev->irq_entries_nr = 0;
-
-        r = kvm_device_irq_deassign(kvm_state, dev_id,
-                                    assigned_dev->irq_requested_type);
-        /* -ENXIO means no assigned irq */
-        if (r && r != -ENXIO)
-            perror("assigned_dev_update_msix: deassign irq");
-
-        assigned_dev->irq_requested_type = 0;
-    }
-
-    if (ctrl_word & PCI_MSIX_FLAGS_ENABLE) {
-        if (assigned_dev_set_msix_vectors(pci_dev) < 0) {
-            perror("assigned_dev_update_msix_mmio");
-            return;
-        }
-        if (kvm_device_msix_assign(kvm_state, dev_id) < 0) {
-            perror("assigned_dev_enable_msix: assign irq");
-            return;
+        ret = kvm_device_msix_assign(kvm_state, dev_id);
+        if (ret < 0) {
+            return ret;
         }
-        assigned_dev->girq = -1;
-        assigned_dev->irq_requested_type = KVM_DEV_IRQ_HOST_MSIX |
-                                           KVM_DEV_IRQ_GUEST_MSIX;
-    } else {
-        assign_intx(assigned_dev);
+        dev->irq_requested_type =
+            KVM_DEV_IRQ_HOST_MSIX | KVM_DEV_IRQ_GUEST_MSIX;
     }
+    return ret;
 }
 
 static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
@@ -1083,13 +1021,6 @@  static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
 
     pci_default_write_config(pci_dev, address, val, len);
 
-    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
-        if (range_covers_byte(address, len,
-                              pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)) {
-            assigned_dev_update_msix(pci_dev);
-        }
-    }
-
     emulate_mask = 0;
     memcpy(&emulate_mask, assigned_dev->emulate_config_write + address, len);
     emulate_mask = le32_to_cpu(emulate_mask);
@@ -1115,7 +1046,6 @@  static void assigned_dev_setup_cap_read(AssignedDevice *dev, uint32_t offset,
 static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
 {
     AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
-    PCIRegion *pci_region = dev->real_device.regions;
     int ret, pos;
 
     /* Clear initial capabilities pointer and status copied from hw */
@@ -1145,27 +1075,31 @@  static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
     /* Expose MSI-X capability */
     pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0);
     if (pos != 0 && kvm_device_msix_supported(kvm_state)) {
-        int bar_nr;
-        uint32_t msix_table_entry;
-
-        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX;
-        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSIX, pos, 12)) < 0) {
+        unsigned int table_bar_nr, pba_bar_nr;
+        uint32_t table_offset, pba_offset;
+        uint16_t nentries;
+
+        nentries = (pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
+                    PCI_MSIX_FLAGS_QSIZE) + 1;
+        table_offset = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE);
+        table_bar_nr = table_offset & PCI_MSIX_FLAGS_BIRMASK;
+        table_offset &= ~PCI_MSIX_FLAGS_BIRMASK;
+        pba_offset = pci_get_long(pci_dev->config + pos + PCI_MSIX_PBA);
+        pba_bar_nr = pba_offset & PCI_MSIX_FLAGS_BIRMASK;
+        pba_offset &= ~PCI_MSIX_FLAGS_BIRMASK;
+
+        ret = msix_init(pci_dev, pos, nentries,
+                        &dev->v_addrs[table_bar_nr].container, table_bar_nr,
+                        table_offset, &dev->v_addrs[pba_bar_nr].container,
+                        pba_bar_nr, pba_offset);
+        if (ret < 0) {
+            return ret;
+        }
+        ret = msix_set_config_notifiers(pci_dev, assigned_dev_update_msi,
+                                        assigned_dev_update_msix_vector);
+        if (ret < 0) {
             return ret;
         }
-        pci_dev->msix_cap = pos;
-
-        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS,
-                     pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
-                     PCI_MSIX_FLAGS_QSIZE);
-
-        /* Only enable and function mask bits are writable */
-        pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS,
-                     PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
-
-        msix_table_entry = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE);
-        bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK;
-        msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK;
-        dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
     }
 
     /* Minimal PM support, nothing writable, device appears to NAK changes */
@@ -1378,94 +1312,6 @@  static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
     return 0;
 }
 
-static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
-{
-    AssignedDevice *adev = opaque;
-    unsigned int offset = addr & 0xfff;
-    void *page = adev->msix_table_page;
-    uint32_t val = 0;
-
-    memcpy(&val, (void *)((char *)page + offset), 4);
-
-    return val;
-}
-
-static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr)
-{
-    return ((msix_mmio_readl(opaque, addr & ~3)) >>
-            (8 * (addr & 3))) & 0xff;
-}
-
-static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr)
-{
-    return ((msix_mmio_readl(opaque, addr & ~3)) >>
-            (8 * (addr & 3))) & 0xffff;
-}
-
-static void msix_mmio_writel(void *opaque,
-                             target_phys_addr_t addr, uint32_t val)
-{
-    AssignedDevice *adev = opaque;
-    unsigned int offset = addr & 0xfff;
-    void *page = adev->msix_table_page;
-
-    DEBUG("write to MSI-X entry table mmio offset 0x%lx, val 0x%x\n",
-		    addr, val);
-    memcpy((void *)((char *)page + offset), &val, 4);
-}
-
-static void msix_mmio_writew(void *opaque,
-                             target_phys_addr_t addr, uint32_t val)
-{
-    msix_mmio_writel(opaque, addr & ~3,
-                     (val & 0xffff) << (8*(addr & 3)));
-}
-
-static void msix_mmio_writeb(void *opaque,
-                             target_phys_addr_t addr, uint32_t val)
-{
-    msix_mmio_writel(opaque, addr & ~3,
-                     (val & 0xff) << (8*(addr & 3)));
-}
-
-static const MemoryRegionOps msix_mmio_ops = {
-    .old_mmio = {
-        .read = { msix_mmio_readb, msix_mmio_readw, msix_mmio_readl, },
-        .write = { msix_mmio_writeb, msix_mmio_writew, msix_mmio_writel, },
-    },
-    .endianness = DEVICE_NATIVE_ENDIAN,
-};
-
-static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
-{
-    dev->msix_table_page = mmap(NULL, 0x1000,
-                                PROT_READ|PROT_WRITE,
-                                MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
-    if (dev->msix_table_page == MAP_FAILED) {
-        fprintf(stderr, "fail allocate msix_table_page! %s\n",
-                strerror(errno));
-        return -EFAULT;
-    }
-    memset(dev->msix_table_page, 0, 0x1000);
-    memory_region_init_io(&dev->mmio, &msix_mmio_ops, dev,
-                          "assigned-dev-msix", MSIX_PAGE_SIZE);
-    return 0;
-}
-
-static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
-{
-    if (!dev->msix_table_page)
-        return;
-
-    memory_region_destroy(&dev->mmio);
-
-    if (munmap(dev->msix_table_page, 0x1000) == -1) {
-        fprintf(stderr, "error unmapping msix_table_page! %s\n",
-                strerror(errno));
-    }
-    dev->msix_table_page = NULL;
-}
-
 static const VMStateDescription vmstate_assigned_device = {
     .name = "pci-assign",
     .unmigratable = 1,
@@ -1548,23 +1394,16 @@  static int assigned_initfn(struct PCIDevice *pci_dev)
         goto out;
     }
 
-    if (assigned_device_pci_cap_init(pci_dev) < 0) {
-        goto out;
-    }
-
-    /* intercept MSI-X entry page in the MMIO */
-    if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
-        if (assigned_dev_register_msix_mmio(dev)) {
-            goto out;
-        }
-    }
-
     /* handle real device's MMIO/PIO BARs */
     if (assigned_dev_register_regions(dev->real_device.regions,
                                       dev->real_device.region_number,
                                       dev))
         goto out;
 
+    if (assigned_device_pci_cap_init(pci_dev) < 0) {
+        goto out;
+    }
+
     /* handle interrupt routing */
     e_intx = dev->dev.config[0x3d] - 1;
     dev->intpin = e_intx;
diff --git a/hw/device-assignment.h b/hw/device-assignment.h
index 4b67f14..c41ea33 100644
--- a/hw/device-assignment.h
+++ b/hw/device-assignment.h
@@ -95,21 +95,9 @@  typedef struct AssignedDevice {
     uint8_t h_devfn;
     int irq_requested_type;
     int bound;
-    struct {
-#define ASSIGNED_DEVICE_CAP_MSI (1 << 0)
-#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1)
-        uint32_t available;
-#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0)
-#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1)
-#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2)
-        uint32_t state;
-    } cap;
     uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE];
     uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE];
-    int irq_entries_nr;
-    void *msix_table_page;
-    target_phys_addr_t msix_table_addr;
-    MemoryRegion mmio;
+    unsigned int msix_vectors_in_use;
     char *configfd_name;
     int32_t bootindex;
     QLIST_ENTRY(AssignedDevice) next;