From patchwork Mon Oct 17 09:28:18 2011 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jan Kiszka X-Patchwork-Id: 120182 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [140.186.70.17]) (using TLSv1 with cipher AES256-SHA (256/256 bits)) (Client did not present a certificate) by ozlabs.org (Postfix) with ESMTPS id C57CCB6F99 for ; Mon, 17 Oct 2011 22:33:25 +1100 (EST) Received: from localhost ([::1]:54908 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1RFjXG-0001DH-B6 for incoming@patchwork.ozlabs.org; Mon, 17 Oct 2011 05:31:06 -0400 Received: from eggs.gnu.org ([140.186.70.92]:33295) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1RFjV7-0005z0-0T for qemu-devel@nongnu.org; Mon, 17 Oct 2011 05:28:57 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1RFjUo-0004P8-3a for qemu-devel@nongnu.org; Mon, 17 Oct 2011 05:28:52 -0400 Received: from david.siemens.de ([192.35.17.14]:19152) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1RFjUn-0004NC-Gz for qemu-devel@nongnu.org; Mon, 17 Oct 2011 05:28:34 -0400 Received: from mail1.siemens.de (localhost [127.0.0.1]) by david.siemens.de (8.13.6/8.13.6) with ESMTP id p9H9SWUE023977; Mon, 17 Oct 2011 11:28:32 +0200 Received: from mchn199C.mchp.siemens.de ([139.25.109.49]) by mail1.siemens.de (8.13.6/8.13.6) with ESMTP id p9H9SKW0023511; Mon, 17 Oct 2011 11:28:32 +0200 From: Jan Kiszka To: Avi Kivity , Marcelo Tosatti Date: Mon, 17 Oct 2011 11:28:18 +0200 Message-Id: <7f5051323d1383595d2131d7934044162681a703.1318843694.git.jan.kiszka@siemens.com> X-Mailer: git-send-email 1.7.3.4 In-Reply-To: References: In-Reply-To: References: X-detected-operating-system: by eggs.gnu.org: GNU/Linux 2.6, seldom 2.4 (older, 4) X-Received-From: 192.35.17.14 Cc: Alex Williamson 
, qemu-devel@nongnu.org, kvm@vger.kernel.org, "Michael S. Tsirkin" Subject: [Qemu-devel] [RFC][PATCH 44/45] pci-assign: Use generic MSI-X support X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Switch MSI-X support of the device assignment core to the generic layer QEMU offers. As for legacy MSI, we use config notifiers to update IRQ assignment and routes on guest changes. Quite a bit of code becomes obsolete in the device assignment core, e.g. the maintenance of the MSI-X vector masking MMIO page. Note that we have to reorder BAR mapping and capability initialization in order to pass the BAR container on msix_init. Also in this case we still do not support per-vector masking even after these changes. Signed-off-by: Jan Kiszka --- hw/device-assignment.c | 335 +++++++++++++----------------------------------- hw/device-assignment.h | 14 +-- 2 files changed, 88 insertions(+), 261 deletions(-) diff --git a/hw/device-assignment.c b/hw/device-assignment.c index 10b30a3..df554b3 100644 --- a/hw/device-assignment.c +++ b/hw/device-assignment.c @@ -24,6 +24,7 @@ * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + * Copyright (C) 2011, Siemens AG, Jan Kiszka (jan.kiszka@siemens.com) */ #include #include @@ -41,6 +42,7 @@ #include "range.h" #include "sysemu.h" #include "msi.h" +#include "msix.h" #define MSIX_PAGE_SIZE 0x1000 @@ -64,8 +66,6 @@ static void assigned_dev_load_option_rom(AssignedDevice *dev); -static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev); - static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region, uint32_t addr, int len, uint32_t *val) { @@ -238,24 
+238,11 @@ static void assigned_dev_iomem_setup(PCIDevice *pci_dev, int region_num, { AssignedDevice *r_dev = DO_UPCAST(AssignedDevice, dev, pci_dev); AssignedDevRegion *region = &r_dev->v_addrs[region_num]; - PCIRegion *real_region = &r_dev->real_device.regions[region_num]; if (e_size > 0) { memory_region_init(&region->container, "assigned-dev-container", e_size); memory_region_add_subregion(&region->container, 0, &region->real_iomem); - - /* deal with MSI-X MMIO page */ - if (real_region->base_addr <= r_dev->msix_table_addr && - real_region->base_addr + real_region->size > - r_dev->msix_table_addr) { - int offset = r_dev->msix_table_addr - real_region->base_addr; - - memory_region_add_subregion_overlap(&region->container, - offset, - &r_dev->mmio, - 1); - } } } @@ -648,21 +635,20 @@ again: static QLIST_HEAD(, AssignedDevice) devs = QLIST_HEAD_INITIALIZER(devs); -static void invalidate_msix_vectors(AssignedDevice *dev) -{ - int i; - - for (i = 0; i < dev->irq_entries_nr; i++) { - kvm_msi_cache_invalidate(&dev->dev.msix_cache[i]); - } -} - static void free_assigned_device(AssignedDevice *dev) { + uint32_t table_bar_nr, pba_bar_nr; + uint8_t *msix_cap; int i; - if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) { - assigned_dev_unregister_msix_mmio(dev); + if (msix_present(&dev->dev)) { + msix_cap = dev->dev.config + dev->dev.msix_cap; + table_bar_nr = pci_get_long(msix_cap + PCI_MSIX_TABLE) & + PCI_MSIX_FLAGS_BIRMASK; + pba_bar_nr = pci_get_long(msix_cap + PCI_MSIX_PBA) & + PCI_MSIX_FLAGS_BIRMASK; + msix_uninit(&dev->dev, &dev->v_addrs[table_bar_nr].container, + &dev->v_addrs[pba_bar_nr].container); } for (i = 0; i < dev->real_device.region_number; i++) { PCIRegion *pci_region = &dev->real_device.regions[i]; @@ -698,9 +684,6 @@ static void free_assigned_device(AssignedDevice *dev) if (dev->real_device.config_fd >= 0) { close(dev->real_device.config_fd); } - - invalidate_msix_vectors(dev); - g_free(dev->dev.msix_cache); } static uint32_t calc_assigned_dev_id(AssignedDevice 
*dev) @@ -916,11 +899,13 @@ void assigned_dev_update_irqs(void) } } +/* used for both MSI and MSI-X */ static void assigned_dev_update_msi(PCIDevice *pci_dev, bool enabled) { AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev); if (!enabled) { + dev->msix_vectors_in_use = 0; assign_intx(dev); } } @@ -945,113 +930,66 @@ static int assigned_dev_update_msi_vector(PCIDevice *pci_dev, return 0; } -static int assigned_dev_set_msix_vectors(PCIDevice *pci_dev) +static int assigned_dev_update_msix_vector(PCIDevice *pci_dev, + unsigned int vector, + MSIMessage *msg, bool masked) { - AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev); - uint16_t entries_nr = 0, entries_max_nr; - void *msix_page = adev->msix_table_page; + AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + MSIRoutingCache *cache; uint32_t dev_id; - MSIMessage msg; - int pos, i, r; - - assert(adev->irq_entries_nr == 0); - - pos = pci_find_capability(pci_dev, PCI_CAP_ID_MSIX); + unsigned int i; + int ret = 0; - entries_max_nr = pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS); - entries_max_nr &= PCI_MSIX_FLAGS_QSIZE; - entries_max_nr += 1; + if (!masked) { + dev_id = calc_assigned_dev_id(dev); - /* Get the usable entry number for allocating */ - for (i = 0; i < entries_max_nr; i++) { /* Assuming IA-32 MSI message format: * Ignore unused entry (invalid vector) */ - if (pci_get_long(msix_page + i * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA) == 0) { - continue; + if (msg->data == 0) { + if (pci_dev->msix_cache[vector].type == MSI_ROUTE_NONE) { + return ret; + } + dev->msix_vectors_in_use--; + deassign_irq(dev); + kvm_msi_cache_invalidate(&pci_dev->msix_cache[vector]); + } else { + if (pci_dev->msix_cache[vector].type != MSI_ROUTE_NONE) { + ret = kvm_device_msix_set_vector(kvm_state, dev_id, + vector, msg, + &pci_dev->msix_cache[vector]); + return ret; + } + dev->msix_vectors_in_use++; + deassign_irq(dev); } - entries_nr++; - } - if (entries_nr == 0) { - fprintf(stderr, 
"MSI-X entry number is zero!\n"); - return -EINVAL; - } - dev_id = calc_assigned_dev_id(adev); - - r = kvm_device_msix_init_vectors(kvm_state, dev_id, entries_nr); - if (r < 0) { - return r; - } - pci_dev->msix_cache = g_malloc0(entries_nr * sizeof(MSIRoutingCache)); - adev->irq_entries_nr = entries_nr; - - for (i = 0; i < entries_max_nr; i++) { - if (entries_nr == 0) { - break; - } - msg.data = pci_get_long(msix_page + i * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA); - if (msg.data == 0) { - continue; + ret = kvm_device_msix_init_vectors(kvm_state, dev_id, + dev->msix_vectors_in_use); + if (ret < 0) { + return ret; } - msg.address = pci_get_quad(msix_page + i * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR); - r = kvm_device_msix_set_vector(kvm_state, dev_id, i, &msg, - &pci_dev->msix_cache[i]); - if (r < 0) { - return r; + for (i = 0; i < pci_dev->msix_entries_nr; i++) { + cache = &pci_dev->msix_cache[i]; + if (i != vector && cache->type == MSI_ROUTE_NONE) { + continue; + } + ret = kvm_device_msix_set_vector(kvm_state, dev_id, i, + i == vector ? msg : &cache->msg, + cache); + if (ret < 0) { + return ret; + } } - entries_nr--; - } - - return 0; -} - -static void assigned_dev_update_msix(PCIDevice *pci_dev) -{ - AssignedDevice *assigned_dev = DO_UPCAST(AssignedDevice, dev, pci_dev); - uint16_t ctrl_word = pci_get_word(pci_dev->config + pci_dev->msix_cap + - PCI_MSIX_FLAGS); - uint32_t dev_id; - int r; - dev_id = calc_assigned_dev_id(assigned_dev); - - /* Some guests gratuitously disable MSIX even if they're not using it, - * try to catch this by only deassigning irqs if the guest is using - * MSIX or intends to start. 
*/ - if ((assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSIX) || - (ctrl_word & PCI_MSIX_FLAGS_ENABLE)) { - invalidate_msix_vectors(assigned_dev); - g_free(pci_dev->msix_cache); - assigned_dev->irq_entries_nr = 0; - - r = kvm_device_irq_deassign(kvm_state, dev_id, - assigned_dev->irq_requested_type); - /* -ENXIO means no assigned irq */ - if (r && r != -ENXIO) - perror("assigned_dev_update_msix: deassign irq"); - - assigned_dev->irq_requested_type = 0; - } - - if (ctrl_word & PCI_MSIX_FLAGS_ENABLE) { - if (assigned_dev_set_msix_vectors(pci_dev) < 0) { - perror("assigned_dev_update_msix_mmio"); - return; - } - if (kvm_device_msix_assign(kvm_state, dev_id) < 0) { - perror("assigned_dev_enable_msix: assign irq"); - return; + ret = kvm_device_msix_assign(kvm_state, dev_id); + if (ret < 0) { + return ret; } - assigned_dev->girq = -1; - assigned_dev->irq_requested_type = KVM_DEV_IRQ_HOST_MSIX | - KVM_DEV_IRQ_GUEST_MSIX; - } else { - assign_intx(assigned_dev); + dev->irq_requested_type = + KVM_DEV_IRQ_HOST_MSIX | KVM_DEV_IRQ_GUEST_MSIX; } + return ret; } static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev, @@ -1083,13 +1021,6 @@ static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address, pci_default_write_config(pci_dev, address, val, len); - if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) { - if (range_covers_byte(address, len, - pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)) { - assigned_dev_update_msix(pci_dev); - } - } - emulate_mask = 0; memcpy(&emulate_mask, assigned_dev->emulate_config_write + address, len); emulate_mask = le32_to_cpu(emulate_mask); @@ -1115,7 +1046,6 @@ static void assigned_dev_setup_cap_read(AssignedDevice *dev, uint32_t offset, static int assigned_device_pci_cap_init(PCIDevice *pci_dev) { AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev); - PCIRegion *pci_region = dev->real_device.regions; int ret, pos; /* Clear initial capabilities pointer and status copied from hw */ @@ -1145,27 +1075,31 
@@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev) /* Expose MSI-X capability */ pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0); if (pos != 0 && kvm_device_msix_supported(kvm_state)) { - int bar_nr; - uint32_t msix_table_entry; - - dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX; - if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSIX, pos, 12)) < 0) { + unsigned int table_bar_nr, pba_bar_nr; + uint32_t table_offset, pba_offset; + uint16_t nentries; + + nentries = (pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) & + PCI_MSIX_FLAGS_QSIZE) + 1; + table_offset = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE); + table_bar_nr = table_offset & PCI_MSIX_FLAGS_BIRMASK; + table_offset &= ~PCI_MSIX_FLAGS_BIRMASK; + pba_offset = pci_get_long(pci_dev->config + pos + PCI_MSIX_PBA); + pba_bar_nr = pba_offset & PCI_MSIX_FLAGS_BIRMASK; + pba_offset &= ~PCI_MSIX_FLAGS_BIRMASK; + + ret = msix_init(pci_dev, pos, nentries, + &dev->v_addrs[table_bar_nr].container, table_bar_nr, + table_offset, &dev->v_addrs[pba_bar_nr].container, + pba_bar_nr, pba_offset); + if (ret < 0) { + return ret; + } + ret = msix_set_config_notifiers(pci_dev, assigned_dev_update_msi, + assigned_dev_update_msix_vector); + if (ret < 0) { return ret; } - pci_dev->msix_cap = pos; - - pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS, - pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) & - PCI_MSIX_FLAGS_QSIZE); - - /* Only enable and function mask bits are writable */ - pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS, - PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL); - - msix_table_entry = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE); - bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK; - msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK; - dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry; } /* Minimal PM support, nothing writable, device appears to NAK changes */ @@ -1378,94 +1312,6 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev) return 
0; } -static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr) -{ - AssignedDevice *adev = opaque; - unsigned int offset = addr & 0xfff; - void *page = adev->msix_table_page; - uint32_t val = 0; - - memcpy(&val, (void *)((char *)page + offset), 4); - - return val; -} - -static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr) -{ - return ((msix_mmio_readl(opaque, addr & ~3)) >> - (8 * (addr & 3))) & 0xff; -} - -static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr) -{ - return ((msix_mmio_readl(opaque, addr & ~3)) >> - (8 * (addr & 3))) & 0xffff; -} - -static void msix_mmio_writel(void *opaque, - target_phys_addr_t addr, uint32_t val) -{ - AssignedDevice *adev = opaque; - unsigned int offset = addr & 0xfff; - void *page = adev->msix_table_page; - - DEBUG("write to MSI-X entry table mmio offset 0x%lx, val 0x%x\n", - addr, val); - memcpy((void *)((char *)page + offset), &val, 4); -} - -static void msix_mmio_writew(void *opaque, - target_phys_addr_t addr, uint32_t val) -{ - msix_mmio_writel(opaque, addr & ~3, - (val & 0xffff) << (8*(addr & 3))); -} - -static void msix_mmio_writeb(void *opaque, - target_phys_addr_t addr, uint32_t val) -{ - msix_mmio_writel(opaque, addr & ~3, - (val & 0xff) << (8*(addr & 3))); -} - -static const MemoryRegionOps msix_mmio_ops = { - .old_mmio = { - .read = { msix_mmio_readb, msix_mmio_readw, msix_mmio_readl, }, - .write = { msix_mmio_writeb, msix_mmio_writew, msix_mmio_writel, }, - }, - .endianness = DEVICE_NATIVE_ENDIAN, -}; - -static int assigned_dev_register_msix_mmio(AssignedDevice *dev) -{ - dev->msix_table_page = mmap(NULL, 0x1000, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); - if (dev->msix_table_page == MAP_FAILED) { - fprintf(stderr, "fail allocate msix_table_page! 
%s\n", - strerror(errno)); - return -EFAULT; - } - memset(dev->msix_table_page, 0, 0x1000); - memory_region_init_io(&dev->mmio, &msix_mmio_ops, dev, - "assigned-dev-msix", MSIX_PAGE_SIZE); - return 0; -} - -static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev) -{ - if (!dev->msix_table_page) - return; - - memory_region_destroy(&dev->mmio); - - if (munmap(dev->msix_table_page, 0x1000) == -1) { - fprintf(stderr, "error unmapping msix_table_page! %s\n", - strerror(errno)); - } - dev->msix_table_page = NULL; -} - static const VMStateDescription vmstate_assigned_device = { .name = "pci-assign", .unmigratable = 1, @@ -1548,23 +1394,16 @@ static int assigned_initfn(struct PCIDevice *pci_dev) goto out; } - if (assigned_device_pci_cap_init(pci_dev) < 0) { - goto out; - } - - /* intercept MSI-X entry page in the MMIO */ - if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) { - if (assigned_dev_register_msix_mmio(dev)) { - goto out; - } - } - /* handle real device's MMIO/PIO BARs */ if (assigned_dev_register_regions(dev->real_device.regions, dev->real_device.region_number, dev)) goto out; + if (assigned_device_pci_cap_init(pci_dev) < 0) { + goto out; + } + /* handle interrupt routing */ e_intx = dev->dev.config[0x3d] - 1; dev->intpin = e_intx; diff --git a/hw/device-assignment.h b/hw/device-assignment.h index 4b67f14..c41ea33 100644 --- a/hw/device-assignment.h +++ b/hw/device-assignment.h @@ -95,21 +95,9 @@ typedef struct AssignedDevice { uint8_t h_devfn; int irq_requested_type; int bound; - struct { -#define ASSIGNED_DEVICE_CAP_MSI (1 << 0) -#define ASSIGNED_DEVICE_CAP_MSIX (1 << 1) - uint32_t available; -#define ASSIGNED_DEVICE_MSI_ENABLED (1 << 0) -#define ASSIGNED_DEVICE_MSIX_ENABLED (1 << 1) -#define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2) - uint32_t state; - } cap; uint8_t emulate_config_read[PCI_CONFIG_SPACE_SIZE]; uint8_t emulate_config_write[PCI_CONFIG_SPACE_SIZE]; - int irq_entries_nr; - void *msix_table_page; - target_phys_addr_t msix_table_addr; - 
MemoryRegion mmio; + unsigned int msix_vectors_in_use; char *configfd_name; int32_t bootindex; QLIST_ENTRY(AssignedDevice) next;