Message ID | 20170309102849.1db72ad5@nial.brq.redhat.com |
---|---|
State | New |
Headers | show |
On 09/03/2017 10:28, Igor Mammedov wrote: > On Thu, 9 Mar 2017 10:32:44 +0800 > Jason Wang <jasowang@redhat.com> wrote: > >> On 2017年03月09日 00:40, Igor Mammedov wrote: >>> On Tue, 7 Mar 2017 14:47:30 +0200 >>> Marcel Apfelbaum<marcel@redhat.com> wrote: >>> >>>> On 03/07/2017 11:09 AM, Jason Wang wrote: >>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU >>>>> after caching ring translations"), IOMMU was required to be created in >>>>> advance. This is because we can only get the correct dma_as after pci >>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and >>>>> inconvenient for user. This patch releases this by: >>>>> >>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger >>>>> this during pci_init_bus_master >>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register >>>>> the memory listener to the new dma_as > > Instead of trying to fix up later it's possible to refuse > adding iommu device if other devices has been added before > it with -device/device_add. > That would match current CLI semantics where device that > others depend on should be listed on CLI before that others > are listed. I like this a lot. Can we add it to "Future incompatible changes" in the 2.9 changelog? Paolo > A bit hackish but something along this lines: > > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h > index 9349acb..9d8ecc6 100644 > --- a/include/hw/pci/pci.h > +++ b/include/hw/pci/pci.h > @@ -460,7 +460,7 @@ void pci_device_deassert_intx(PCIDevice *dev); > typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int); > > AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); > -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); > +void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque, Error **errp); > > static inline void > pci_set_byte(uint8_t *config, uint8_t val) > diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c > index e0732cc..9d9a76b 100644 > --- a/hw/i386/amd_iommu.c > +++ b/hw/i386/amd_iommu.c > @@ -1160,7 +1160,7 @@ static void amdvi_realize(DeviceState *dev, Error **err) > > sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio); > sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR); > - pci_setup_iommu(bus, amdvi_host_dma_iommu, s); > + pci_setup_iommu(bus, amdvi_host_dma_iommu, s, err); > s->devid = object_property_get_int(OBJECT(&s->pci), "addr", err); > msi_init(&s->pci.dev, 0, 1, true, false, err); > amdvi_init(s); > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 22d8226..f4f208c 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -2585,7 +2585,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) > g_free, g_free); > vtd_init(s); > sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); > - pci_setup_iommu(bus, vtd_host_dma_iommu, dev); > + pci_setup_iommu(bus, vtd_host_dma_iommu, dev, errp); > /* Pseudo address space under root PCI bus. */ > pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); > } > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > index 273f1e4..2d01589 100644 > --- a/hw/pci/pci.c > +++ b/hw/pci/pci.c > @@ -317,6 +317,21 @@ static void pci_host_bus_register(DeviceState *host) > QLIST_INSERT_HEAD(&pci_host_bridges, host_bridge, next); > } > > +static bool pcibus_has_devices(PCIBus *bus) > +{ > + int i; > + > + for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) { > + if (bus->devices[i]) { > + DeviceState *dev = DEVICE(bus->devices[i]); > + if (dev->opts) { > + return true; > + } > + } > + } > + return false; > +} > + > PCIBus *pci_find_primary_bus(void) > { > PCIBus *primary_bus = NULL; > @@ -2534,8 +2549,12 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) > return &address_space_memory; > } > > -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque) > +void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque, Error **errp) > { > + if (pcibus_has_devices(bus)) { > + error_setg(errp, "IOMMU must be created before any other PCI devices"); > + return; > + } > bus->iommu_fn = fn; > bus->iommu_opaque = opaque; > } > > > >>>>> >>>> Hi Jason, >>>> >>>>> Cc: Paolo Bonzini<pbonzini@redhat.com> >>>>> Signed-off-by: Jason Wang<jasowang@redhat.com> >>>>> --- >>>>> Changes from V2: >>>>> - delay pci_init_bus_master() after pci device is realized to make >>>>> bus_master_ready a more generic method >>>>> --- >>>>> hw/pci/pci.c | 11 ++++++++--- >>>>> hw/virtio/virtio-pci.c | 9 +++++++++ >>>>> hw/virtio/virtio.c | 19 +++++++++++++++++++ >>>>> include/hw/pci/pci.h | 1 + >>>>> include/hw/virtio/virtio.h | 1 + >>>>> 5 files changed, 38 insertions(+), 3 deletions(-) >>>>> >>>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c >>>>> index 273f1e4..22e6ad9 100644 >>>>> --- a/hw/pci/pci.c >>>>> +++ b/hw/pci/pci.c >>>>> @@ -83,6 +83,7 @@ static const VMStateDescription vmstate_pcibus = { >>>>> static void pci_init_bus_master(PCIDevice *pci_dev) >>>>> { >>>>> AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev); >>>>> + PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(pci_dev); >>>>> >>>>> memory_region_init_alias(&pci_dev->bus_master_enable_region, >>>>> OBJECT(pci_dev), "bus master", >>>>> @@ -90,6 +91,9 @@ static void pci_init_bus_master(PCIDevice *pci_dev) >>>>> memory_region_set_enabled(&pci_dev->bus_master_enable_region, false); >>>>> address_space_init(&pci_dev->bus_master_as, >>>>> &pci_dev->bus_master_enable_region, pci_dev->name); >>>>> + if (pc->bus_master_ready) { >>>>> + pc->bus_master_ready(pci_dev); >>>>> + } >>>>> } >>>>> >>>>> static void pcibus_machine_done(Notifier *notifier, void *data) >>>>> @@ -995,9 +999,6 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, >>>>> pci_dev->devfn = devfn; >>>>> pci_dev->requester_id_cache = pci_req_id_cache_get(pci_dev); >>>>> >>>>> - if (qdev_hotplug) { >>>>> - pci_init_bus_master(pci_dev); >>>>> - } >>>>> pstrcpy(pci_dev->name, sizeof(pci_dev->name), name); >>>>> pci_dev->irq_state = 0; >>>>> pci_config_alloc(pci_dev); >>>>> @@ -1995,6 +1996,10 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp) >>>>> } >>>>> } >>>>> >>>>> + if (qdev_hotplug) { >>>>> + pci_init_bus_master(pci_dev); >>>>> + } >>>>> + >>>> How does it help if we move qdev_hotplug check outside "do_pci_register_device"? >>>> I suppose you want the code to run after the "realize" function? >>>> If so, what happens if a "realize" function of another device needs the bus_master_as >>>> (at hotplug time)? >>> Conceptually, >>> I'm not sure that inherited device class realize >>> should be aware of uninitialized bus_master, >>> which belongs to PCI device, nor should it initialize >>> it by calling pci_init_bus_master() explicitly, >>> it's parent's class job (PCIDevice). >> >> Yes, I was trying to propose a workaround for 2.9. I'm sure we will >> refine the ordering in 2.10. And consider we have asked libvirt to >> create IOMMU first, I think I will withdraw the patch. >> >>> >>> more close to current code: >>> if xen-pci-passthrough were hotplugged then it looks like >>> this hunk could break it: >>> hw/xen/xen_pt.c: >>> memory_listener_register(&s->memory_listener, &s->dev.bus_master_as); >>> would happen with uninitialized bus_master_as >>> as realize is called before pci_init_bus_master(); >> >> Yes, this won't work. This is exactly the same issue as virtio, but this >> will also break if it was created before an IOMMU. >> >>> >>> So the same question as Marcel's but other way around, >>> why this hunk "has to" be moved. >>> >>> >> >> Right. >> >> Thanks >
On 2017年03月09日 17:28, Igor Mammedov wrote: > On Thu, 9 Mar 2017 10:32:44 +0800 > Jason Wang<jasowang@redhat.com> wrote: > >> On 2017年03月09日 00:40, Igor Mammedov wrote: >>> On Tue, 7 Mar 2017 14:47:30 +0200 >>> Marcel Apfelbaum<marcel@redhat.com> wrote: >>> >>>> On 03/07/2017 11:09 AM, Jason Wang wrote: >>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU >>>>> after caching ring translations"), IOMMU was required to be created in >>>>> advance. This is because we can only get the correct dma_as after pci >>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and >>>>> inconvenient for user. This patch releases this by: >>>>> >>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger >>>>> this during pci_init_bus_master >>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register >>>>> the memory listener to the new dma_as > Instead of trying to fix up later it's possible to refuse > adding iommu device if other devices has been added before > it with -device/device_add. > That would match current CLI semantics where device that > others depend on should be listed on CLI before that others > are listed. Yes, but it works by chance in the past for the device that does not want bus_master_as in their realize. This change may surprise their users. Thanks
On 09/03/2017 10:58, Jason Wang wrote: > > > On 2017年03月09日 17:28, Igor Mammedov wrote: >> On Thu, 9 Mar 2017 10:32:44 +0800 >> Jason Wang<jasowang@redhat.com> wrote: >> >>> On 2017年03月09日 00:40, Igor Mammedov wrote: >>>> On Tue, 7 Mar 2017 14:47:30 +0200 >>>> Marcel Apfelbaum<marcel@redhat.com> wrote: >>>> >>>>> On 03/07/2017 11:09 AM, Jason Wang wrote: >>>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU >>>>>> after caching ring translations"), IOMMU was required to be >>>>>> created in >>>>>> advance. This is because we can only get the correct dma_as after pci >>>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and >>>>>> inconvenient for user. This patch releases this by: >>>>>> >>>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger >>>>>> this during pci_init_bus_master >>>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register >>>>>> the memory listener to the new dma_as >> Instead of trying to fix up later it's possible to refuse >> adding iommu device if other devices has been added before >> it with -device/device_add. >> That would match current CLI semantics where device that >> others depend on should be listed on CLI before that others >> are listed. > > Yes, but it works by chance in the past for the device that does not > want bus_master_as in their realize. This change may surprise their users. There is another posssibility. Create the address space at init time and add a container region instead of the bus_master_enable_region alias. Then at machine_done time you create the bus_master_enable_region. This removes the need for the callbacks and makes the MemoryListener just work. Paolo
On Thu, Mar 09, 2017 at 11:05:36AM +0100, Paolo Bonzini wrote: > > > On 09/03/2017 10:58, Jason Wang wrote: > > > > > > On 2017年03月09日 17:28, Igor Mammedov wrote: > >> On Thu, 9 Mar 2017 10:32:44 +0800 > >> Jason Wang<jasowang@redhat.com> wrote: > >> > >>> On 2017年03月09日 00:40, Igor Mammedov wrote: > >>>> On Tue, 7 Mar 2017 14:47:30 +0200 > >>>> Marcel Apfelbaum<marcel@redhat.com> wrote: > >>>> > >>>>> On 03/07/2017 11:09 AM, Jason Wang wrote: > >>>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU > >>>>>> after caching ring translations"), IOMMU was required to be > >>>>>> created in > >>>>>> advance. This is because we can only get the correct dma_as after pci > >>>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and > >>>>>> inconvenient for user. This patch releases this by: > >>>>>> > >>>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger > >>>>>> this during pci_init_bus_master > >>>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register > >>>>>> the memory listener to the new dma_as > >> Instead of trying to fix up later it's possible to refuse > >> adding iommu device if other devices has been added before > >> it with -device/device_add. > >> That would match current CLI semantics where device that > >> others depend on should be listed on CLI before that others > >> are listed. > > > > Yes, but it works by chance in the past for the device that does not > > want bus_master_as in their realize. This change may surprise their users. > > There is another posssibility. Create the address space at init time > and add a container region instead of the bus_master_enable_region > alias. Then at machine_done time you create the bus_master_enable_region. > > This removes the need for the callbacks and makes the MemoryListener > just work. > > Paolo That's definitely cleaner than a callback, though a bit tricky so needs a good comment explaining what is going on. And then I think we can revert 96a8821d21411f10d77ea994af369c6e5c35a2cc, right?
On 09/03/2017 16:31, Michael S. Tsirkin wrote: > On Thu, Mar 09, 2017 at 11:05:36AM +0100, Paolo Bonzini wrote: >> >> >> On 09/03/2017 10:58, Jason Wang wrote: >>> >>> >>> On 2017年03月09日 17:28, Igor Mammedov wrote: >>>> On Thu, 9 Mar 2017 10:32:44 +0800 >>>> Jason Wang<jasowang@redhat.com> wrote: >>>> >>>>> On 2017年03月09日 00:40, Igor Mammedov wrote: >>>>>> On Tue, 7 Mar 2017 14:47:30 +0200 >>>>>> Marcel Apfelbaum<marcel@redhat.com> wrote: >>>>>> >>>>>>> On 03/07/2017 11:09 AM, Jason Wang wrote: >>>>>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU >>>>>>>> after caching ring translations"), IOMMU was required to be >>>>>>>> created in >>>>>>>> advance. This is because we can only get the correct dma_as after pci >>>>>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and >>>>>>>> inconvenient for user. This patch releases this by: >>>>>>>> >>>>>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger >>>>>>>> this during pci_init_bus_master >>>>>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register >>>>>>>> the memory listener to the new dma_as >>>> Instead of trying to fix up later it's possible to refuse >>>> adding iommu device if other devices has been added before >>>> it with -device/device_add. >>>> That would match current CLI semantics where device that >>>> others depend on should be listed on CLI before that others >>>> are listed. >>> >>> Yes, but it works by chance in the past for the device that does not >>> want bus_master_as in their realize. This change may surprise their users. >> >> There is another posssibility. Create the address space at init time >> and add a container region instead of the bus_master_enable_region >> alias. Then at machine_done time you create the bus_master_enable_region. >> >> This removes the need for the callbacks and makes the MemoryListener >> just work. >> >> Paolo > > That's definitely cleaner than a callback, though a bit tricky > so needs a good comment explaining what is going on. > And then I think we can revert > 96a8821d21411f10d77ea994af369c6e5c35a2cc, right? Yes. 96a8821d21411f10d77ea994af369c6e5c35a2cc can go then (and this is how I expected it to work all the time---my bad). Paolo
On 2017年03月09日 23:32, Paolo Bonzini wrote: > > On 09/03/2017 16:31, Michael S. Tsirkin wrote: >> On Thu, Mar 09, 2017 at 11:05:36AM +0100, Paolo Bonzini wrote: >>> >>> On 09/03/2017 10:58, Jason Wang wrote: >>>> >>>> On 2017年03月09日 17:28, Igor Mammedov wrote: >>>>> On Thu, 9 Mar 2017 10:32:44 +0800 >>>>> Jason Wang<jasowang@redhat.com> wrote: >>>>> >>>>>> On 2017年03月09日 00:40, Igor Mammedov wrote: >>>>>>> On Tue, 7 Mar 2017 14:47:30 +0200 >>>>>>> Marcel Apfelbaum<marcel@redhat.com> wrote: >>>>>>> >>>>>>>> On 03/07/2017 11:09 AM, Jason Wang wrote: >>>>>>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU >>>>>>>>> after caching ring translations"), IOMMU was required to be >>>>>>>>> created in >>>>>>>>> advance. This is because we can only get the correct dma_as after pci >>>>>>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and >>>>>>>>> inconvenient for user. This patch releases this by: >>>>>>>>> >>>>>>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger >>>>>>>>> this during pci_init_bus_master >>>>>>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register >>>>>>>>> the memory listener to the new dma_as >>>>> Instead of trying to fix up later it's possible to refuse >>>>> adding iommu device if other devices has been added before >>>>> it with -device/device_add. >>>>> That would match current CLI semantics where device that >>>>> others depend on should be listed on CLI before that others >>>>> are listed. >>>> Yes, but it works by chance in the past for the device that does not >>>> want bus_master_as in their realize. This change may surprise their users. >>> There is another posssibility. Create the address space at init time >>> and add a container region instead of the bus_master_enable_region >>> alias. Then at machine_done time you create the bus_master_enable_region. >>> >>> This removes the need for the callbacks and makes the MemoryListener >>> just work. >>> >>> Paolo >> That's definitely cleaner than a callback, though a bit tricky >> so needs a good comment explaining what is going on. >> And then I think we can revert >> 96a8821d21411f10d77ea994af369c6e5c35a2cc, right? > Yes. 96a8821d21411f10d77ea994af369c6e5c35a2cc can go then (and this is > how I expected it to work all the time---my bad). > > Paolo > > Will prepare patches for this (probably next week). Thanks
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 9349acb..9d8ecc6 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -460,7 +460,7 @@ void pci_device_deassert_intx(PCIDevice *dev); typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int); AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); +void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque, Error **errp); static inline void pci_set_byte(uint8_t *config, uint8_t val) diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index e0732cc..9d9a76b 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1160,7 +1160,7 @@ static void amdvi_realize(DeviceState *dev, Error **err) sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio); sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR); - pci_setup_iommu(bus, amdvi_host_dma_iommu, s); + pci_setup_iommu(bus, amdvi_host_dma_iommu, s, err); s->devid = object_property_get_int(OBJECT(&s->pci), "addr", err); msi_init(&s->pci.dev, 0, 1, true, false, err); amdvi_init(s); diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 22d8226..f4f208c 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -2585,7 +2585,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) g_free, g_free); vtd_init(s); sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); - pci_setup_iommu(bus, vtd_host_dma_iommu, dev); + pci_setup_iommu(bus, vtd_host_dma_iommu, dev, errp); /* Pseudo address space under root PCI bus. */ pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); } diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 273f1e4..2d01589 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -317,6 +317,21 @@ static void pci_host_bus_register(DeviceState *host) QLIST_INSERT_HEAD(&pci_host_bridges, host_bridge, next); } +static bool pcibus_has_devices(PCIBus *bus) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) { + if (bus->devices[i]) { + DeviceState *dev = DEVICE(bus->devices[i]); + if (dev->opts) { + return true; + } + } + } + return false; +} + PCIBus *pci_find_primary_bus(void) { PCIBus *primary_bus = NULL; @@ -2534,8 +2549,12 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) return &address_space_memory; } -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque) +void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque, Error **errp) { + if (pcibus_has_devices(bus)) { + error_setg(errp, "IOMMU must be created before any other PCI devices"); + return; + } bus->iommu_fn = fn; bus->iommu_opaque = opaque; }