diff mbox

[V3] virtio: do not require IOMMU to be created in advance

Message ID 20170309102849.1db72ad5@nial.brq.redhat.com
State New
Headers show

Commit Message

Igor Mammedov March 9, 2017, 9:28 a.m. UTC
On Thu, 9 Mar 2017 10:32:44 +0800
Jason Wang <jasowang@redhat.com> wrote:

> On 2017年03月09日 00:40, Igor Mammedov wrote:
> > On Tue, 7 Mar 2017 14:47:30 +0200
> > Marcel Apfelbaum<marcel@redhat.com>  wrote:
> >  
> >> On 03/07/2017 11:09 AM, Jason Wang wrote:  
> >>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU
> >>> after caching ring translations"), IOMMU was required to be created in
> >>> advance. This is because we can only get the correct dma_as after pci
> >>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and
> >>> inconvenient for user. This patch releases this by:
> >>>
> >>> - introduce a bus_master_ready method for PCIDeviceClass and trigger
> >>>    this during pci_init_bus_master
> >>> - implement virtio-pci method and 1) reset the dma_as 2) re-register
> >>>    the memory listener to the new dma_as

Instead of trying to fix up later it's possible to refuse
adding iommu device if other devices has been added before
it with -device/device_add.
That would match current CLI semantics where device that
others depend on should be listed on CLI before that others
are listed.

A bit hackish but something along this lines:




> >>>     
> >> Hi Jason,
> >>  
> >>> Cc: Paolo Bonzini<pbonzini@redhat.com>
> >>> Signed-off-by: Jason Wang<jasowang@redhat.com>
> >>> ---
> >>> Changes from V2:
> >>> - delay pci_init_bus_master() after pci device is realized to make
> >>>    bus_master_ready a more generic method
> >>> ---
> >>>   hw/pci/pci.c               | 11 ++++++++---
> >>>   hw/virtio/virtio-pci.c     |  9 +++++++++
> >>>   hw/virtio/virtio.c         | 19 +++++++++++++++++++
> >>>   include/hw/pci/pci.h       |  1 +
> >>>   include/hw/virtio/virtio.h |  1 +
> >>>   5 files changed, 38 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> >>> index 273f1e4..22e6ad9 100644
> >>> --- a/hw/pci/pci.c
> >>> +++ b/hw/pci/pci.c
> >>> @@ -83,6 +83,7 @@ static const VMStateDescription vmstate_pcibus = {
> >>>   static void pci_init_bus_master(PCIDevice *pci_dev)
> >>>   {
> >>>       AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev);
> >>> +    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(pci_dev);
> >>>
> >>>       memory_region_init_alias(&pci_dev->bus_master_enable_region,
> >>>                                OBJECT(pci_dev), "bus master",
> >>> @@ -90,6 +91,9 @@ static void pci_init_bus_master(PCIDevice *pci_dev)
> >>>       memory_region_set_enabled(&pci_dev->bus_master_enable_region, false);
> >>>       address_space_init(&pci_dev->bus_master_as,
> >>>                          &pci_dev->bus_master_enable_region, pci_dev->name);
> >>> +    if (pc->bus_master_ready) {
> >>> +        pc->bus_master_ready(pci_dev);
> >>> +    }
> >>>   }
> >>>
> >>>   static void pcibus_machine_done(Notifier *notifier, void *data)
> >>> @@ -995,9 +999,6 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
> >>>       pci_dev->devfn = devfn;
> >>>       pci_dev->requester_id_cache = pci_req_id_cache_get(pci_dev);
> >>>
> >>> -    if (qdev_hotplug) {
> >>> -        pci_init_bus_master(pci_dev);
> >>> -    }
> >>>       pstrcpy(pci_dev->name, sizeof(pci_dev->name), name);
> >>>       pci_dev->irq_state = 0;
> >>>       pci_config_alloc(pci_dev);
> >>> @@ -1995,6 +1996,10 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
> >>>           }
> >>>       }
> >>>
> >>> +    if (qdev_hotplug) {
> >>> +        pci_init_bus_master(pci_dev);
> >>> +    }
> >>> +  
> >> How does it help if we move qdev_hotplug check outside "do_pci_register_device"?
> >> I suppose you want the code to run after the "realize" function?
> >> If so, what happens if a "realize" function of another device needs the bus_master_as
> >> (at hotplug time)?  
> > Conceptually,
> > I'm not sure that inherited device class realize
> > should be aware of uninitialized bus_master,
> > which belongs to PCI device, nor should it initialize
> > it by calling pci_init_bus_master() explicitly,
> > it's parent's class job (PCIDevice).  
> 
> Yes, I was trying to propose a workaround for 2.9. I'm sure we will 
> refine the ordering in 2.10. And consider we have asked libvirt to 
> create IOMMU first, I think I will withdraw the patch.
> 
> >
> > more close to current code:
> > if xen-pci-passthrough were hotplugged then it looks like
> > this hunk could break it:
> > hw/xen/xen_pt.c:
> >   memory_listener_register(&s->memory_listener, &s->dev.bus_master_as);
> > would happen with uninitialized bus_master_as
> > as realize is called before pci_init_bus_master();  
> 
> Yes, this won't work. This is exactly the same issue as virtio, but this 
> will also break if it was created before an IOMMU.
> 
> >
> > So the same question as Marcel's but other way around,
> > why this hunk "has to" be moved.
> >
> >  
> 
> Right.
> 
> Thanks

Comments

Paolo Bonzini March 9, 2017, 9:30 a.m. UTC | #1
On 09/03/2017 10:28, Igor Mammedov wrote:
> On Thu, 9 Mar 2017 10:32:44 +0800
> Jason Wang <jasowang@redhat.com> wrote:
> 
>> On 2017年03月09日 00:40, Igor Mammedov wrote:
>>> On Tue, 7 Mar 2017 14:47:30 +0200
>>> Marcel Apfelbaum<marcel@redhat.com>  wrote:
>>>  
>>>> On 03/07/2017 11:09 AM, Jason Wang wrote:  
>>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU
>>>>> after caching ring translations"), IOMMU was required to be created in
>>>>> advance. This is because we can only get the correct dma_as after pci
>>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and
>>>>> inconvenient for user. This patch releases this by:
>>>>>
>>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger
>>>>>    this during pci_init_bus_master
>>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register
>>>>>    the memory listener to the new dma_as
> 
> Instead of trying to fix up later it's possible to refuse
> adding iommu device if other devices has been added before
> it with -device/device_add.
> That would match current CLI semantics where device that
> others depend on should be listed on CLI before that others
> are listed.

I like this a lot.  Can we add it to "Future incompatible changes" in
the 2.9 changelog?

Paolo

> A bit hackish but something along this lines:
> 
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index 9349acb..9d8ecc6 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -460,7 +460,7 @@ void pci_device_deassert_intx(PCIDevice *dev);
>  typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int);
>  
>  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
> -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque);
> +void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque, Error **errp);
>  
>  static inline void
>  pci_set_byte(uint8_t *config, uint8_t val)
> diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
> index e0732cc..9d9a76b 100644
> --- a/hw/i386/amd_iommu.c
> +++ b/hw/i386/amd_iommu.c
> @@ -1160,7 +1160,7 @@ static void amdvi_realize(DeviceState *dev, Error **err)
>  
>      sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio);
>      sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR);
> -    pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
> +    pci_setup_iommu(bus, amdvi_host_dma_iommu, s, err);
>      s->devid = object_property_get_int(OBJECT(&s->pci), "addr", err);
>      msi_init(&s->pci.dev, 0, 1, true, false, err);
>      amdvi_init(s);
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 22d8226..f4f208c 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -2585,7 +2585,7 @@ static void vtd_realize(DeviceState *dev, Error **errp)
>                                                g_free, g_free);
>      vtd_init(s);
>      sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
> -    pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
> +    pci_setup_iommu(bus, vtd_host_dma_iommu, dev, errp);
>      /* Pseudo address space under root PCI bus. */
>      pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
>  }
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 273f1e4..2d01589 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -317,6 +317,21 @@ static void pci_host_bus_register(DeviceState *host)
>      QLIST_INSERT_HEAD(&pci_host_bridges, host_bridge, next);
>  }
>  
> +static bool pcibus_has_devices(PCIBus *bus)
> +{
> +    int i;
> +
> +    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
> +        if (bus->devices[i]) {
> +            DeviceState *dev = DEVICE(bus->devices[i]);
> +            if (dev->opts) {
> +                return true;
> +            }
> +        }
> +    }
> +    return false;
> +}
> +
>  PCIBus *pci_find_primary_bus(void)
>  {
>      PCIBus *primary_bus = NULL;
> @@ -2534,8 +2549,12 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>      return &address_space_memory;
>  }
>  
> -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque)
> +void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque, Error **errp)
>  {
> +    if (pcibus_has_devices(bus)) {
> +        error_setg(errp, "IOMMU must be created before any other PCI devices");
> +        return;
> +    }
>      bus->iommu_fn = fn;
>      bus->iommu_opaque = opaque;
>  }
> 
> 
> 
>>>>>     
>>>> Hi Jason,
>>>>  
>>>>> Cc: Paolo Bonzini<pbonzini@redhat.com>
>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>> ---
>>>>> Changes from V2:
>>>>> - delay pci_init_bus_master() after pci device is realized to make
>>>>>    bus_master_ready a more generic method
>>>>> ---
>>>>>   hw/pci/pci.c               | 11 ++++++++---
>>>>>   hw/virtio/virtio-pci.c     |  9 +++++++++
>>>>>   hw/virtio/virtio.c         | 19 +++++++++++++++++++
>>>>>   include/hw/pci/pci.h       |  1 +
>>>>>   include/hw/virtio/virtio.h |  1 +
>>>>>   5 files changed, 38 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>>>>> index 273f1e4..22e6ad9 100644
>>>>> --- a/hw/pci/pci.c
>>>>> +++ b/hw/pci/pci.c
>>>>> @@ -83,6 +83,7 @@ static const VMStateDescription vmstate_pcibus = {
>>>>>   static void pci_init_bus_master(PCIDevice *pci_dev)
>>>>>   {
>>>>>       AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev);
>>>>> +    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(pci_dev);
>>>>>
>>>>>       memory_region_init_alias(&pci_dev->bus_master_enable_region,
>>>>>                                OBJECT(pci_dev), "bus master",
>>>>> @@ -90,6 +91,9 @@ static void pci_init_bus_master(PCIDevice *pci_dev)
>>>>>       memory_region_set_enabled(&pci_dev->bus_master_enable_region, false);
>>>>>       address_space_init(&pci_dev->bus_master_as,
>>>>>                          &pci_dev->bus_master_enable_region, pci_dev->name);
>>>>> +    if (pc->bus_master_ready) {
>>>>> +        pc->bus_master_ready(pci_dev);
>>>>> +    }
>>>>>   }
>>>>>
>>>>>   static void pcibus_machine_done(Notifier *notifier, void *data)
>>>>> @@ -995,9 +999,6 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
>>>>>       pci_dev->devfn = devfn;
>>>>>       pci_dev->requester_id_cache = pci_req_id_cache_get(pci_dev);
>>>>>
>>>>> -    if (qdev_hotplug) {
>>>>> -        pci_init_bus_master(pci_dev);
>>>>> -    }
>>>>>       pstrcpy(pci_dev->name, sizeof(pci_dev->name), name);
>>>>>       pci_dev->irq_state = 0;
>>>>>       pci_config_alloc(pci_dev);
>>>>> @@ -1995,6 +1996,10 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
>>>>>           }
>>>>>       }
>>>>>
>>>>> +    if (qdev_hotplug) {
>>>>> +        pci_init_bus_master(pci_dev);
>>>>> +    }
>>>>> +  
>>>> How does it help if we move qdev_hotplug check outside "do_pci_register_device"?
>>>> I suppose you want the code to run after the "realize" function?
>>>> If so, what happens if a "realize" function of another device needs the bus_master_as
>>>> (at hotplug time)?  
>>> Conceptually,
>>> I'm not sure that inherited device class realize
>>> should be aware of uninitialized bus_master,
>>> which belongs to PCI device, nor should it initialize
>>> it by calling pci_init_bus_master() explicitly,
>>> it's parent's class job (PCIDevice).  
>>
>> Yes, I was trying to propose a workaround for 2.9. I'm sure we will 
>> refine the ordering in 2.10. And consider we have asked libvirt to 
>> create IOMMU first, I think I will withdraw the patch.
>>
>>>
>>> more close to current code:
>>> if xen-pci-passthrough were hotplugged then it looks like
>>> this hunk could break it:
>>> hw/xen/xen_pt.c:
>>>   memory_listener_register(&s->memory_listener, &s->dev.bus_master_as);
>>> would happen with uninitialized bus_master_as
>>> as realize is called before pci_init_bus_master();  
>>
>> Yes, this won't work. This is exactly the same issue as virtio, but this 
>> will also break if it was created before an IOMMU.
>>
>>>
>>> So the same question as Marcel's but other way around,
>>> why this hunk "has to" be moved.
>>>
>>>  
>>
>> Right.
>>
>> Thanks
>
Jason Wang March 9, 2017, 9:58 a.m. UTC | #2
On 2017年03月09日 17:28, Igor Mammedov wrote:
> On Thu, 9 Mar 2017 10:32:44 +0800
> Jason Wang<jasowang@redhat.com>  wrote:
>
>> On 2017年03月09日 00:40, Igor Mammedov wrote:
>>> On Tue, 7 Mar 2017 14:47:30 +0200
>>> Marcel Apfelbaum<marcel@redhat.com>   wrote:
>>>   
>>>> On 03/07/2017 11:09 AM, Jason Wang wrote:
>>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU
>>>>> after caching ring translations"), IOMMU was required to be created in
>>>>> advance. This is because we can only get the correct dma_as after pci
>>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and
>>>>> inconvenient for user. This patch releases this by:
>>>>>
>>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger
>>>>>     this during pci_init_bus_master
>>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register
>>>>>     the memory listener to the new dma_as
> Instead of trying to fix up later it's possible to refuse
> adding iommu device if other devices has been added before
> it with -device/device_add.
> That would match current CLI semantics where device that
> others depend on should be listed on CLI before that others
> are listed.

Yes, but it works by chance in the past for the device that does not 
want bus_master_as in their realize. This change may surprise their users.

Thanks
Paolo Bonzini March 9, 2017, 10:05 a.m. UTC | #3
On 09/03/2017 10:58, Jason Wang wrote:
> 
> 
> On 2017年03月09日 17:28, Igor Mammedov wrote:
>> On Thu, 9 Mar 2017 10:32:44 +0800
>> Jason Wang<jasowang@redhat.com>  wrote:
>>
>>> On 2017年03月09日 00:40, Igor Mammedov wrote:
>>>> On Tue, 7 Mar 2017 14:47:30 +0200
>>>> Marcel Apfelbaum<marcel@redhat.com>   wrote:
>>>>  
>>>>> On 03/07/2017 11:09 AM, Jason Wang wrote:
>>>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU
>>>>>> after caching ring translations"), IOMMU was required to be
>>>>>> created in
>>>>>> advance. This is because we can only get the correct dma_as after pci
>>>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and
>>>>>> inconvenient for user. This patch releases this by:
>>>>>>
>>>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger
>>>>>>     this during pci_init_bus_master
>>>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register
>>>>>>     the memory listener to the new dma_as
>> Instead of trying to fix up later it's possible to refuse
>> adding iommu device if other devices has been added before
>> it with -device/device_add.
>> That would match current CLI semantics where device that
>> others depend on should be listed on CLI before that others
>> are listed.
> 
> Yes, but it works by chance in the past for the device that does not
> want bus_master_as in their realize. This change may surprise their users.

There is another posssibility.  Create the address space at init time
and add a container region instead of the bus_master_enable_region
alias.  Then at machine_done time you create the bus_master_enable_region.

This removes the need for the callbacks and makes the MemoryListener
just work.

Paolo
Michael S. Tsirkin March 9, 2017, 3:31 p.m. UTC | #4
On Thu, Mar 09, 2017 at 11:05:36AM +0100, Paolo Bonzini wrote:
> 
> 
> On 09/03/2017 10:58, Jason Wang wrote:
> > 
> > 
> > On 2017年03月09日 17:28, Igor Mammedov wrote:
> >> On Thu, 9 Mar 2017 10:32:44 +0800
> >> Jason Wang<jasowang@redhat.com>  wrote:
> >>
> >>> On 2017年03月09日 00:40, Igor Mammedov wrote:
> >>>> On Tue, 7 Mar 2017 14:47:30 +0200
> >>>> Marcel Apfelbaum<marcel@redhat.com>   wrote:
> >>>>  
> >>>>> On 03/07/2017 11:09 AM, Jason Wang wrote:
> >>>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU
> >>>>>> after caching ring translations"), IOMMU was required to be
> >>>>>> created in
> >>>>>> advance. This is because we can only get the correct dma_as after pci
> >>>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and
> >>>>>> inconvenient for user. This patch releases this by:
> >>>>>>
> >>>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger
> >>>>>>     this during pci_init_bus_master
> >>>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register
> >>>>>>     the memory listener to the new dma_as
> >> Instead of trying to fix up later it's possible to refuse
> >> adding iommu device if other devices has been added before
> >> it with -device/device_add.
> >> That would match current CLI semantics where device that
> >> others depend on should be listed on CLI before that others
> >> are listed.
> > 
> > Yes, but it works by chance in the past for the device that does not
> > want bus_master_as in their realize. This change may surprise their users.
> 
> There is another posssibility.  Create the address space at init time
> and add a container region instead of the bus_master_enable_region
> alias.  Then at machine_done time you create the bus_master_enable_region.
> 
> This removes the need for the callbacks and makes the MemoryListener
> just work.
> 
> Paolo

That's definitely cleaner than a callback, though a bit tricky
so needs a good comment explaining what is going on.
And then I think we can revert
96a8821d21411f10d77ea994af369c6e5c35a2cc, right?
Paolo Bonzini March 9, 2017, 3:32 p.m. UTC | #5
On 09/03/2017 16:31, Michael S. Tsirkin wrote:
> On Thu, Mar 09, 2017 at 11:05:36AM +0100, Paolo Bonzini wrote:
>>
>>
>> On 09/03/2017 10:58, Jason Wang wrote:
>>>
>>>
>>> On 2017年03月09日 17:28, Igor Mammedov wrote:
>>>> On Thu, 9 Mar 2017 10:32:44 +0800
>>>> Jason Wang<jasowang@redhat.com>  wrote:
>>>>
>>>>> On 2017年03月09日 00:40, Igor Mammedov wrote:
>>>>>> On Tue, 7 Mar 2017 14:47:30 +0200
>>>>>> Marcel Apfelbaum<marcel@redhat.com>   wrote:
>>>>>>  
>>>>>>> On 03/07/2017 11:09 AM, Jason Wang wrote:
>>>>>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU
>>>>>>>> after caching ring translations"), IOMMU was required to be
>>>>>>>> created in
>>>>>>>> advance. This is because we can only get the correct dma_as after pci
>>>>>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and
>>>>>>>> inconvenient for user. This patch releases this by:
>>>>>>>>
>>>>>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger
>>>>>>>>     this during pci_init_bus_master
>>>>>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register
>>>>>>>>     the memory listener to the new dma_as
>>>> Instead of trying to fix up later it's possible to refuse
>>>> adding iommu device if other devices has been added before
>>>> it with -device/device_add.
>>>> That would match current CLI semantics where device that
>>>> others depend on should be listed on CLI before that others
>>>> are listed.
>>>
>>> Yes, but it works by chance in the past for the device that does not
>>> want bus_master_as in their realize. This change may surprise their users.
>>
>> There is another posssibility.  Create the address space at init time
>> and add a container region instead of the bus_master_enable_region
>> alias.  Then at machine_done time you create the bus_master_enable_region.
>>
>> This removes the need for the callbacks and makes the MemoryListener
>> just work.
>>
>> Paolo
> 
> That's definitely cleaner than a callback, though a bit tricky
> so needs a good comment explaining what is going on.
> And then I think we can revert
> 96a8821d21411f10d77ea994af369c6e5c35a2cc, right?

Yes. 96a8821d21411f10d77ea994af369c6e5c35a2cc can go then (and this is
how I expected it to work all the time---my bad).

Paolo
Jason Wang March 10, 2017, 10:54 a.m. UTC | #6
On 2017年03月09日 23:32, Paolo Bonzini wrote:
>
> On 09/03/2017 16:31, Michael S. Tsirkin wrote:
>> On Thu, Mar 09, 2017 at 11:05:36AM +0100, Paolo Bonzini wrote:
>>>
>>> On 09/03/2017 10:58, Jason Wang wrote:
>>>>
>>>> On 2017年03月09日 17:28, Igor Mammedov wrote:
>>>>> On Thu, 9 Mar 2017 10:32:44 +0800
>>>>> Jason Wang<jasowang@redhat.com>  wrote:
>>>>>
>>>>>> On 2017年03月09日 00:40, Igor Mammedov wrote:
>>>>>>> On Tue, 7 Mar 2017 14:47:30 +0200
>>>>>>> Marcel Apfelbaum<marcel@redhat.com>   wrote:
>>>>>>>   
>>>>>>>> On 03/07/2017 11:09 AM, Jason Wang wrote:
>>>>>>>>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU
>>>>>>>>> after caching ring translations"), IOMMU was required to be
>>>>>>>>> created in
>>>>>>>>> advance. This is because we can only get the correct dma_as after pci
>>>>>>>>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and
>>>>>>>>> inconvenient for user. This patch releases this by:
>>>>>>>>>
>>>>>>>>> - introduce a bus_master_ready method for PCIDeviceClass and trigger
>>>>>>>>>      this during pci_init_bus_master
>>>>>>>>> - implement virtio-pci method and 1) reset the dma_as 2) re-register
>>>>>>>>>      the memory listener to the new dma_as
>>>>> Instead of trying to fix up later it's possible to refuse
>>>>> adding iommu device if other devices has been added before
>>>>> it with -device/device_add.
>>>>> That would match current CLI semantics where device that
>>>>> others depend on should be listed on CLI before that others
>>>>> are listed.
>>>> Yes, but it works by chance in the past for the device that does not
>>>> want bus_master_as in their realize. This change may surprise their users.
>>> There is another posssibility.  Create the address space at init time
>>> and add a container region instead of the bus_master_enable_region
>>> alias.  Then at machine_done time you create the bus_master_enable_region.
>>>
>>> This removes the need for the callbacks and makes the MemoryListener
>>> just work.
>>>
>>> Paolo
>> That's definitely cleaner than a callback, though a bit tricky
>> so needs a good comment explaining what is going on.
>> And then I think we can revert
>> 96a8821d21411f10d77ea994af369c6e5c35a2cc, right?
> Yes. 96a8821d21411f10d77ea994af369c6e5c35a2cc can go then (and this is
> how I expected it to work all the time---my bad).
>
> Paolo
>
>

Will prepare patches for this (probably next week).

Thanks
diff mbox

Patch

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 9349acb..9d8ecc6 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -460,7 +460,7 @@  void pci_device_deassert_intx(PCIDevice *dev);
 typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int);
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
-void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque);
+void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque, Error **errp);
 
 static inline void
 pci_set_byte(uint8_t *config, uint8_t val)
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index e0732cc..9d9a76b 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1160,7 +1160,7 @@  static void amdvi_realize(DeviceState *dev, Error **err)
 
     sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio);
     sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR);
-    pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
+    pci_setup_iommu(bus, amdvi_host_dma_iommu, s, err);
     s->devid = object_property_get_int(OBJECT(&s->pci), "addr", err);
     msi_init(&s->pci.dev, 0, 1, true, false, err);
     amdvi_init(s);
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 22d8226..f4f208c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2585,7 +2585,7 @@  static void vtd_realize(DeviceState *dev, Error **errp)
                                               g_free, g_free);
     vtd_init(s);
     sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
-    pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
+    pci_setup_iommu(bus, vtd_host_dma_iommu, dev, errp);
     /* Pseudo address space under root PCI bus. */
     pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
 }
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 273f1e4..2d01589 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -317,6 +317,21 @@  static void pci_host_bus_register(DeviceState *host)
     QLIST_INSERT_HEAD(&pci_host_bridges, host_bridge, next);
 }
 
+static bool pcibus_has_devices(PCIBus *bus)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) {
+        if (bus->devices[i]) {
+            DeviceState *dev = DEVICE(bus->devices[i]);
+            if (dev->opts) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
 PCIBus *pci_find_primary_bus(void)
 {
     PCIBus *primary_bus = NULL;
@@ -2534,8 +2549,12 @@  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
     return &address_space_memory;
 }
 
-void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque)
+void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque, Error **errp)
 {
+    if (pcibus_has_devices(bus)) {
+        error_setg(errp, "IOMMU must be created before any other PCI devices");
+        return;
+    }
     bus->iommu_fn = fn;
     bus->iommu_opaque = opaque;
 }