Patchwork [02/38] pci: split exit and finalize

login
register
mail settings
Submitter Paolo Bonzini
Date Sept. 3, 2013, 12:32 p.m.
Message ID <1378211609-16121-3-git-send-email-pbonzini@redhat.com>
Download mbox | patch
Permalink /patch/272216/
State New
Headers show

Comments

Paolo Bonzini - Sept. 3, 2013, 12:32 p.m.
When converting devices to use out-of-BQL memory access, destruction
needs to be done in two phases.  First, the device is unrealized;
at this point, pending memory accesses can still be completed, but
no new accesses will be started.  The second part is freeing the
device, which happens only after the reference count drops to zero;
this means that all memory accesses are complete.

Reviewed-by: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/pci/pci.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)
Michael S. Tsirkin - Sept. 17, 2013, 9:16 a.m.
On Tue, Sep 03, 2013 at 02:32:53PM +0200, Paolo Bonzini wrote:
> When converting devices to use out-of-BQL memory access, destruction
> needs to be done in two phases.  First, the device is unrealized;
> at this point, pending memory accesses can still be completed, but
> no new accesses will be started.  The second part is freeing the
> device, which happens only after the reference count drops to zero;
> this means that all memory accesses are complete.
> 
> Reviewed-by: Anthony Liguori <aliguori@us.ibm.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  hw/pci/pci.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 4c004f5..bd084c7 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -787,6 +787,16 @@ static void pci_config_free(PCIDevice *pci_dev)
>      g_free(pci_dev->used);
>  }
>  
> +static void pci_device_instance_finalize(Object *obj)
> +{
> +    PCIDevice *pci_dev = PCI_DEVICE(obj);
> +
> +    qemu_free_irqs(pci_dev->irq);
> +
> +    address_space_destroy(&pci_dev->bus_master_as);
> +    memory_region_destroy(&pci_dev->bus_master_enable_region);
> +}
> +
>  /* -1 for devfn means auto assign */
>  static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
>                                           const char *name, int devfn)
> @@ -875,12 +885,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
>  
>  static void do_pci_unregister_device(PCIDevice *pci_dev)
>  {
> -    qemu_free_irqs(pci_dev->irq);

I don't get this one.
Why do we want to keep irqs about?
If they manage to send an interrupt to the guest *somehow*,
the guest will hang with no way to clear it.

>      pci_dev->bus->devices[pci_dev->devfn] = NULL;
>      pci_config_free(pci_dev);
> -
> -    address_space_destroy(&pci_dev->bus_master_as);
> -    memory_region_destroy(&pci_dev->bus_master_enable_region);

Interesting.
So you are saying it's important to keep MMIO MRs around until finalize,
it's not enough that they are not a subregion of anything?  If not,
is e.g. pcie_host_mmcfg_update buggy?


>  }
>  
>  static void pci_unregister_io_regions(PCIDevice *pci_dev)
> @@ -2252,6 +2258,7 @@ static const TypeInfo pci_device_type_info = {
>      .abstract = true,
>      .class_size = sizeof(PCIDeviceClass),
>      .class_init = pci_device_class_init,
> +    .instance_finalize = pci_device_instance_finalize,
>  };
>  
>  static void pci_register_types(void)
> -- 
> 1.8.3.1
>
Paolo Bonzini - Sept. 17, 2013, 9:56 a.m.
Il 17/09/2013 11:16, Michael S. Tsirkin ha scritto:
> On Tue, Sep 03, 2013 at 02:32:53PM +0200, Paolo Bonzini wrote:
>> When converting devices to use out-of-BQL memory access, destruction
>> needs to be done in two phases.  First, the device is unrealized;
>> at this point, pending memory accesses can still be completed, but
>> no new accesses will be started.  The second part is freeing the
>> device, which happens only after the reference count drops to zero;
>> this means that all memory accesses are complete.
>>
>> Reviewed-by: Anthony Liguori <aliguori@us.ibm.com>
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> ---
>>  hw/pci/pci.c | 15 +++++++++++----
>>  1 file changed, 11 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index 4c004f5..bd084c7 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -787,6 +787,16 @@ static void pci_config_free(PCIDevice *pci_dev)
>>      g_free(pci_dev->used);
>>  }
>>  
>> +static void pci_device_instance_finalize(Object *obj)
>> +{
>> +    PCIDevice *pci_dev = PCI_DEVICE(obj);
>> +
>> +    qemu_free_irqs(pci_dev->irq);
>> +
>> +    address_space_destroy(&pci_dev->bus_master_as);
>> +    memory_region_destroy(&pci_dev->bus_master_enable_region);
>> +}
>> +
>>  /* -1 for devfn means auto assign */
>>  static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
>>                                           const char *name, int devfn)
>> @@ -875,12 +885,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
>>  
>>  static void do_pci_unregister_device(PCIDevice *pci_dev)
>>  {
>> -    qemu_free_irqs(pci_dev->irq);
> 
> I don't get this one.
> Why do we want to keep irqs about?
> If they manage to send an interrupt to guest *somehow*
> guest will hang with no way to clear.

I can leave this here for now, since IRQs are always triggered under the
BQL.  But I think it's cleaner to do all freeing in instance_finalize
(actually that includes pci_config_free that I somehow missed).

>>      pci_dev->bus->devices[pci_dev->devfn] = NULL;
>>      pci_config_free(pci_dev);
>> -
>> -    address_space_destroy(&pci_dev->bus_master_as);
>> -    memory_region_destroy(&pci_dev->bus_master_enable_region);
> 
> Interesting.
> So you are saying it's important to keep MMIO MRs around until finalize,
> it's not enough that they are not a subregion of anything?

Yes.  do_pci_unregister_device marks the point where the guest will not
be able to submit new requests to the device, but there may be previous
requests pending, because you could have something like this:

       VCPU 1                    VCPU 2
       ----------------------------------------------------
       start asynchronous I/O
        address_space_map
         address_space_translate
          memory_region_ref
           object_ref
       ** releases BQL
                                  eject device
                                   object_unparent
                                    my_device_exit
                                     memory_region_del_subregion
                                     ** cannot yet destroy!!
                                     ** address_space_unmap will use it
       ** gets BQL again
       asynchronous I/O ends
        address_space_unmap
         memory_region_unref
          object_unref
           instance_finalize
            memory_region_destroy

In RCU terms, do_pci_unregister_device is "removal", while
instance_finalize is "reclamation", but this is not yet getting
RCU-based MMIO dispatch into the picture; it's all using the BQL.  In
fact you could even have just one VCPU that kicks the IO and also ejects
the device, but it's more easily understood if you separate the two actions.

While it generally means the guest is buggy or malicious, of course it
must be handled correctly.

> If not, is e.g. pcie_host_mmcfg_update buggy?

See patch "pcie: do not recreate mmcfg I/O region, use an alias instead"

Paolo

> 
>>  }
>>  
>>  static void pci_unregister_io_regions(PCIDevice *pci_dev)
>> @@ -2252,6 +2258,7 @@ static const TypeInfo pci_device_type_info = {
>>      .abstract = true,
>>      .class_size = sizeof(PCIDeviceClass),
>>      .class_init = pci_device_class_init,
>> +    .instance_finalize = pci_device_instance_finalize,
>>  };
>>  
>>  static void pci_register_types(void)
>> -- 
>> 1.8.3.1
>>
Michael S. Tsirkin - Sept. 17, 2013, 10:06 a.m.
On Tue, Sep 03, 2013 at 02:32:53PM +0200, Paolo Bonzini wrote:
> When converting devices to use out-of-BQL memory access, destruction
> needs to be done in two phases.  First, the device is unrealized;
> at this point, pending memory accesses can still be completed, but
> no new accesses will be started.  The second part is freeing the
> device, which happens only after the reference count drops to zero;
> this means that all memory accesses are complete.
> 
> Reviewed-by: Anthony Liguori <aliguori@us.ibm.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  hw/pci/pci.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 4c004f5..bd084c7 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -787,6 +787,16 @@ static void pci_config_free(PCIDevice *pci_dev)
>      g_free(pci_dev->used);
>  }
>  
> +static void pci_device_instance_finalize(Object *obj)
> +{
> +    PCIDevice *pci_dev = PCI_DEVICE(obj);
> +
> +    qemu_free_irqs(pci_dev->irq);
> +
> +    address_space_destroy(&pci_dev->bus_master_as);
> +    memory_region_destroy(&pci_dev->bus_master_enable_region);
> +}
> +
>  /* -1 for devfn means auto assign */
>  static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
>                                           const char *name, int devfn)

Actually this isn't enough:
memory_region_destroy is called from
pci_del_option_rom too.

> @@ -875,12 +885,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
>  
>  static void do_pci_unregister_device(PCIDevice *pci_dev)
>  {
> -    qemu_free_irqs(pci_dev->irq);
>      pci_dev->bus->devices[pci_dev->devfn] = NULL;
>      pci_config_free(pci_dev);
> -
> -    address_space_destroy(&pci_dev->bus_master_as);
> -    memory_region_destroy(&pci_dev->bus_master_enable_region);
>  }
>  
>  static void pci_unregister_io_regions(PCIDevice *pci_dev)
> @@ -2252,6 +2258,7 @@ static const TypeInfo pci_device_type_info = {
>      .abstract = true,
>      .class_size = sizeof(PCIDeviceClass),
>      .class_init = pci_device_class_init,
> +    .instance_finalize = pci_device_instance_finalize,
>  };
>  
>  static void pci_register_types(void)
> -- 
> 1.8.3.1
>
Paolo Bonzini - Sept. 17, 2013, 10:23 a.m.
Il 17/09/2013 11:56, Paolo Bonzini ha scritto:
> Yes.  do_pci_unregister_device marks the point where the guest will not
> be able to submit new requests to the device, but there may be previous
> requests pending, because you could have something like this:

Michael pointed out offlist that the previous example involved the
address_space_map bounce buffer.

Here is a simpler one that doesn't rely on it:

       VCPU 1                    VCPU 2
       ----------------------------------------------------
       start asynchronous I/O
        pci_dma_sglist_init
         object_ref
       ** releases BQL
                                  eject device
                                   object_unparent
                                    my_device_exit
                                     memory_region_del_subregion
                                     ** cannot yet destroy!!
                                     ** address_space_unmap will use it
       ** gets BQL again
       asynchronous I/O ends
        qemu_sglist_destroy
         object_unref
          instance_finalize

Paolo

Patch

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 4c004f5..bd084c7 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -787,6 +787,16 @@  static void pci_config_free(PCIDevice *pci_dev)
     g_free(pci_dev->used);
 }
 
+static void pci_device_instance_finalize(Object *obj)
+{
+    PCIDevice *pci_dev = PCI_DEVICE(obj);
+
+    qemu_free_irqs(pci_dev->irq);
+
+    address_space_destroy(&pci_dev->bus_master_as);
+    memory_region_destroy(&pci_dev->bus_master_enable_region);
+}
+
 /* -1 for devfn means auto assign */
 static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
                                          const char *name, int devfn)
@@ -875,12 +885,8 @@  static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
 
 static void do_pci_unregister_device(PCIDevice *pci_dev)
 {
-    qemu_free_irqs(pci_dev->irq);
     pci_dev->bus->devices[pci_dev->devfn] = NULL;
     pci_config_free(pci_dev);
-
-    address_space_destroy(&pci_dev->bus_master_as);
-    memory_region_destroy(&pci_dev->bus_master_enable_region);
 }
 
 static void pci_unregister_io_regions(PCIDevice *pci_dev)
@@ -2252,6 +2258,7 @@  static const TypeInfo pci_device_type_info = {
     .abstract = true,
     .class_size = sizeof(PCIDeviceClass),
     .class_init = pci_device_class_init,
+    .instance_finalize = pci_device_instance_finalize,
 };
 
 static void pci_register_types(void)