diff mbox

[v14,Resend,10/13] pci: add pci device pre-post reset callbacks for host bus reset

Message ID 56737D85.3030305@cn.fujitsu.com
State New
Headers show

Commit Message

chenfan Dec. 18, 2015, 3:29 a.m. UTC
On 12/18/2015 04:31 AM, Alex Williamson wrote:
> On Thu, 2015-12-17 at 09:41 +0800, Cao jin wrote:
>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>
>> Particularly, For vfio devices, Once need to recovery devices
>> by bus reset such as AER, we always need to reset the host bus
>> to recovery the devices under the bus, so we need to add pci device
>> callbacks to specify to do host bus reset.
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
>> ---
>>   hw/pci/pci.c         | 18 ++++++++++++++++++
>>   hw/pci/pci_bridge.c  |  9 +++++++++
>>   hw/vfio/pci.c        | 26 ++++++++++++++++++++++++++
>>   hw/vfio/pci.h        |  2 ++
>>   include/hw/pci/pci.h |  7 +++++++
>>   5 files changed, 62 insertions(+)
>>
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index f6ca6ef..64fa2cc 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -247,6 +247,24 @@ static void pci_do_device_reset(PCIDevice *dev)
>>       msix_reset(dev);
>>   }
>>   
>> +void pci_device_pre_reset(PCIBus *bus, PCIDevice *dev, void *unused)
>> +{
>> +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
>> +
>> +    if (dc->pre_reset) {
>> +        dc->pre_reset(dev);
>> +    }
>> +}
>> +
>> +void pci_device_post_reset(PCIBus *bus, PCIDevice *dev, void
>> *unused)
>> +{
>> +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
>> +
>> +    if (dc->post_reset) {
>> +        dc->post_reset(dev);
>> +    }
>> +}
>> +
>>   /*
>>    * This function is called on #RST and FLR.
>>    * FLR if PCI_EXP_DEVCTL_BCR_FLR is set
>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>> index 40c97b1..ddb76ab 100644
>> --- a/hw/pci/pci_bridge.c
>> +++ b/hw/pci/pci_bridge.c
>> @@ -267,8 +267,17 @@ void pci_bridge_write_config(PCIDevice *d,
>>   
>>       newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
>>       if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
>> +        /*
>> +         * Notify all vfio-pci devices under the bus
>> +         * should do physical bus reset.
>> +         */
>> +        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
>> +        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>> +                            pci_device_pre_reset, NULL);
>>           /* Trigger hot reset on 0->1 transition. */
>>           qbus_reset_all(&s->sec_bus.qbus);
>> +        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>> +                            pci_device_post_reset, NULL);
>>       }
>>   }
>>   
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index e17dc89..df32618 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -39,6 +39,7 @@
>>   
>>   static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>>   static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool
>> enabled);
>> +static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single);
>>   
>>   /*
>>    * Disabling BAR mmaping can be slow, but toggling it around INTx
>> can
>> @@ -1888,6 +1889,8 @@ static int
>> vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
>>       /* List all affected devices by bus reset */
>>       devices = &info->devices[0];
>>   
>> +    vdev->single_depend_dev = (info->count == 1);
>> +
>>       /* Verify that we have all the groups required */
>>       for (i = 0; i < info->count; i++) {
>>           PCIHostDeviceAddress host;
>> @@ -2029,10 +2032,26 @@ static int
>> vfio_check_bus_reset(NotifierWithReturn *n, void *opaque)
>>       return vfio_check_host_bus_reset(vdev);
>>   }
>>   
>> +static void vfio_aer_pre_reset(PCIDevice *pdev)
>> +{
>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>> +
>> +    vdev->aer_reset = true;
>> +    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
>> +}
> Doesn't this lead to multiple host bus resets per guest bus reset in
> many cases?  It looks like we'll do it once per vfio-pci device, even
> if those devices are on the same host bus.  That's a 1 second operation
> per device.  Can we avoid that?  Maybe some sort of sequence ID could
> help a device figure out whether it's already been reset as part of a
> dependent device for this particular guest bus reset.  Thanks,
That's right, I missed this case, but I don't understand the scenario how to
use a sequence ID to mark the device if been reset. can you detail it ?
additional, there was a mechanism to compute device whether need to be reset
by hot reset. so I simply modify the code as the following:

what do you think of this ?

Thanks,
Chen

>
> Alex
>
>> +
>> +static void vfio_aer_post_reset(PCIDevice *pdev)
>> +{
>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>> +
>> +    vdev->aer_reset = false;
>> +}
>> +
>>   static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
>>                             int pos, uint16_t size)
>>   {
>>       PCIDevice *pdev = &vdev->pdev;
>> +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(pdev);
>>       PCIDevice *dev_iter;
>>       uint8_t type;
>>       uint32_t errcap;
>> @@ -2079,6 +2098,9 @@ static int vfio_setup_aer(VFIOPCIDevice *vdev,
>> uint8_t cap_ver,
>>       vdev->hotplug_notifier.notify = vfio_check_bus_reset;
>>       pci_bus_add_hotplug_notifier(pdev->bus, &vdev-
>>> hotplug_notifier);
>>   
>> +    dc->pre_reset = vfio_aer_pre_reset;
>> +    dc->post_reset = vfio_aer_post_reset;
>> +
>>       pcie_cap_deverr_init(pdev);
>>       return pcie_aer_init(pdev, pos, size);
>>   
>> @@ -2978,6 +3000,10 @@ static void vfio_pci_reset(DeviceState *dev)
>>   
>>       trace_vfio_pci_reset(vdev->vbasedev.name);
>>   
>> +    if (vdev->aer_reset) {
>> +        return;
>> +    }
>> +
>>       vfio_pci_pre_reset(vdev);
>>   
>>       if (vdev->resetfn && !vdev->resetfn(vdev)) {
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index b385f07..5470b97 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -142,6 +142,8 @@ typedef struct VFIOPCIDevice {
>>       bool no_kvm_intx;
>>       bool no_kvm_msi;
>>       bool no_kvm_msix;
>> +    bool aer_reset;
>> +    bool single_depend_dev;
>>   
>>       NotifierWithReturn hotplug_notifier;
>>   } VFIOPCIDevice;
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index 379b6e1..6b1f2d4 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -105,6 +105,9 @@ typedef void PCIMapIORegionFunc(PCIDevice
>> *pci_dev, int region_num,
>>                                   pcibus_t addr, pcibus_t size, int
>> type);
>>   typedef void PCIUnregisterFunc(PCIDevice *pci_dev);
>>   
>> +typedef void PCIPreResetFunc(PCIDevice *pci_dev);
>> +typedef void PCIPostResetFunc(PCIDevice *pci_dev);
>> +
>>   typedef struct PCIIORegion {
>>       pcibus_t addr; /* current PCI mapping address. -1 means not
>> mapped */
>>   #define PCI_BAR_UNMAPPED (~(pcibus_t)0)
>> @@ -193,6 +196,8 @@ typedef struct PCIDeviceClass {
>>       PCIUnregisterFunc *exit;
>>       PCIConfigReadFunc *config_read;
>>       PCIConfigWriteFunc *config_write;
>> +    PCIPreResetFunc *pre_reset;
>> +    PCIPostResetFunc *post_reset;
>>   
>>       uint16_t vendor_id;
>>       uint16_t device_id;
>> @@ -380,6 +385,8 @@ bool pci_intx_route_changed(PCIINTxRoute *old,
>> PCIINTxRoute *new);
>>   void pci_bus_fire_intx_routing_notifier(PCIBus *bus);
>>   void pci_device_set_intx_routing_notifier(PCIDevice *dev,
>>                                             PCIINTxRoutingNotifier
>> notifier);
>> +void pci_device_pre_reset(PCIBus *bus, PCIDevice *d, void *opaque);
>> +void pci_device_post_reset(PCIBus *bus, PCIDevice *d, void *opaque);
>>   void pci_device_reset(PCIDevice *dev);
>>   
>>   PCIDevice *pci_nic_init_nofail(NICInfo *nd, PCIBus *rootbus,
>
>
> .
>

Comments

Alex Williamson Dec. 21, 2015, 9:07 p.m. UTC | #1
On Fri, 2015-12-18 at 11:29 +0800, Chen Fan wrote:
> On 12/18/2015 04:31 AM, Alex Williamson wrote:
> > On Thu, 2015-12-17 at 09:41 +0800, Cao jin wrote:
> > > From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > 
> > > Particularly, For vfio devices, Once need to recovery devices
> > > by bus reset such as AER, we always need to reset the host bus
> > > to recovery the devices under the bus, so we need to add pci
> > > device
> > > callbacks to specify to do host bus reset.
> > > 
> > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > >   hw/pci/pci.c         | 18 ++++++++++++++++++
> > >   hw/pci/pci_bridge.c  |  9 +++++++++
> > >   hw/vfio/pci.c        | 26 ++++++++++++++++++++++++++
> > >   hw/vfio/pci.h        |  2 ++
> > >   include/hw/pci/pci.h |  7 +++++++
> > >   5 files changed, 62 insertions(+)
> > > 
> > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > > index f6ca6ef..64fa2cc 100644
> > > --- a/hw/pci/pci.c
> > > +++ b/hw/pci/pci.c
> > > @@ -247,6 +247,24 @@ static void pci_do_device_reset(PCIDevice
> > > *dev)
> > >       msix_reset(dev);
> > >   }
> > >   
> > > +void pci_device_pre_reset(PCIBus *bus, PCIDevice *dev, void
> > > *unused)
> > > +{
> > > +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
> > > +
> > > +    if (dc->pre_reset) {
> > > +        dc->pre_reset(dev);
> > > +    }
> > > +}
> > > +
> > > +void pci_device_post_reset(PCIBus *bus, PCIDevice *dev, void
> > > *unused)
> > > +{
> > > +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
> > > +
> > > +    if (dc->post_reset) {
> > > +        dc->post_reset(dev);
> > > +    }
> > > +}
> > > +
> > >   /*
> > >    * This function is called on #RST and FLR.
> > >    * FLR if PCI_EXP_DEVCTL_BCR_FLR is set
> > > diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> > > index 40c97b1..ddb76ab 100644
> > > --- a/hw/pci/pci_bridge.c
> > > +++ b/hw/pci/pci_bridge.c
> > > @@ -267,8 +267,17 @@ void pci_bridge_write_config(PCIDevice *d,
> > >   
> > >       newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
> > >       if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
> > > +        /*
> > > +         * Notify all vfio-pci devices under the bus
> > > +         * should do physical bus reset.
> > > +         */
> > > +        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
> > > +        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> > > +                            pci_device_pre_reset, NULL);
> > >           /* Trigger hot reset on 0->1 transition. */
> > >           qbus_reset_all(&s->sec_bus.qbus);
> > > +        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> > > +                            pci_device_post_reset, NULL);
> > >       }
> > >   }
> > >   
> > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > index e17dc89..df32618 100644
> > > --- a/hw/vfio/pci.c
> > > +++ b/hw/vfio/pci.c
> > > @@ -39,6 +39,7 @@
> > >   
> > >   static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
> > >   static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool
> > > enabled);
> > > +static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single);
> > >   
> > >   /*
> > >    * Disabling BAR mmaping can be slow, but toggling it around
> > > INTx
> > > can
> > > @@ -1888,6 +1889,8 @@ static int
> > > vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
> > >       /* List all affected devices by bus reset */
> > >       devices = &info->devices[0];
> > >   
> > > +    vdev->single_depend_dev = (info->count == 1);
> > > +
> > >       /* Verify that we have all the groups required */
> > >       for (i = 0; i < info->count; i++) {
> > >           PCIHostDeviceAddress host;
> > > @@ -2029,10 +2032,26 @@ static int
> > > vfio_check_bus_reset(NotifierWithReturn *n, void *opaque)
> > >       return vfio_check_host_bus_reset(vdev);
> > >   }
> > >   
> > > +static void vfio_aer_pre_reset(PCIDevice *pdev)
> > > +{
> > > +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> > > +
> > > +    vdev->aer_reset = true;
> > > +    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> > > +}
> > Doesn't this lead to multiple host bus resets per guest bus reset
> > in
> > many cases?  It looks like we'll do it once per vfio-pci device,
> > even
> > if those devices are on the same host bus.  That's a 1 second
> > operation
> > per device.  Can we avoid that?  Maybe some sort of sequence ID
> > could
> > help a device figure out whether it's already been reset as part of
> > a
> > dependent device for this particular guest bus reset.  Thanks,
> That's right, I missed this case, but I don't understand the scenario
> how to
> use a sequence ID to mark the device if been reset. can you detail it
> ?

I don't really have a concrete idea for a sequence ID, it was just a
thought that maybe if each bus reset had a sequence ID then devices
could know whether they've already been reset for that sequence ID.
 The basic problem we have is that reset callbacks are per device and
it's difficult to infer which individual resets are part of that bus
reset.  In fact, do we propagate resets correctly down secondary
bridges?  We're triggering off a VM write of the bridge control bus
reset bit triggering from 0->1 and we then call qbus_reset_all() on
that qbus, which I think is just going to call pci_bridge_reset() for
any other bridges, which doesn't do anything about resetting deeper
subordinate buses.  I think that means that if we had a root port with
a switch below it and endpoints below that, if the VM triggered a
secondary bus reset at the root port, the endpoints would never see it,
which is not how real hardware works.

> additional, there was a mechanism to compute device whether need to
> be reset
> by hot reset. so I simply modify the code as the following:
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index a9bc67e..42774ca 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2063,13 +2063,19 @@ static void vfio_aer_pre_reset(PCIDevice
> *pdev)
>       VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> 
>       vdev->aer_reset = true;
> -    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> +    vdev->vbasedev.needs_reset = true;
>   }
> 
>   static void vfio_aer_post_reset(PCIDevice *pdev)
>   {
>       VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> 
> +    if (!vdev->single_depend_dev && vdev->vbasedev.needs_reset) {
> +        vfio_pci_hot_reset(vdev, false);
> +    } else {
> +        vfio_pci_hot_reset(vdev, true);
> +    }
> +
>       vdev->aer_reset = false;
>   }
> 
> what do you think of this ?

I think it might be a bigger problem than that subtle change.  I wonder
if we really need a better model of the reset line through the
subordinate buses.  When reset is asserted, we'd set a bus_in_reset
flag on the bus and trigger downstream bridges to do the same.  Then
when the user de-asserts reset, we'd call qbus_reset_all() and
propagate it through to downstream buses.  That way the per device
reset callback could check to see if the bus is in reset and aer
devices can then know to do a bus reset.  Finally, the bus_in_reset
flag would be cleared on all the affected buses.  I'm sure there are
numerous details missing there, but it seems like it might be a
reasonable approach.  Thanks,

Alex
chenfan Dec. 22, 2015, 7:18 a.m. UTC | #2
On 12/22/2015 05:07 AM, Alex Williamson wrote:
> On Fri, 2015-12-18 at 11:29 +0800, Chen Fan wrote:
>> On 12/18/2015 04:31 AM, Alex Williamson wrote:
>>> On Thu, 2015-12-17 at 09:41 +0800, Cao jin wrote:
>>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>
>>>> Particularly, For vfio devices, Once need to recovery devices
>>>> by bus reset such as AER, we always need to reset the host bus
>>>> to recovery the devices under the bus, so we need to add pci
>>>> device
>>>> callbacks to specify to do host bus reset.
>>>>
>>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
>>>> ---
>>>>    hw/pci/pci.c         | 18 ++++++++++++++++++
>>>>    hw/pci/pci_bridge.c  |  9 +++++++++
>>>>    hw/vfio/pci.c        | 26 ++++++++++++++++++++++++++
>>>>    hw/vfio/pci.h        |  2 ++
>>>>    include/hw/pci/pci.h |  7 +++++++
>>>>    5 files changed, 62 insertions(+)
>>>>
>>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>>>> index f6ca6ef..64fa2cc 100644
>>>> --- a/hw/pci/pci.c
>>>> +++ b/hw/pci/pci.c
>>>> @@ -247,6 +247,24 @@ static void pci_do_device_reset(PCIDevice
>>>> *dev)
>>>>        msix_reset(dev);
>>>>    }
>>>>    
>>>> +void pci_device_pre_reset(PCIBus *bus, PCIDevice *dev, void
>>>> *unused)
>>>> +{
>>>> +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
>>>> +
>>>> +    if (dc->pre_reset) {
>>>> +        dc->pre_reset(dev);
>>>> +    }
>>>> +}
>>>> +
>>>> +void pci_device_post_reset(PCIBus *bus, PCIDevice *dev, void
>>>> *unused)
>>>> +{
>>>> +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
>>>> +
>>>> +    if (dc->post_reset) {
>>>> +        dc->post_reset(dev);
>>>> +    }
>>>> +}
>>>> +
>>>>    /*
>>>>     * This function is called on #RST and FLR.
>>>>     * FLR if PCI_EXP_DEVCTL_BCR_FLR is set
>>>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>>>> index 40c97b1..ddb76ab 100644
>>>> --- a/hw/pci/pci_bridge.c
>>>> +++ b/hw/pci/pci_bridge.c
>>>> @@ -267,8 +267,17 @@ void pci_bridge_write_config(PCIDevice *d,
>>>>    
>>>>        newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
>>>>        if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
>>>> +        /*
>>>> +         * Notify all vfio-pci devices under the bus
>>>> +         * should do physical bus reset.
>>>> +         */
>>>> +        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
>>>> +        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>>>> +                            pci_device_pre_reset, NULL);
>>>>            /* Trigger hot reset on 0->1 transition. */
>>>>            qbus_reset_all(&s->sec_bus.qbus);
>>>> +        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>>>> +                            pci_device_post_reset, NULL);
>>>>        }
>>>>    }
>>>>    
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index e17dc89..df32618 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -39,6 +39,7 @@
>>>>    
>>>>    static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>>>>    static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool
>>>> enabled);
>>>> +static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single);
>>>>    
>>>>    /*
>>>>     * Disabling BAR mmaping can be slow, but toggling it around
>>>> INTx
>>>> can
>>>> @@ -1888,6 +1889,8 @@ static int
>>>> vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
>>>>        /* List all affected devices by bus reset */
>>>>        devices = &info->devices[0];
>>>>    
>>>> +    vdev->single_depend_dev = (info->count == 1);
>>>> +
>>>>        /* Verify that we have all the groups required */
>>>>        for (i = 0; i < info->count; i++) {
>>>>            PCIHostDeviceAddress host;
>>>> @@ -2029,10 +2032,26 @@ static int
>>>> vfio_check_bus_reset(NotifierWithReturn *n, void *opaque)
>>>>        return vfio_check_host_bus_reset(vdev);
>>>>    }
>>>>    
>>>> +static void vfio_aer_pre_reset(PCIDevice *pdev)
>>>> +{
>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>> +
>>>> +    vdev->aer_reset = true;
>>>> +    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
>>>> +}
>>> Doesn't this lead to multiple host bus resets per guest bus reset
>>> in
>>> many cases?  It looks like we'll do it once per vfio-pci device,
>>> even
>>> if those devices are on the same host bus.  That's a 1 second
>>> operation
>>> per device.  Can we avoid that?  Maybe some sort of sequence ID
>>> could
>>> help a device figure out whether it's already been reset as part of
>>> a
>>> dependent device for this particular guest bus reset.  Thanks,
>> That's right, I missed this case, but I don't understand the scenario
>> how to
>> use a sequence ID to mark the device if been reset. can you detail it
>> ?
> I don't really have a concrete idea for a sequence ID, it was just a
> thought that maybe if each bus reset had a sequence ID then devices
> could know whether they've already been reset for that sequence ID.
>   The basic problem we have is that reset callbacks are per device and
> it's difficult to infer which individual resets are part of that bus
> reset.  In fact, do we propagate resets correctly down secondary
> bridges?  We're triggering off a VM write of the bridge control bus
> reset bit triggering from 0->1 and we then call qbus_reset_all() on
> that qbus, which I think is just going to call pci_bridge_reset() for
> any other bridges, which doesn't do anything about resetting deeper
> subordinate buses.  I think that means that if we had a root port with
> a switch below it and endpoints below that, if the VM triggered a
> secondary bus reset at the root port, the endpoints would never see it,
> which is not how real hardware works.
Indeed, you're right, for subordinate buses reset, we should have a common
mechanism for all bridges.

>
>> additional, there was a mechanism to compute device whether need to
>> be reset
>> by hot reset. so I simply modify the code as the following:
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index a9bc67e..42774ca 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2063,13 +2063,19 @@ static void vfio_aer_pre_reset(PCIDevice
>> *pdev)
>>        VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>
>>        vdev->aer_reset = true;
>> -    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
>> +    vdev->vbasedev.needs_reset = true;
>>    }
>>
>>    static void vfio_aer_post_reset(PCIDevice *pdev)
>>    {
>>        VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>
>> +    if (!vdev->single_depend_dev && vdev->vbasedev.needs_reset) {
>> +        vfio_pci_hot_reset(vdev, false);
>> +    } else {
>> +        vfio_pci_hot_reset(vdev, true);
>> +    }
>> +
>>        vdev->aer_reset = false;
>>    }
>>
>> what do you think of this ?
> I think it might be a bigger problem than that subtle change.  I wonder
> if we really need a better model of the reset line through the
> subordinate buses.  When reset is asserted, we'd set a bus_in_reset
> flag on the bus and trigger downstream bridges to do the same.  Then
> when the user de-asserts reset, we'd call qbus_reset_all() and
> propagate it through to downstream buses.  That way the per device
> reset callback could check to see if the bus is in reset and aer
> devices can then know to do a bus reset.  Finally, the bus_in_reset
> flag would be cleared on all the affected buses.  I'm sure there are
> numerous details missing there, but it seems like it might be a
> reasonable approach.  Thanks,
it should be, let me try to think about it carefully.;)

Thanks,
Chen
>
> Alex
>
>
> .
>
chenfan Dec. 24, 2015, 5:10 a.m. UTC | #3
On 12/22/2015 05:07 AM, Alex Williamson wrote:
> On Fri, 2015-12-18 at 11:29 +0800, Chen Fan wrote:
>> On 12/18/2015 04:31 AM, Alex Williamson wrote:
>>> On Thu, 2015-12-17 at 09:41 +0800, Cao jin wrote:
>>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>
>>>> Particularly, For vfio devices, Once need to recovery devices
>>>> by bus reset such as AER, we always need to reset the host bus
>>>> to recovery the devices under the bus, so we need to add pci
>>>> device
>>>> callbacks to specify to do host bus reset.
>>>>
>>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
>>>> ---
>>>>    hw/pci/pci.c         | 18 ++++++++++++++++++
>>>>    hw/pci/pci_bridge.c  |  9 +++++++++
>>>>    hw/vfio/pci.c        | 26 ++++++++++++++++++++++++++
>>>>    hw/vfio/pci.h        |  2 ++
>>>>    include/hw/pci/pci.h |  7 +++++++
>>>>    5 files changed, 62 insertions(+)
>>>>
>>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>>>> index f6ca6ef..64fa2cc 100644
>>>> --- a/hw/pci/pci.c
>>>> +++ b/hw/pci/pci.c
>>>> @@ -247,6 +247,24 @@ static void pci_do_device_reset(PCIDevice
>>>> *dev)
>>>>        msix_reset(dev);
>>>>    }
>>>>    
>>>> +void pci_device_pre_reset(PCIBus *bus, PCIDevice *dev, void
>>>> *unused)
>>>> +{
>>>> +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
>>>> +
>>>> +    if (dc->pre_reset) {
>>>> +        dc->pre_reset(dev);
>>>> +    }
>>>> +}
>>>> +
>>>> +void pci_device_post_reset(PCIBus *bus, PCIDevice *dev, void
>>>> *unused)
>>>> +{
>>>> +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
>>>> +
>>>> +    if (dc->post_reset) {
>>>> +        dc->post_reset(dev);
>>>> +    }
>>>> +}
>>>> +
>>>>    /*
>>>>     * This function is called on #RST and FLR.
>>>>     * FLR if PCI_EXP_DEVCTL_BCR_FLR is set
>>>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>>>> index 40c97b1..ddb76ab 100644
>>>> --- a/hw/pci/pci_bridge.c
>>>> +++ b/hw/pci/pci_bridge.c
>>>> @@ -267,8 +267,17 @@ void pci_bridge_write_config(PCIDevice *d,
>>>>    
>>>>        newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
>>>>        if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
>>>> +        /*
>>>> +         * Notify all vfio-pci devices under the bus
>>>> +         * should do physical bus reset.
>>>> +         */
>>>> +        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
>>>> +        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>>>> +                            pci_device_pre_reset, NULL);
>>>>            /* Trigger hot reset on 0->1 transition. */
>>>>            qbus_reset_all(&s->sec_bus.qbus);
>>>> +        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>>>> +                            pci_device_post_reset, NULL);
>>>>        }
>>>>    }
>>>>    
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index e17dc89..df32618 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -39,6 +39,7 @@
>>>>    
>>>>    static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>>>>    static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool
>>>> enabled);
>>>> +static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single);
>>>>    
>>>>    /*
>>>>     * Disabling BAR mmaping can be slow, but toggling it around
>>>> INTx
>>>> can
>>>> @@ -1888,6 +1889,8 @@ static int
>>>> vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
>>>>        /* List all affected devices by bus reset */
>>>>        devices = &info->devices[0];
>>>>    
>>>> +    vdev->single_depend_dev = (info->count == 1);
>>>> +
>>>>        /* Verify that we have all the groups required */
>>>>        for (i = 0; i < info->count; i++) {
>>>>            PCIHostDeviceAddress host;
>>>> @@ -2029,10 +2032,26 @@ static int
>>>> vfio_check_bus_reset(NotifierWithReturn *n, void *opaque)
>>>>        return vfio_check_host_bus_reset(vdev);
>>>>    }
>>>>    
>>>> +static void vfio_aer_pre_reset(PCIDevice *pdev)
>>>> +{
>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>> +
>>>> +    vdev->aer_reset = true;
>>>> +    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
>>>> +}
>>> Doesn't this lead to multiple host bus resets per guest bus reset
>>> in
>>> many cases?  It looks like we'll do it once per vfio-pci device,
>>> even
>>> if those devices are on the same host bus.  That's a 1 second
>>> operation
>>> per device.  Can we avoid that?  Maybe some sort of sequence ID
>>> could
>>> help a device figure out whether it's already been reset as part of
>>> a
>>> dependent device for this particular guest bus reset.  Thanks,
>> That's right, I missed this case, but I don't understand the scenario
>> how to
>> use a sequence ID to mark the device if been reset. can you detail it
>> ?
> I don't really have a concrete idea for a sequence ID, it was just a
> thought that maybe if each bus reset had a sequence ID then devices
> could know whether they've already been reset for that sequence ID.
>   The basic problem we have is that reset callbacks are per device and
> it's difficult to infer which individual resets are part of that bus
> reset.  In fact, do we propagate resets correctly down secondary
> bridges?  We're triggering off a VM write of the bridge control bus
> reset bit triggering from 0->1 and we then call qbus_reset_all() on
> that qbus, which I think is just going to call pci_bridge_reset() for
> any other bridges, which doesn't do anything about resetting deeper
> subordinate buses.  I think that means that if we had a root port with
> a switch below it and endpoints below that, if the VM triggered a
> secondary bus reset at the root port, the endpoints would never see it,
> which is not how real hardware works.
Hi Alex,

     After think over the subordinate case, I think the qbus_reset_all()
should also reset the deeper endpoints. which recursive the qbus and qdev
to reset all devices blow sec bus and bridge device independently.

MST's suggestion in another mail to add a hot reset callback for sec bus 
reset
should be good to avoid the couple of pre_reset/post_reset callbacks. 
thanks:)

but for the multiple resets triggering on the same bus, I think the 
virtual bus-level
callback do not guarantee the host bus reset only once for all dependent 
devices.
because the host reset dependent devices may not on the same virtual bus.
we allowed that the dependent devices address below the bus. so the devices
still don't know the host bus is in reset. here I think we should have a 
host bus
model bind to all the dependent devices. when devices do hot reset, we mark
the bus in reset. then other dependent devices know the bus is reset. 
how about
this?

Thanks,
Chen

>
>> additional, there was a mechanism to compute device whether need to
>> be reset
>> by hot reset. so I simply modify the code as the following:
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index a9bc67e..42774ca 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -2063,13 +2063,19 @@ static void vfio_aer_pre_reset(PCIDevice
>> *pdev)
>>        VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>
>>        vdev->aer_reset = true;
>> -    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
>> +    vdev->vbasedev.needs_reset = true;
>>    }
>>
>>    static void vfio_aer_post_reset(PCIDevice *pdev)
>>    {
>>        VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>
>> +    if (!vdev->single_depend_dev && vdev->vbasedev.needs_reset) {
>> +        vfio_pci_hot_reset(vdev, false);
>> +    } else {
>> +        vfio_pci_hot_reset(vdev, true);
>> +    }
>> +
>>        vdev->aer_reset = false;
>>    }
>>
>> what do you think of this ?
> I think it might be a bigger problem than that subtle change.  I wonder
> if we really need a better model of the reset line through the
> subordinate buses.  When reset is asserted, we'd set a bus_in_reset
> flag on the bus and trigger downstream bridges to do the same.  Then
> when the user de-asserts reset, we'd call qbus_reset_all() and
> propagate it through to downstream buses.  That way the per device
> reset callback could check to see if the bus is in reset and aer
> devices can then know to do a bus reset.  Finally, the bus_in_reset
> flag would be cleared on all the affected buses.  I'm sure there are
> numerous details missing there, but it seems like it might be a
> reasonable approach.  Thanks,
>
> Alex
>
>
> .
>
Michael S. Tsirkin Dec. 24, 2015, 2:34 p.m. UTC | #4
On Thu, Dec 24, 2015 at 01:10:25PM +0800, Chen Fan wrote:
> 
> On 12/22/2015 05:07 AM, Alex Williamson wrote:
> >On Fri, 2015-12-18 at 11:29 +0800, Chen Fan wrote:
> >>On 12/18/2015 04:31 AM, Alex Williamson wrote:
> >>>On Thu, 2015-12-17 at 09:41 +0800, Cao jin wrote:
> >>>>From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>>>
> >>>>Particularly, For vfio devices, Once need to recovery devices
> >>>>by bus reset such as AER, we always need to reset the host bus
> >>>>to recovery the devices under the bus, so we need to add pci
> >>>>device
> >>>>callbacks to specify to do host bus reset.
> >>>>
> >>>>Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >>>>Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> >>>>---
> >>>>   hw/pci/pci.c         | 18 ++++++++++++++++++
> >>>>   hw/pci/pci_bridge.c  |  9 +++++++++
> >>>>   hw/vfio/pci.c        | 26 ++++++++++++++++++++++++++
> >>>>   hw/vfio/pci.h        |  2 ++
> >>>>   include/hw/pci/pci.h |  7 +++++++
> >>>>   5 files changed, 62 insertions(+)
> >>>>
> >>>>diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> >>>>index f6ca6ef..64fa2cc 100644
> >>>>--- a/hw/pci/pci.c
> >>>>+++ b/hw/pci/pci.c
> >>>>@@ -247,6 +247,24 @@ static void pci_do_device_reset(PCIDevice
> >>>>*dev)
> >>>>       msix_reset(dev);
> >>>>   }
> >>>>+void pci_device_pre_reset(PCIBus *bus, PCIDevice *dev, void
> >>>>*unused)
> >>>>+{
> >>>>+    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
> >>>>+
> >>>>+    if (dc->pre_reset) {
> >>>>+        dc->pre_reset(dev);
> >>>>+    }
> >>>>+}
> >>>>+
> >>>>+void pci_device_post_reset(PCIBus *bus, PCIDevice *dev, void
> >>>>*unused)
> >>>>+{
> >>>>+    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
> >>>>+
> >>>>+    if (dc->post_reset) {
> >>>>+        dc->post_reset(dev);
> >>>>+    }
> >>>>+}
> >>>>+
> >>>>   /*
> >>>>    * This function is called on #RST and FLR.
> >>>>    * FLR if PCI_EXP_DEVCTL_BCR_FLR is set
> >>>>diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> >>>>index 40c97b1..ddb76ab 100644
> >>>>--- a/hw/pci/pci_bridge.c
> >>>>+++ b/hw/pci/pci_bridge.c
> >>>>@@ -267,8 +267,17 @@ void pci_bridge_write_config(PCIDevice *d,
> >>>>       newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
> >>>>       if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
> >>>>+        /*
> >>>>+         * Notify all vfio-pci devices under the bus
> >>>>+         * should do physical bus reset.
> >>>>+         */
> >>>>+        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
> >>>>+        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> >>>>+                            pci_device_pre_reset, NULL);
> >>>>           /* Trigger hot reset on 0->1 transition. */
> >>>>           qbus_reset_all(&s->sec_bus.qbus);
> >>>>+        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> >>>>+                            pci_device_post_reset, NULL);
> >>>>       }
> >>>>   }
> >>>>diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >>>>index e17dc89..df32618 100644
> >>>>--- a/hw/vfio/pci.c
> >>>>+++ b/hw/vfio/pci.c
> >>>>@@ -39,6 +39,7 @@
> >>>>   static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
> >>>>   static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool
> >>>>enabled);
> >>>>+static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single);
> >>>>   /*
> >>>>    * Disabling BAR mmaping can be slow, but toggling it around
> >>>>INTx
> >>>>can
> >>>>@@ -1888,6 +1889,8 @@ static int
> >>>>vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
> >>>>       /* List all affected devices by bus reset */
> >>>>       devices = &info->devices[0];
> >>>>+    vdev->single_depend_dev = (info->count == 1);
> >>>>+
> >>>>       /* Verify that we have all the groups required */
> >>>>       for (i = 0; i < info->count; i++) {
> >>>>           PCIHostDeviceAddress host;
> >>>>@@ -2029,10 +2032,26 @@ static int
> >>>>vfio_check_bus_reset(NotifierWithReturn *n, void *opaque)
> >>>>       return vfio_check_host_bus_reset(vdev);
> >>>>   }
> >>>>+static void vfio_aer_pre_reset(PCIDevice *pdev)
> >>>>+{
> >>>>+    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> >>>>+
> >>>>+    vdev->aer_reset = true;
> >>>>+    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> >>>>+}
> >>>Doesn't this lead to multiple host bus resets per guest bus reset
> >>>in
> >>>many cases?  It looks like we'll do it once per vfio-pci device,
> >>>even
> >>>if those devices are on the same host bus.  That's a 1 second
> >>>operation
> >>>per device.  Can we avoid that?  Maybe some sort of sequence ID
> >>>could
> >>>help a device figure out whether it's already been reset as part of
> >>>a
> >>>dependent device for this particular guest bus reset.  Thanks,
> >>That's right, I missed this case, but I don't understand the scenario
> >>how to
> >>use a sequence ID to mark the device if been reset. can you detail it
> >>?
> >I don't really have a concrete idea for a sequence ID, it was just a
> >thought that maybe if each bus reset had a sequence ID then devices
> >could know whether they've already been reset for that sequence ID.
> >  The basic problem we have is that reset callbacks are per device and
> >it's difficult to infer which individual resets are part of that bus
> >reset.  In fact, do we propagate resets correctly down secondary
> >bridges?  We're triggering off a VM write of the bridge control bus
> >reset bit triggering from 0->1 and we then call qbus_reset_all() on
> >that qbus, which I think is just going to call pci_bridge_reset() for
> >any other bridges, which doesn't do anything about resetting deeper
> >subordinate buses.  I think that means that if we had a root port with
> >a switch below it and endpoints below that, if the VM triggered a
> >secondary bus reset at the root port, the endpoints would never see it,
> >which is not how real hardware works.
> Hi Alex,
> 
>     After think over the subordinate case, I think the qbus_reset_all()
> should also reset the deeper endpoints. which recursive the qbus and qdev
> to reset all devices blow sec bus and bridge device independently.
> 
> MST's suggestion in another mail to add a hot reset callback for sec bus
> reset
> should be good to avoid the couple of pre_reset/post_reset callbacks.
> thanks:)
> 
> but for the multiple resets triggering on the same bus, I think the virtual
> bus-level
> callback do not guarantee the host bus reset only once for all dependent
> devices.
> because the host reset dependent devices may not on the same virtual bus.
> we allowed that the dependent devices address below the bus. so the devices
> still don't know the host bus is in reset. here I think we should have a
> host bus
> model bind to all the dependent devices. when devices do hot reset, we mark
> the bus in reset. then other dependent devices know the bus is reset. how
> about
> this?
> 
> Thanks,
> Chen


What does "below bus" mean here?
Below what?


> >
> >>additional, there was a mechanism to compute device whether need to
> >>be reset
> >>by hot reset. so I simply modify the code as the following:
> >>
> >>diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >>index a9bc67e..42774ca 100644
> >>--- a/hw/vfio/pci.c
> >>+++ b/hw/vfio/pci.c
> >>@@ -2063,13 +2063,19 @@ static void vfio_aer_pre_reset(PCIDevice
> >>*pdev)
> >>       VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> >>
> >>       vdev->aer_reset = true;
> >>-    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
> >>+    vdev->vbasedev.needs_reset = true;
> >>   }
> >>
> >>   static void vfio_aer_post_reset(PCIDevice *pdev)
> >>   {
> >>       VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
> >>
> >>+    if (!vdev->single_depend_dev && vdev->vbasedev.needs_reset) {
> >>+        vfio_pci_hot_reset(vdev, false);
> >>+    } else {
> >>+        vfio_pci_hot_reset(vdev, true);
> >>+    }
> >>+
> >>       vdev->aer_reset = false;
> >>   }
> >>
> >>what do you think of this ?
> >I think it might be a bigger problem than that subtle change.  I wonder
> >if we really need a better model of the reset line through the
> >subordinate buses.  When reset is asserted, we'd set a bus_in_reset
> >flag on the bus and trigger downstream bridges to do the same.  Then
> >when the user de-asserts reset, we'd call qbus_reset_all() and
> >propagate it through to downstream buses.  That way the per device
> >reset callback could check to see if the bus is in reset and aer
> >devices can then know to do a bus reset.  Finally, the bus_in_reset
> >flag would be cleared on all the affected buses.  I'm sure there are
> >numerous details missing there, but it seems like it might be a
> >reasonable approach.  Thanks,
> >
> >Alex
> >
> >
> >.
> >
> 
>
chenfan Dec. 25, 2015, 1:18 a.m. UTC | #5
On 12/24/2015 10:34 PM, Michael S. Tsirkin wrote:
> On Thu, Dec 24, 2015 at 01:10:25PM +0800, Chen Fan wrote:
>> On 12/22/2015 05:07 AM, Alex Williamson wrote:
>>> On Fri, 2015-12-18 at 11:29 +0800, Chen Fan wrote:
>>>> On 12/18/2015 04:31 AM, Alex Williamson wrote:
>>>>> On Thu, 2015-12-17 at 09:41 +0800, Cao jin wrote:
>>>>>> From: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>>>
>>>>>> Particularly, For vfio devices, Once need to recovery devices
>>>>>> by bus reset such as AER, we always need to reset the host bus
>>>>>> to recovery the devices under the bus, so we need to add pci
>>>>>> device
>>>>>> callbacks to specify to do host bus reset.
>>>>>>
>>>>>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>>>>>> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
>>>>>> ---
>>>>>>    hw/pci/pci.c         | 18 ++++++++++++++++++
>>>>>>    hw/pci/pci_bridge.c  |  9 +++++++++
>>>>>>    hw/vfio/pci.c        | 26 ++++++++++++++++++++++++++
>>>>>>    hw/vfio/pci.h        |  2 ++
>>>>>>    include/hw/pci/pci.h |  7 +++++++
>>>>>>    5 files changed, 62 insertions(+)
>>>>>>
>>>>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>>>>>> index f6ca6ef..64fa2cc 100644
>>>>>> --- a/hw/pci/pci.c
>>>>>> +++ b/hw/pci/pci.c
>>>>>> @@ -247,6 +247,24 @@ static void pci_do_device_reset(PCIDevice
>>>>>> *dev)
>>>>>>        msix_reset(dev);
>>>>>>    }
>>>>>> +void pci_device_pre_reset(PCIBus *bus, PCIDevice *dev, void
>>>>>> *unused)
>>>>>> +{
>>>>>> +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
>>>>>> +
>>>>>> +    if (dc->pre_reset) {
>>>>>> +        dc->pre_reset(dev);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +void pci_device_post_reset(PCIBus *bus, PCIDevice *dev, void
>>>>>> *unused)
>>>>>> +{
>>>>>> +    PCIDeviceClass *dc = PCI_DEVICE_GET_CLASS(dev);
>>>>>> +
>>>>>> +    if (dc->post_reset) {
>>>>>> +        dc->post_reset(dev);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>>    /*
>>>>>>     * This function is called on #RST and FLR.
>>>>>>     * FLR if PCI_EXP_DEVCTL_BCR_FLR is set
>>>>>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>>>>>> index 40c97b1..ddb76ab 100644
>>>>>> --- a/hw/pci/pci_bridge.c
>>>>>> +++ b/hw/pci/pci_bridge.c
>>>>>> @@ -267,8 +267,17 @@ void pci_bridge_write_config(PCIDevice *d,
>>>>>>        newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
>>>>>>        if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
>>>>>> +        /*
>>>>>> +         * Notify all vfio-pci devices under the bus
>>>>>> +         * should do physical bus reset.
>>>>>> +         */
>>>>>> +        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
>>>>>> +        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>>>>>> +                            pci_device_pre_reset, NULL);
>>>>>>            /* Trigger hot reset on 0->1 transition. */
>>>>>>            qbus_reset_all(&s->sec_bus.qbus);
>>>>>> +        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>>>>>> +                            pci_device_post_reset, NULL);
>>>>>>        }
>>>>>>    }
>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>> index e17dc89..df32618 100644
>>>>>> --- a/hw/vfio/pci.c
>>>>>> +++ b/hw/vfio/pci.c
>>>>>> @@ -39,6 +39,7 @@
>>>>>>    static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
>>>>>>    static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool
>>>>>> enabled);
>>>>>> +static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single);
>>>>>>    /*
>>>>>>     * Disabling BAR mmaping can be slow, but toggling it around
>>>>>> INTx
>>>>>> can
>>>>>> @@ -1888,6 +1889,8 @@ static int
>>>>>> vfio_check_host_bus_reset(VFIOPCIDevice *vdev)
>>>>>>        /* List all affected devices by bus reset */
>>>>>>        devices = &info->devices[0];
>>>>>> +    vdev->single_depend_dev = (info->count == 1);
>>>>>> +
>>>>>>        /* Verify that we have all the groups required */
>>>>>>        for (i = 0; i < info->count; i++) {
>>>>>>            PCIHostDeviceAddress host;
>>>>>> @@ -2029,10 +2032,26 @@ static int
>>>>>> vfio_check_bus_reset(NotifierWithReturn *n, void *opaque)
>>>>>>        return vfio_check_host_bus_reset(vdev);
>>>>>>    }
>>>>>> +static void vfio_aer_pre_reset(PCIDevice *pdev)
>>>>>> +{
>>>>>> +    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>>>> +
>>>>>> +    vdev->aer_reset = true;
>>>>>> +    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
>>>>>> +}
>>>>> Doesn't this lead to multiple host bus resets per guest bus reset
>>>>> in
>>>>> many cases?  It looks like we'll do it once per vfio-pci device,
>>>>> even
>>>>> if those devices are on the same host bus.  That's a 1 second
>>>>> operation
>>>>> per device.  Can we avoid that?  Maybe some sort of sequence ID
>>>>> could
>>>>> help a device figure out whether it's already been reset as part of
>>>>> a
>>>>> dependent device for this particular guest bus reset.  Thanks,
>>>> That's right, I missed this case, but I don't understand the scenario
>>>> how to
>>>> use a sequence ID to mark the device if been reset. can you detail it
>>>> ?
>>> I don't really have a concrete idea for a sequence ID, it was just a
>>> thought that maybe if each bus reset had a sequence ID then devices
>>> could know whether they've already been reset for that sequence ID.
>>>   The basic problem we have is that reset callbacks are per device and
>>> it's difficult to infer which individual resets are part of that bus
>>> reset.  In fact, do we propagate resets correctly down secondary
>>> bridges?  We're triggering off a VM write of the bridge control bus
>>> reset bit triggering from 0->1 and we then call qbus_reset_all() on
>>> that qbus, which I think is just going to call pci_bridge_reset() for
>>> any other bridges, which doesn't do anything about resetting deeper
>>> subordinate buses.  I think that means that if we had a root port with
>>> a switch below it and endpoints below that, if the VM triggered a
>>> secondary bus reset at the root port, the endpoints would never see it,
>>> which is not how real hardware works.
>> Hi Alex,
>>
>>      After think over the subordinate case, I think the qbus_reset_all()
>> should also reset the deeper endpoints. which recursive the qbus and qdev
>> to reset all devices blow sec bus and bridge device independently.
>>
>> MST's suggestion in another mail to add a hot reset callback for sec bus
>> reset
>> should be good to avoid the couple of pre_reset/post_reset callbacks.
>> thanks:)
>>
>> but for the multiple resets triggering on the same bus, I think the virtual
>> bus-level
>> callback do not guarantee the host bus reset only once for all dependent
>> devices.
>> because the host reset dependent devices may not on the same virtual bus.
>> we allowed that the dependent devices address below the bus. so the devices
>> still don't know the host bus is in reset. here I think we should have a
>> host bus
>> model bind to all the dependent devices. when devices do hot reset, we mark
>> the bus in reset. then other dependent devices know the bus is reset. how
>> about
>> this?
>>
>> Thanks,
>> Chen
>
> What does "below bus" mean here?
> Below what?
I just said that assume the interacted devices on the same host bus. we 
allowed
one device with aer was assigned on a virtual bus and the others were 
assigned
on the secondary bus.

Thanks,
Chen

>
>
>>>> additional, there was a mechanism to compute device whether need to
>>>> be reset
>>>> by hot reset. so I simply modify the code as the following:
>>>>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index a9bc67e..42774ca 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -2063,13 +2063,19 @@ static void vfio_aer_pre_reset(PCIDevice
>>>> *pdev)
>>>>        VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>>
>>>>        vdev->aer_reset = true;
>>>> -    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
>>>> +    vdev->vbasedev.needs_reset = true;
>>>>    }
>>>>
>>>>    static void vfio_aer_post_reset(PCIDevice *pdev)
>>>>    {
>>>>        VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
>>>>
>>>> +    if (!vdev->single_depend_dev && vdev->vbasedev.needs_reset) {
>>>> +        vfio_pci_hot_reset(vdev, false);
>>>> +    } else {
>>>> +        vfio_pci_hot_reset(vdev, true);
>>>> +    }
>>>> +
>>>>        vdev->aer_reset = false;
>>>>    }
>>>>
>>>> what do you think of this ?
>>> I think it might be a bigger problem than that subtle change.  I wonder
>>> if we really need a better model of the reset line through the
>>> subordinate buses.  When reset is asserted, we'd set a bus_in_reset
>>> flag on the bus and trigger downstream bridges to do the same.  Then
>>> when the user de-asserts reset, we'd call qbus_reset_all() and
>>> propagate it through to downstream buses.  That way the per device
>>> reset callback could check to see if the bus is in reset and aer
>>> devices can then know to do a bus reset.  Finally, the bus_in_reset
>>> flag would be cleared on all the affected buses.  I'm sure there are
>>> numerous details missing there, but it seems like it might be a
>>> reasonable approach.  Thanks,
>>>
>>> Alex
>>>
>>>
>>> .
>>>
>>
>
> .
>
diff mbox

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a9bc67e..42774ca 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2063,13 +2063,19 @@  static void vfio_aer_pre_reset(PCIDevice *pdev)
      VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);

      vdev->aer_reset = true;
-    vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+    vdev->vbasedev.needs_reset = true;
  }

  static void vfio_aer_post_reset(PCIDevice *pdev)
  {
      VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);

+    if (!vdev->single_depend_dev && vdev->vbasedev.needs_reset) {
+        vfio_pci_hot_reset(vdev, false);
+    } else {
+        vfio_pci_hot_reset(vdev, true);
+    }
+
      vdev->aer_reset = false;
  }