Patchwork [v2,2/5] acpi_piix4: Fix PCI hotplug race

login
register
mail settings
Submitter Alex Williamson
Date April 5, 2012, 5:07 p.m.
Message ID <20120405170715.14105.7389.stgit@bling.home>
Download mbox | patch
Permalink /patch/151220/
State New
Headers show

Comments

Alex Williamson - April 5, 2012, 5:07 p.m.
As Michael Tsirkin demonstrated, current PCI hotplug is vulnerable
to a few races.  The first is a race with other hotplug operations
because we clear the up & down registers at each event.  If a new
event comes before the last is processed, up/down is cleared and
the event is lost.

To fix this for the down register, we create a life cycle for
the event request that starts with the hot unplug request in
piix4_device_hotplug() and ends when the device is ejected.
This allows us to mask and clear individual bits, preserving them
against races.  For the up register, we have no clear end point
for when the event is finished.  We could modify the BIOS to
acknowledge the bit and clear it, but this creates BIOS compatibiliy
issues without offering a complete solution.  Instead we note that
gratuitous ACPI device checks are not harmful, which allows us to
issue a device check for every slot.  We know which slots are present
and we know which slots are hotpluggable, so we can easily reduce
this to a more manageable set for the guest.

The other race Michael noted was that an unplug request followed
by reset may also lose the eject notification, which may also
result in the eject request being lost which a subsequent add
or remove.  Once we're in reset, the device is unused and we can
flush the queue of device removals ourselves.  Previously if a
device_del was issued to a guest without ACPI PCI hotplug support,
it was necessary to shutdown the guest to recover the device.
With this, a guest reboot is sufficient.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 hw/acpi_piix4.c |   74 +++++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 53 insertions(+), 21 deletions(-)
Marcelo Tosatti - April 8, 2012, 4:57 a.m.
On Thu, Apr 05, 2012 at 11:07:15AM -0600, Alex Williamson wrote:
> As Michael Tsirkin demonstrated, current PCI hotplug is vulnerable
> to a few races.  The first is a race with other hotplug operations
> because we clear the up & down registers at each event.  If a new
> event comes before the last is processed, up/down is cleared and
> the event is lost.
> 
> To fix this for the down register, we create a life cycle for
> the event request that starts with the hot unplug request in
> piix4_device_hotplug() and ends when the device is ejected.
> This allows us to mask and clear individual bits, preserving them
> against races.  For the up register, we have no clear end point
> for when the event is finished.  We could modify the BIOS to
> acknowledge the bit and clear it, but this creates BIOS compatibiliy
> issues without offering a complete solution.  Instead we note that
> gratuitous ACPI device checks are not harmful, which allows us to
> issue a device check for every slot.  We know which slots are present
> and we know which slots are hotpluggable, so we can easily reduce
> this to a more manageable set for the guest.
> 
> The other race Michael noted was that an unplug request followed
> by reset may also lose the eject notification, which may also
> result in the eject request being lost which a subsequent add
> or remove.  Once we're in reset, the device is unused and we can
> flush the queue of device removals ourselves.  Previously if a
> device_del was issued to a guest without ACPI PCI hotplug support,
> it was necessary to shutdown the guest to recover the device.
> With this, a guest reboot is sufficient.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  hw/acpi_piix4.c |   74 +++++++++++++++++++++++++++++++++++++++----------------
>  1 files changed, 53 insertions(+), 21 deletions(-)
> 
> diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
> index 5e8b261..0e7af51 100644
> --- a/hw/acpi_piix4.c
> +++ b/hw/acpi_piix4.c
> @@ -48,7 +48,7 @@
>  #define PIIX4_PCI_HOTPLUG_STATUS 2
>  
>  struct pci_status {
> -    uint32_t up;
> +    uint32_t up; /* deprecated, maintained for migration compatibility */
>      uint32_t down;
>  };
>  
> @@ -70,6 +70,7 @@ typedef struct PIIX4PMState {
>      /* for pci hotplug */
>      struct pci_status pci0_status;
>      uint32_t pci0_hotplug_enable;
> +    uint32_t pci0_slot_device_present;

Why can't you use the "up" field for this? Fail to see the point
of the new variable.


>  } PIIX4PMState;
>  
>  static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s);
> @@ -205,6 +206,17 @@ static void pm_write_config(PCIDevice *d,
>          pm_io_space_update((PIIX4PMState *)d);
>  }
>  
> +static void vmstate_pci_status_pre_save(void *opaque)
> +{
> +    struct pci_status *pci0_status = opaque;
> +    PIIX4PMState *s = container_of(pci0_status, PIIX4PMState, pci0_status);
> +
> +    /* We no longer track up, so build a safe value for migrating
> +     * to a version that still does... of course these might get lost
> +     * by an old buggy implementation, but we try. */
> +    pci0_status->up = s->pci0_slot_device_present & s->pci0_hotplug_enable;
> +}
> +
>  static int vmstate_acpi_post_load(void *opaque, int version_id)
>  {
>      PIIX4PMState *s = opaque;
> @@ -241,6 +253,7 @@ static const VMStateDescription vmstate_pci_status = {
>      .version_id = 1,
>      .minimum_version_id = 1,
>      .minimum_version_id_old = 1,
> +    .pre_save = vmstate_pci_status_pre_save,
>      .fields      = (VMStateField []) {
>          VMSTATE_UINT32(up, struct pci_status),
>          VMSTATE_UINT32(down, struct pci_status),
> @@ -269,13 +282,38 @@ static const VMStateDescription vmstate_acpi = {
>      }
>  };
>  
> +static void acpi_piix_eject_slot(PIIX4PMState *s, unsigned slots)
> +{
> +    DeviceState *qdev, *next;
> +    BusState *bus = qdev_get_parent_bus(&s->dev.qdev);
> +    int slot = ffs(slots) - 1;
> +
> +    /* Mark request as complete */
> +    s->pci0_status.down &= ~(1U << slot);
> +
> +    QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> +        PCIDevice *dev = PCI_DEVICE(qdev);
> +        PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
> +        if (PCI_SLOT(dev->devfn) == slot && !pc->no_hotplug) {
> +            s->pci0_slot_device_present &= ~(1U << slot);
> +            qdev_free(qdev);
> +        }
> +    }
> +}
> +
>  static void piix4_update_hotplug(PIIX4PMState *s)
>  {
>      PCIDevice *dev = &s->dev;
>      BusState *bus = qdev_get_parent_bus(&dev->qdev);
>      DeviceState *qdev, *next;
>  
> +    /* Execute any pending removes during reset */
> +    while (s->pci0_status.down) {
> +        acpi_piix_eject_slot(s, s->pci0_status.down);
> +    }
> +
>      s->pci0_hotplug_enable = ~0;
> +    s->pci0_slot_device_present = 0;
>  
>      QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
>          PCIDevice *pdev = PCI_DEVICE(qdev);
> @@ -283,8 +321,10 @@ static void piix4_update_hotplug(PIIX4PMState *s)
>          int slot = PCI_SLOT(pdev->devfn);
>  
>          if (pc->no_hotplug) {
> -            s->pci0_hotplug_enable &= ~(1 << slot);
> +            s->pci0_hotplug_enable &= ~(1U << slot);
>          }
> +
> +        s->pci0_slot_device_present |= (1U << slot);
>      }
>  }
>  
> @@ -452,7 +492,11 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val)
>  static uint32_t pci_up_read(void *opaque, uint32_t addr)
>  {
>      PIIX4PMState *s = opaque;
> -    uint32_t val = s->pci0_status.up;
> +    uint32_t val;
> +
> +    /* Manufacture an "up" value to cause a device check on any hotplug
> +     * slot with a device.  Extra device checks are harmless. */
> +    val = s->pci0_slot_device_present & s->pci0_hotplug_enable;
>  
>      PIIX4_DPRINTF("pci_up_read %x\n", val);
>      return val;
> @@ -475,18 +519,7 @@ static uint32_t pciej_read(void *opaque, uint32_t addr)
>  
>  static void pciej_write(void *opaque, uint32_t addr, uint32_t val)
>  {
> -    BusState *bus = opaque;
> -    DeviceState *qdev, *next;
> -    int slot = ffs(val) - 1;
> -
> -    QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> -        PCIDevice *dev = PCI_DEVICE(qdev);
> -        PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
> -        if (PCI_SLOT(dev->devfn) == slot && !pc->no_hotplug) {
> -            qdev_free(qdev);
> -        }
> -    }
> -
> +    acpi_piix_eject_slot(opaque, val);
>  
>      PIIX4_DPRINTF("pciej write %x <== %d\n", addr, val);
>  }
> @@ -516,8 +549,8 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
>      register_ioport_read(PCI_UP_BASE, 4, 4, pci_up_read, s);
>      register_ioport_read(PCI_DOWN_BASE, 4, 4, pci_down_read, s);
>  
> -    register_ioport_write(PCI_EJ_BASE, 4, 4, pciej_write, bus);
> -    register_ioport_read(PCI_EJ_BASE, 4, 4,  pciej_read, bus);
> +    register_ioport_write(PCI_EJ_BASE, 4, 4, pciej_write, s);
> +    register_ioport_read(PCI_EJ_BASE, 4, 4,  pciej_read, s);
>  
>      register_ioport_write(PCI_RMV_BASE, 4, 4, pcirmv_write, s);
>      register_ioport_read(PCI_RMV_BASE, 4, 4,  pcirmv_read, s);
> @@ -528,13 +561,13 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
>  static void enable_device(PIIX4PMState *s, int slot)
>  {
>      s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
> -    s->pci0_status.up |= (1 << slot);
> +    s->pci0_slot_device_present |= (1U << slot);
>  }
>  
>  static void disable_device(PIIX4PMState *s, int slot)
>  {
>      s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
> -    s->pci0_status.down |= (1 << slot);
> +    s->pci0_status.down |= (1U << slot);
>  }
>  
>  static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
> @@ -548,11 +581,10 @@ static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
>       * it is present on boot, no hotplug event is necessary. We do send an
>       * event when the device is disabled later. */
>      if (state == PCI_COLDPLUG_ENABLED) {
> +        s->pci0_slot_device_present |= (1U << slot);

Why is this necessary?
Alex Williamson - April 10, 2012, 3:14 p.m.
On Sun, 2012-04-08 at 01:57 -0300, Marcelo Tosatti wrote:
> On Thu, Apr 05, 2012 at 11:07:15AM -0600, Alex Williamson wrote:
> > As Michael Tsirkin demonstrated, current PCI hotplug is vulnerable
> > to a few races.  The first is a race with other hotplug operations
> > because we clear the up & down registers at each event.  If a new
> > event comes before the last is processed, up/down is cleared and
> > the event is lost.
> > 
> > To fix this for the down register, we create a life cycle for
> > the event request that starts with the hot unplug request in
> > piix4_device_hotplug() and ends when the device is ejected.
> > This allows us to mask and clear individual bits, preserving them
> > against races.  For the up register, we have no clear end point
> > for when the event is finished.  We could modify the BIOS to
> > acknowledge the bit and clear it, but this creates BIOS compatibiliy
> > issues without offering a complete solution.  Instead we note that
> > gratuitous ACPI device checks are not harmful, which allows us to
> > issue a device check for every slot.  We know which slots are present
> > and we know which slots are hotpluggable, so we can easily reduce
> > this to a more manageable set for the guest.
> > 
> > The other race Michael noted was that an unplug request followed
> > by reset may also lose the eject notification, which may also
> > result in the eject request being lost which a subsequent add
> > or remove.  Once we're in reset, the device is unused and we can
> > flush the queue of device removals ourselves.  Previously if a
> > device_del was issued to a guest without ACPI PCI hotplug support,
> > it was necessary to shutdown the guest to recover the device.
> > With this, a guest reboot is sufficient.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> >  hw/acpi_piix4.c |   74 +++++++++++++++++++++++++++++++++++++++----------------
> >  1 files changed, 53 insertions(+), 21 deletions(-)
> > 
> > diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
> > index 5e8b261..0e7af51 100644
> > --- a/hw/acpi_piix4.c
> > +++ b/hw/acpi_piix4.c
> > @@ -48,7 +48,7 @@
> >  #define PIIX4_PCI_HOTPLUG_STATUS 2
> >  
> >  struct pci_status {
> > -    uint32_t up;
> > +    uint32_t up; /* deprecated, maintained for migration compatibility */
> >      uint32_t down;
> >  };
> >  
> > @@ -70,6 +70,7 @@ typedef struct PIIX4PMState {
> >      /* for pci hotplug */
> >      struct pci_status pci0_status;
> >      uint32_t pci0_hotplug_enable;
> > +    uint32_t pci0_slot_device_present;
> 
> Why can't you use the "up" field for this? Fail to see the point
> of the new variable.

I suppose we could.  Initially I was creating the present device bitmap
dynamically, but walking the device tree on every access seemed
excessive versus updating it runtime.  Since there's no need to migrate
this and we don't want to have it clobbered by an incoming migration, I
didn't think to re-use the "up" field.  We could do it, but it would
mean introducing a post_load function that effectively just calls
piix4_update_hotplug to recreate it, which seems like an unnecessary
effort to avoid using an extra 4 bytes of memory.

> 
> >  } PIIX4PMState;
> >  
> >  static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s);
> > @@ -205,6 +206,17 @@ static void pm_write_config(PCIDevice *d,
> >          pm_io_space_update((PIIX4PMState *)d);
> >  }
> >  
> > +static void vmstate_pci_status_pre_save(void *opaque)
> > +{
> > +    struct pci_status *pci0_status = opaque;
> > +    PIIX4PMState *s = container_of(pci0_status, PIIX4PMState, pci0_status);
> > +
> > +    /* We no longer track up, so build a safe value for migrating
> > +     * to a version that still does... of course these might get lost
> > +     * by an old buggy implementation, but we try. */
> > +    pci0_status->up = s->pci0_slot_device_present & s->pci0_hotplug_enable;
> > +}
> > +
> >  static int vmstate_acpi_post_load(void *opaque, int version_id)
> >  {
> >      PIIX4PMState *s = opaque;
> > @@ -241,6 +253,7 @@ static const VMStateDescription vmstate_pci_status = {
> >      .version_id = 1,
> >      .minimum_version_id = 1,
> >      .minimum_version_id_old = 1,
> > +    .pre_save = vmstate_pci_status_pre_save,
> >      .fields      = (VMStateField []) {
> >          VMSTATE_UINT32(up, struct pci_status),
> >          VMSTATE_UINT32(down, struct pci_status),
> > @@ -269,13 +282,38 @@ static const VMStateDescription vmstate_acpi = {
> >      }
> >  };
> >  
> > +static void acpi_piix_eject_slot(PIIX4PMState *s, unsigned slots)
> > +{
> > +    DeviceState *qdev, *next;
> > +    BusState *bus = qdev_get_parent_bus(&s->dev.qdev);
> > +    int slot = ffs(slots) - 1;
> > +
> > +    /* Mark request as complete */
> > +    s->pci0_status.down &= ~(1U << slot);
> > +
> > +    QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> > +        PCIDevice *dev = PCI_DEVICE(qdev);
> > +        PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
> > +        if (PCI_SLOT(dev->devfn) == slot && !pc->no_hotplug) {
> > +            s->pci0_slot_device_present &= ~(1U << slot);
> > +            qdev_free(qdev);
> > +        }
> > +    }
> > +}
> > +
> >  static void piix4_update_hotplug(PIIX4PMState *s)
> >  {
> >      PCIDevice *dev = &s->dev;
> >      BusState *bus = qdev_get_parent_bus(&dev->qdev);
> >      DeviceState *qdev, *next;
> >  
> > +    /* Execute any pending removes during reset */
> > +    while (s->pci0_status.down) {
> > +        acpi_piix_eject_slot(s, s->pci0_status.down);
> > +    }
> > +
> >      s->pci0_hotplug_enable = ~0;
> > +    s->pci0_slot_device_present = 0;
> >  
> >      QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> >          PCIDevice *pdev = PCI_DEVICE(qdev);
> > @@ -283,8 +321,10 @@ static void piix4_update_hotplug(PIIX4PMState *s)
> >          int slot = PCI_SLOT(pdev->devfn);
> >  
> >          if (pc->no_hotplug) {
> > -            s->pci0_hotplug_enable &= ~(1 << slot);
> > +            s->pci0_hotplug_enable &= ~(1U << slot);
> >          }
> > +
> > +        s->pci0_slot_device_present |= (1U << slot);
> >      }
> >  }
> >  
> > @@ -452,7 +492,11 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val)
> >  static uint32_t pci_up_read(void *opaque, uint32_t addr)
> >  {
> >      PIIX4PMState *s = opaque;
> > -    uint32_t val = s->pci0_status.up;
> > +    uint32_t val;
> > +
> > +    /* Manufacture an "up" value to cause a device check on any hotplug
> > +     * slot with a device.  Extra device checks are harmless. */
> > +    val = s->pci0_slot_device_present & s->pci0_hotplug_enable;
> >  
> >      PIIX4_DPRINTF("pci_up_read %x\n", val);
> >      return val;
> > @@ -475,18 +519,7 @@ static uint32_t pciej_read(void *opaque, uint32_t addr)
> >  
> >  static void pciej_write(void *opaque, uint32_t addr, uint32_t val)
> >  {
> > -    BusState *bus = opaque;
> > -    DeviceState *qdev, *next;
> > -    int slot = ffs(val) - 1;
> > -
> > -    QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> > -        PCIDevice *dev = PCI_DEVICE(qdev);
> > -        PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
> > -        if (PCI_SLOT(dev->devfn) == slot && !pc->no_hotplug) {
> > -            qdev_free(qdev);
> > -        }
> > -    }
> > -
> > +    acpi_piix_eject_slot(opaque, val);
> >  
> >      PIIX4_DPRINTF("pciej write %x <== %d\n", addr, val);
> >  }
> > @@ -516,8 +549,8 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
> >      register_ioport_read(PCI_UP_BASE, 4, 4, pci_up_read, s);
> >      register_ioport_read(PCI_DOWN_BASE, 4, 4, pci_down_read, s);
> >  
> > -    register_ioport_write(PCI_EJ_BASE, 4, 4, pciej_write, bus);
> > -    register_ioport_read(PCI_EJ_BASE, 4, 4,  pciej_read, bus);
> > +    register_ioport_write(PCI_EJ_BASE, 4, 4, pciej_write, s);
> > +    register_ioport_read(PCI_EJ_BASE, 4, 4,  pciej_read, s);
> >  
> >      register_ioport_write(PCI_RMV_BASE, 4, 4, pcirmv_write, s);
> >      register_ioport_read(PCI_RMV_BASE, 4, 4,  pcirmv_read, s);
> > @@ -528,13 +561,13 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
> >  static void enable_device(PIIX4PMState *s, int slot)
> >  {
> >      s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
> > -    s->pci0_status.up |= (1 << slot);
> > +    s->pci0_slot_device_present |= (1U << slot);
> >  }
> >  
> >  static void disable_device(PIIX4PMState *s, int slot)
> >  {
> >      s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
> > -    s->pci0_status.down |= (1 << slot);
> > +    s->pci0_status.down |= (1U << slot);
> >  }
> >  
> >  static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
> > @@ -548,11 +581,10 @@ static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
> >       * it is present on boot, no hotplug event is necessary. We do send an
> >       * event when the device is disabled later. */
> >      if (state == PCI_COLDPLUG_ENABLED) {
> > +        s->pci0_slot_device_present |= (1U << slot);
> 
> Why is this necessary?

I couldn't convince myself that it was completely redundant with
piix4_update_hotplug called from reset, so I took the paranoia approach.
If we always do a reset after all the cold plug devices are added, we
can drop it.  Is that the case?  Thanks,

Alex
Michael S. Tsirkin - April 10, 2012, 3:19 p.m.
On Tue, Apr 10, 2012 at 09:14:00AM -0600, Alex Williamson wrote:
> On Sun, 2012-04-08 at 01:57 -0300, Marcelo Tosatti wrote:
> > On Thu, Apr 05, 2012 at 11:07:15AM -0600, Alex Williamson wrote:
> > > As Michael Tsirkin demonstrated, current PCI hotplug is vulnerable
> > > to a few races.  The first is a race with other hotplug operations
> > > because we clear the up & down registers at each event.  If a new
> > > event comes before the last is processed, up/down is cleared and
> > > the event is lost.
> > > 
> > > To fix this for the down register, we create a life cycle for
> > > the event request that starts with the hot unplug request in
> > > piix4_device_hotplug() and ends when the device is ejected.
> > > This allows us to mask and clear individual bits, preserving them
> > > against races.  For the up register, we have no clear end point
> > > for when the event is finished.  We could modify the BIOS to
> > > acknowledge the bit and clear it, but this creates BIOS compatibiliy
> > > issues without offering a complete solution.  Instead we note that
> > > gratuitous ACPI device checks are not harmful, which allows us to
> > > issue a device check for every slot.  We know which slots are present
> > > and we know which slots are hotpluggable, so we can easily reduce
> > > this to a more manageable set for the guest.
> > > 
> > > The other race Michael noted was that an unplug request followed
> > > by reset may also lose the eject notification, which may also
> > > result in the eject request being lost which a subsequent add
> > > or remove.  Once we're in reset, the device is unused and we can
> > > flush the queue of device removals ourselves.  Previously if a
> > > device_del was issued to a guest without ACPI PCI hotplug support,
> > > it was necessary to shutdown the guest to recover the device.
> > > With this, a guest reboot is sufficient.
> > > 
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > ---
> > > 
> > >  hw/acpi_piix4.c |   74 +++++++++++++++++++++++++++++++++++++++----------------
> > >  1 files changed, 53 insertions(+), 21 deletions(-)
> > > 
> > > diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
> > > index 5e8b261..0e7af51 100644
> > > --- a/hw/acpi_piix4.c
> > > +++ b/hw/acpi_piix4.c
> > > @@ -48,7 +48,7 @@
> > >  #define PIIX4_PCI_HOTPLUG_STATUS 2
> > >  
> > >  struct pci_status {
> > > -    uint32_t up;
> > > +    uint32_t up; /* deprecated, maintained for migration compatibility */
> > >      uint32_t down;
> > >  };
> > >  
> > > @@ -70,6 +70,7 @@ typedef struct PIIX4PMState {
> > >      /* for pci hotplug */
> > >      struct pci_status pci0_status;
> > >      uint32_t pci0_hotplug_enable;
> > > +    uint32_t pci0_slot_device_present;
> > 
> > Why can't you use the "up" field for this? Fail to see the point
> > of the new variable.
> 
> I suppose we could.  Initially I was creating the present device bitmap
> dynamically, but walking the device tree on every access seemed
> excessive versus updating it runtime.  Since there's no need to migrate
> this and we don't want to have it clobbered by an incoming migration, I
> didn't think to re-use the "up" field.  We could do it, but it would
> mean introducing a post_load function that effectively just calls
> piix4_update_hotplug to recreate it, which seems like an unnecessary
> effort to avoid using an extra 4 bytes of memory.

It's probably harmless if we do let it be clobbered by migration
though: worst case we lose an event and that might have
happened before migration :)

I don't mind either way though.

> > 
> > >  } PIIX4PMState;
> > >  
> > >  static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s);
> > > @@ -205,6 +206,17 @@ static void pm_write_config(PCIDevice *d,
> > >          pm_io_space_update((PIIX4PMState *)d);
> > >  }
> > >  
> > > +static void vmstate_pci_status_pre_save(void *opaque)
> > > +{
> > > +    struct pci_status *pci0_status = opaque;
> > > +    PIIX4PMState *s = container_of(pci0_status, PIIX4PMState, pci0_status);
> > > +
> > > +    /* We no longer track up, so build a safe value for migrating
> > > +     * to a version that still does... of course these might get lost
> > > +     * by an old buggy implementation, but we try. */
> > > +    pci0_status->up = s->pci0_slot_device_present & s->pci0_hotplug_enable;
> > > +}
> > > +
> > >  static int vmstate_acpi_post_load(void *opaque, int version_id)
> > >  {
> > >      PIIX4PMState *s = opaque;
> > > @@ -241,6 +253,7 @@ static const VMStateDescription vmstate_pci_status = {
> > >      .version_id = 1,
> > >      .minimum_version_id = 1,
> > >      .minimum_version_id_old = 1,
> > > +    .pre_save = vmstate_pci_status_pre_save,
> > >      .fields      = (VMStateField []) {
> > >          VMSTATE_UINT32(up, struct pci_status),
> > >          VMSTATE_UINT32(down, struct pci_status),
> > > @@ -269,13 +282,38 @@ static const VMStateDescription vmstate_acpi = {
> > >      }
> > >  };
> > >  
> > > +static void acpi_piix_eject_slot(PIIX4PMState *s, unsigned slots)
> > > +{
> > > +    DeviceState *qdev, *next;
> > > +    BusState *bus = qdev_get_parent_bus(&s->dev.qdev);
> > > +    int slot = ffs(slots) - 1;
> > > +
> > > +    /* Mark request as complete */
> > > +    s->pci0_status.down &= ~(1U << slot);
> > > +
> > > +    QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> > > +        PCIDevice *dev = PCI_DEVICE(qdev);
> > > +        PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
> > > +        if (PCI_SLOT(dev->devfn) == slot && !pc->no_hotplug) {
> > > +            s->pci0_slot_device_present &= ~(1U << slot);
> > > +            qdev_free(qdev);
> > > +        }
> > > +    }
> > > +}
> > > +
> > >  static void piix4_update_hotplug(PIIX4PMState *s)
> > >  {
> > >      PCIDevice *dev = &s->dev;
> > >      BusState *bus = qdev_get_parent_bus(&dev->qdev);
> > >      DeviceState *qdev, *next;
> > >  
> > > +    /* Execute any pending removes during reset */
> > > +    while (s->pci0_status.down) {
> > > +        acpi_piix_eject_slot(s, s->pci0_status.down);
> > > +    }
> > > +
> > >      s->pci0_hotplug_enable = ~0;
> > > +    s->pci0_slot_device_present = 0;
> > >  
> > >      QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> > >          PCIDevice *pdev = PCI_DEVICE(qdev);
> > > @@ -283,8 +321,10 @@ static void piix4_update_hotplug(PIIX4PMState *s)
> > >          int slot = PCI_SLOT(pdev->devfn);
> > >  
> > >          if (pc->no_hotplug) {
> > > -            s->pci0_hotplug_enable &= ~(1 << slot);
> > > +            s->pci0_hotplug_enable &= ~(1U << slot);
> > >          }
> > > +
> > > +        s->pci0_slot_device_present |= (1U << slot);
> > >      }
> > >  }
> > >  
> > > @@ -452,7 +492,11 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val)
> > >  static uint32_t pci_up_read(void *opaque, uint32_t addr)
> > >  {
> > >      PIIX4PMState *s = opaque;
> > > -    uint32_t val = s->pci0_status.up;
> > > +    uint32_t val;
> > > +
> > > +    /* Manufacture an "up" value to cause a device check on any hotplug
> > > +     * slot with a device.  Extra device checks are harmless. */
> > > +    val = s->pci0_slot_device_present & s->pci0_hotplug_enable;
> > >  
> > >      PIIX4_DPRINTF("pci_up_read %x\n", val);
> > >      return val;
> > > @@ -475,18 +519,7 @@ static uint32_t pciej_read(void *opaque, uint32_t addr)
> > >  
> > >  static void pciej_write(void *opaque, uint32_t addr, uint32_t val)
> > >  {
> > > -    BusState *bus = opaque;
> > > -    DeviceState *qdev, *next;
> > > -    int slot = ffs(val) - 1;
> > > -
> > > -    QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> > > -        PCIDevice *dev = PCI_DEVICE(qdev);
> > > -        PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
> > > -        if (PCI_SLOT(dev->devfn) == slot && !pc->no_hotplug) {
> > > -            qdev_free(qdev);
> > > -        }
> > > -    }
> > > -
> > > +    acpi_piix_eject_slot(opaque, val);
> > >  
> > >      PIIX4_DPRINTF("pciej write %x <== %d\n", addr, val);
> > >  }
> > > @@ -516,8 +549,8 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
> > >      register_ioport_read(PCI_UP_BASE, 4, 4, pci_up_read, s);
> > >      register_ioport_read(PCI_DOWN_BASE, 4, 4, pci_down_read, s);
> > >  
> > > -    register_ioport_write(PCI_EJ_BASE, 4, 4, pciej_write, bus);
> > > -    register_ioport_read(PCI_EJ_BASE, 4, 4,  pciej_read, bus);
> > > +    register_ioport_write(PCI_EJ_BASE, 4, 4, pciej_write, s);
> > > +    register_ioport_read(PCI_EJ_BASE, 4, 4,  pciej_read, s);
> > >  
> > >      register_ioport_write(PCI_RMV_BASE, 4, 4, pcirmv_write, s);
> > >      register_ioport_read(PCI_RMV_BASE, 4, 4,  pcirmv_read, s);
> > > @@ -528,13 +561,13 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
> > >  static void enable_device(PIIX4PMState *s, int slot)
> > >  {
> > >      s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
> > > -    s->pci0_status.up |= (1 << slot);
> > > +    s->pci0_slot_device_present |= (1U << slot);
> > >  }
> > >  
> > >  static void disable_device(PIIX4PMState *s, int slot)
> > >  {
> > >      s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
> > > -    s->pci0_status.down |= (1 << slot);
> > > +    s->pci0_status.down |= (1U << slot);
> > >  }
> > >  
> > >  static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
> > > @@ -548,11 +581,10 @@ static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
> > >       * it is present on boot, no hotplug event is necessary. We do send an
> > >       * event when the device is disabled later. */
> > >      if (state == PCI_COLDPLUG_ENABLED) {
> > > +        s->pci0_slot_device_present |= (1U << slot);
> > 
> > Why is this necessary?
> 
> I couldn't convince myself that it was completely redundant with
> piix4_update_hotplug called from reset, so I took the paranoia approach.
> If we always do a reset after all the cold plug devices are added, we
> can drop it.  Is that the case?  Thanks,
> 
> Alex
>
Alex Williamson - April 10, 2012, 3:35 p.m.
On Tue, 2012-04-10 at 18:19 +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 10, 2012 at 09:14:00AM -0600, Alex Williamson wrote:
> > On Sun, 2012-04-08 at 01:57 -0300, Marcelo Tosatti wrote:
> > > On Thu, Apr 05, 2012 at 11:07:15AM -0600, Alex Williamson wrote:
> > > > As Michael Tsirkin demonstrated, current PCI hotplug is vulnerable
> > > > to a few races.  The first is a race with other hotplug operations
> > > > because we clear the up & down registers at each event.  If a new
> > > > event comes before the last is processed, up/down is cleared and
> > > > the event is lost.
> > > > 
> > > > To fix this for the down register, we create a life cycle for
> > > > the event request that starts with the hot unplug request in
> > > > piix4_device_hotplug() and ends when the device is ejected.
> > > > This allows us to mask and clear individual bits, preserving them
> > > > against races.  For the up register, we have no clear end point
> > > > for when the event is finished.  We could modify the BIOS to
> > > > acknowledge the bit and clear it, but this creates BIOS compatibiliy
> > > > issues without offering a complete solution.  Instead we note that
> > > > gratuitous ACPI device checks are not harmful, which allows us to
> > > > issue a device check for every slot.  We know which slots are present
> > > > and we know which slots are hotpluggable, so we can easily reduce
> > > > this to a more manageable set for the guest.
> > > > 
> > > > The other race Michael noted was that an unplug request followed
> > > > by reset may also lose the eject notification, which may also
> > > > result in the eject request being lost which a subsequent add
> > > > or remove.  Once we're in reset, the device is unused and we can
> > > > flush the queue of device removals ourselves.  Previously if a
> > > > device_del was issued to a guest without ACPI PCI hotplug support,
> > > > it was necessary to shutdown the guest to recover the device.
> > > > With this, a guest reboot is sufficient.
> > > > 
> > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > ---
> > > > 
> > > >  hw/acpi_piix4.c |   74 +++++++++++++++++++++++++++++++++++++++----------------
> > > >  1 files changed, 53 insertions(+), 21 deletions(-)
> > > > 
> > > > diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
> > > > index 5e8b261..0e7af51 100644
> > > > --- a/hw/acpi_piix4.c
> > > > +++ b/hw/acpi_piix4.c
> > > > @@ -48,7 +48,7 @@
> > > >  #define PIIX4_PCI_HOTPLUG_STATUS 2
> > > >  
> > > >  struct pci_status {
> > > > -    uint32_t up;
> > > > +    uint32_t up; /* deprecated, maintained for migration compatibility */
> > > >      uint32_t down;
> > > >  };
> > > >  
> > > > @@ -70,6 +70,7 @@ typedef struct PIIX4PMState {
> > > >      /* for pci hotplug */
> > > >      struct pci_status pci0_status;
> > > >      uint32_t pci0_hotplug_enable;
> > > > +    uint32_t pci0_slot_device_present;
> > > 
> > > Why can't you use the "up" field for this? Fail to see the point
> > > of the new variable.
> > 
> > I suppose we could.  Initially I was creating the present device bitmap
> > dynamically, but walking the device tree on every access seemed
> > excessive versus updating it runtime.  Since there's no need to migrate
> > this and we don't want to have it clobbered by an incoming migration, I
> > didn't think to re-use the "up" field.  We could do it, but it would
> > mean introducing a post_load function that effectively just calls
> > piix4_update_hotplug to recreate it, which seems like an unnecessary
> > effort to avoid using an extra 4 bytes of memory.
> 
> It's probably harmless if we do let it be clobbered by migration
> though: worst case we lose an event and that might have
> happened before migration :)

Perhaps that's another way to manage it, just let it be a lazy
accumulation of anything that has been hotadded.  That makes debug hard
though because we can't know what should be set without looking that the
entire history of the vm, versus something like "device present", which
we can verify at any point in time.  Thanks,

Alex

> I don't mind either way though.
> 
> > > 
> > > >  } PIIX4PMState;
> > > >  
> > > >  static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s);
> > > > @@ -205,6 +206,17 @@ static void pm_write_config(PCIDevice *d,
> > > >          pm_io_space_update((PIIX4PMState *)d);
> > > >  }
> > > >  
> > > > +static void vmstate_pci_status_pre_save(void *opaque)
> > > > +{
> > > > +    struct pci_status *pci0_status = opaque;
> > > > +    PIIX4PMState *s = container_of(pci0_status, PIIX4PMState, pci0_status);
> > > > +
> > > > +    /* We no longer track up, so build a safe value for migrating
> > > > +     * to a version that still does... of course these might get lost
> > > > +     * by an old buggy implementation, but we try. */
> > > > +    pci0_status->up = s->pci0_slot_device_present & s->pci0_hotplug_enable;
> > > > +}
> > > > +
> > > >  static int vmstate_acpi_post_load(void *opaque, int version_id)
> > > >  {
> > > >      PIIX4PMState *s = opaque;
> > > > @@ -241,6 +253,7 @@ static const VMStateDescription vmstate_pci_status = {
> > > >      .version_id = 1,
> > > >      .minimum_version_id = 1,
> > > >      .minimum_version_id_old = 1,
> > > > +    .pre_save = vmstate_pci_status_pre_save,
> > > >      .fields      = (VMStateField []) {
> > > >          VMSTATE_UINT32(up, struct pci_status),
> > > >          VMSTATE_UINT32(down, struct pci_status),
> > > > @@ -269,13 +282,38 @@ static const VMStateDescription vmstate_acpi = {
> > > >      }
> > > >  };
> > > >  
> > > > +static void acpi_piix_eject_slot(PIIX4PMState *s, unsigned slots)
> > > > +{
> > > > +    DeviceState *qdev, *next;
> > > > +    BusState *bus = qdev_get_parent_bus(&s->dev.qdev);
> > > > +    int slot = ffs(slots) - 1;
> > > > +
> > > > +    /* Mark request as complete */
> > > > +    s->pci0_status.down &= ~(1U << slot);
> > > > +
> > > > +    QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> > > > +        PCIDevice *dev = PCI_DEVICE(qdev);
> > > > +        PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
> > > > +        if (PCI_SLOT(dev->devfn) == slot && !pc->no_hotplug) {
> > > > +            s->pci0_slot_device_present &= ~(1U << slot);
> > > > +            qdev_free(qdev);
> > > > +        }
> > > > +    }
> > > > +}
> > > > +
> > > >  static void piix4_update_hotplug(PIIX4PMState *s)
> > > >  {
> > > >      PCIDevice *dev = &s->dev;
> > > >      BusState *bus = qdev_get_parent_bus(&dev->qdev);
> > > >      DeviceState *qdev, *next;
> > > >  
> > > > +    /* Execute any pending removes during reset */
> > > > +    while (s->pci0_status.down) {
> > > > +        acpi_piix_eject_slot(s, s->pci0_status.down);
> > > > +    }
> > > > +
> > > >      s->pci0_hotplug_enable = ~0;
> > > > +    s->pci0_slot_device_present = 0;
> > > >  
> > > >      QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> > > >          PCIDevice *pdev = PCI_DEVICE(qdev);
> > > > @@ -283,8 +321,10 @@ static void piix4_update_hotplug(PIIX4PMState *s)
> > > >          int slot = PCI_SLOT(pdev->devfn);
> > > >  
> > > >          if (pc->no_hotplug) {
> > > > -            s->pci0_hotplug_enable &= ~(1 << slot);
> > > > +            s->pci0_hotplug_enable &= ~(1U << slot);
> > > >          }
> > > > +
> > > > +        s->pci0_slot_device_present |= (1U << slot);
> > > >      }
> > > >  }
> > > >  
> > > > @@ -452,7 +492,11 @@ static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val)
> > > >  static uint32_t pci_up_read(void *opaque, uint32_t addr)
> > > >  {
> > > >      PIIX4PMState *s = opaque;
> > > > -    uint32_t val = s->pci0_status.up;
> > > > +    uint32_t val;
> > > > +
> > > > +    /* Manufacture an "up" value to cause a device check on any hotplug
> > > > +     * slot with a device.  Extra device checks are harmless. */
> > > > +    val = s->pci0_slot_device_present & s->pci0_hotplug_enable;
> > > >  
> > > >      PIIX4_DPRINTF("pci_up_read %x\n", val);
> > > >      return val;
> > > > @@ -475,18 +519,7 @@ static uint32_t pciej_read(void *opaque, uint32_t addr)
> > > >  
> > > >  static void pciej_write(void *opaque, uint32_t addr, uint32_t val)
> > > >  {
> > > > -    BusState *bus = opaque;
> > > > -    DeviceState *qdev, *next;
> > > > -    int slot = ffs(val) - 1;
> > > > -
> > > > -    QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
> > > > -        PCIDevice *dev = PCI_DEVICE(qdev);
> > > > -        PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
> > > > -        if (PCI_SLOT(dev->devfn) == slot && !pc->no_hotplug) {
> > > > -            qdev_free(qdev);
> > > > -        }
> > > > -    }
> > > > -
> > > > +    acpi_piix_eject_slot(opaque, val);
> > > >  
> > > >      PIIX4_DPRINTF("pciej write %x <== %d\n", addr, val);
> > > >  }
> > > > @@ -516,8 +549,8 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
> > > >      register_ioport_read(PCI_UP_BASE, 4, 4, pci_up_read, s);
> > > >      register_ioport_read(PCI_DOWN_BASE, 4, 4, pci_down_read, s);
> > > >  
> > > > -    register_ioport_write(PCI_EJ_BASE, 4, 4, pciej_write, bus);
> > > > -    register_ioport_read(PCI_EJ_BASE, 4, 4,  pciej_read, bus);
> > > > +    register_ioport_write(PCI_EJ_BASE, 4, 4, pciej_write, s);
> > > > +    register_ioport_read(PCI_EJ_BASE, 4, 4,  pciej_read, s);
> > > >  
> > > >      register_ioport_write(PCI_RMV_BASE, 4, 4, pcirmv_write, s);
> > > >      register_ioport_read(PCI_RMV_BASE, 4, 4,  pcirmv_read, s);
> > > > @@ -528,13 +561,13 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
> > > >  static void enable_device(PIIX4PMState *s, int slot)
> > > >  {
> > > >      s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
> > > > -    s->pci0_status.up |= (1 << slot);
> > > > +    s->pci0_slot_device_present |= (1U << slot);
> > > >  }
> > > >  
> > > >  static void disable_device(PIIX4PMState *s, int slot)
> > > >  {
> > > >      s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
> > > > -    s->pci0_status.down |= (1 << slot);
> > > > +    s->pci0_status.down |= (1U << slot);
> > > >  }
> > > >  
> > > >  static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
> > > > @@ -548,11 +581,10 @@ static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
> > > >       * it is present on boot, no hotplug event is necessary. We do send an
> > > >       * event when the device is disabled later. */
> > > >      if (state == PCI_COLDPLUG_ENABLED) {
> > > > +        s->pci0_slot_device_present |= (1U << slot);
> > > 
> > > Why is this necessary?
> > 
> > I couldn't convince myself that it was completely redundant with
> > piix4_update_hotplug called from reset, so I took the paranoia approach.
> > If we always do a reset after all the cold plug devices are added, we
> > can drop it.  Is that the case?  Thanks,
> > 
> > Alex
> >
Michael S. Tsirkin - April 10, 2012, 3:45 p.m.
On Tue, Apr 10, 2012 at 09:35:39AM -0600, Alex Williamson wrote:
> On Tue, 2012-04-10 at 18:19 +0300, Michael S. Tsirkin wrote:
> > On Tue, Apr 10, 2012 at 09:14:00AM -0600, Alex Williamson wrote:
> > > On Sun, 2012-04-08 at 01:57 -0300, Marcelo Tosatti wrote:
> > > > On Thu, Apr 05, 2012 at 11:07:15AM -0600, Alex Williamson wrote:
> > > > > As Michael Tsirkin demonstrated, current PCI hotplug is vulnerable
> > > > > to a few races.  The first is a race with other hotplug operations
> > > > > because we clear the up & down registers at each event.  If a new
> > > > > event comes before the last is processed, up/down is cleared and
> > > > > the event is lost.
> > > > > 
> > > > > To fix this for the down register, we create a life cycle for
> > > > > the event request that starts with the hot unplug request in
> > > > > piix4_device_hotplug() and ends when the device is ejected.
> > > > > This allows us to mask and clear individual bits, preserving them
> > > > > against races.  For the up register, we have no clear end point
> > > > > for when the event is finished.  We could modify the BIOS to
> > > > > acknowledge the bit and clear it, but this creates BIOS compatibiliy
> > > > > issues without offering a complete solution.  Instead we note that
> > > > > gratuitous ACPI device checks are not harmful, which allows us to
> > > > > issue a device check for every slot.  We know which slots are present
> > > > > and we know which slots are hotpluggable, so we can easily reduce
> > > > > this to a more manageable set for the guest.
> > > > > 
> > > > > The other race Michael noted was that an unplug request followed
> > > > > by reset may also lose the eject notification, which may also
> > > > > result in the eject request being lost which a subsequent add
> > > > > or remove.  Once we're in reset, the device is unused and we can
> > > > > flush the queue of device removals ourselves.  Previously if a
> > > > > device_del was issued to a guest without ACPI PCI hotplug support,
> > > > > it was necessary to shutdown the guest to recover the device.
> > > > > With this, a guest reboot is sufficient.
> > > > > 
> > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > ---
> > > > > 
> > > > >  hw/acpi_piix4.c |   74 +++++++++++++++++++++++++++++++++++++++----------------
> > > > >  1 files changed, 53 insertions(+), 21 deletions(-)
> > > > > 
> > > > > diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
> > > > > index 5e8b261..0e7af51 100644
> > > > > --- a/hw/acpi_piix4.c
> > > > > +++ b/hw/acpi_piix4.c
> > > > > @@ -48,7 +48,7 @@
> > > > >  #define PIIX4_PCI_HOTPLUG_STATUS 2
> > > > >  
> > > > >  struct pci_status {
> > > > > -    uint32_t up;
> > > > > +    uint32_t up; /* deprecated, maintained for migration compatibility */
> > > > >      uint32_t down;
> > > > >  };
> > > > >  
> > > > > @@ -70,6 +70,7 @@ typedef struct PIIX4PMState {
> > > > >      /* for pci hotplug */
> > > > >      struct pci_status pci0_status;
> > > > >      uint32_t pci0_hotplug_enable;
> > > > > +    uint32_t pci0_slot_device_present;
> > > > 
> > > > Why can't you use the "up" field for this? Fail to see the point
> > > > of the new variable.
> > > 
> > > I suppose we could.  Initially I was creating the present device bitmap
> > > dynamically, but walking the device tree on every access seemed
> > > excessive versus updating it runtime.  Since there's no need to migrate
> > > this and we don't want to have it clobbered by an incoming migration, I
> > > didn't think to re-use the "up" field.  We could do it, but it would
> > > mean introducing a post_load function that effectively just calls
> > > piix4_update_hotplug to recreate it, which seems like an unnecessary
> > > effort to avoid using an extra 4 bytes of memory.
> > 
> > It's probably harmless if we do let it be clobbered by migration
> > though: worst case we lose an event and that might have
> > happened before migration :)
> 
> Perhaps that's another way to manage it, just let it be a lazy
> accumulation of anything that has been hotadded.  That makes debug hard
> though because we can't know what should be set without looking that the
> entire history of the vm, versus something like "device present", which
> we can verify at any point in time.  Thanks,
> 
> Alex

Good point. I think Marcelo's comment can be addressed
if you rename up to up_ignored, old_up, up_legacy or something
like that.
Alex Williamson - April 10, 2012, 6:38 p.m.
On Tue, 2012-04-10 at 18:45 +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 10, 2012 at 09:35:39AM -0600, Alex Williamson wrote:
> > On Tue, 2012-04-10 at 18:19 +0300, Michael S. Tsirkin wrote:
> > > On Tue, Apr 10, 2012 at 09:14:00AM -0600, Alex Williamson wrote:
> > > > On Sun, 2012-04-08 at 01:57 -0300, Marcelo Tosatti wrote:
> > > > > On Thu, Apr 05, 2012 at 11:07:15AM -0600, Alex Williamson wrote:
> > > > > > As Michael Tsirkin demonstrated, current PCI hotplug is vulnerable
> > > > > > to a few races.  The first is a race with other hotplug operations
> > > > > > because we clear the up & down registers at each event.  If a new
> > > > > > event comes before the last is processed, up/down is cleared and
> > > > > > the event is lost.
> > > > > > 
> > > > > > To fix this for the down register, we create a life cycle for
> > > > > > the event request that starts with the hot unplug request in
> > > > > > piix4_device_hotplug() and ends when the device is ejected.
> > > > > > This allows us to mask and clear individual bits, preserving them
> > > > > > against races.  For the up register, we have no clear end point
> > > > > > for when the event is finished.  We could modify the BIOS to
> > > > > > acknowledge the bit and clear it, but this creates BIOS compatibiliy
> > > > > > issues without offering a complete solution.  Instead we note that
> > > > > > gratuitous ACPI device checks are not harmful, which allows us to
> > > > > > issue a device check for every slot.  We know which slots are present
> > > > > > and we know which slots are hotpluggable, so we can easily reduce
> > > > > > this to a more manageable set for the guest.
> > > > > > 
> > > > > > The other race Michael noted was that an unplug request followed
> > > > > > by reset may also lose the eject notification, which may also
> > > > > > result in the eject request being lost which a subsequent add
> > > > > > or remove.  Once we're in reset, the device is unused and we can
> > > > > > flush the queue of device removals ourselves.  Previously if a
> > > > > > device_del was issued to a guest without ACPI PCI hotplug support,
> > > > > > it was necessary to shutdown the guest to recover the device.
> > > > > > With this, a guest reboot is sufficient.
> > > > > > 
> > > > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > > > ---
> > > > > > 
> > > > > >  hw/acpi_piix4.c |   74 +++++++++++++++++++++++++++++++++++++++----------------
> > > > > >  1 files changed, 53 insertions(+), 21 deletions(-)
> > > > > > 
> > > > > > diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
> > > > > > index 5e8b261..0e7af51 100644
> > > > > > --- a/hw/acpi_piix4.c
> > > > > > +++ b/hw/acpi_piix4.c
> > > > > > @@ -48,7 +48,7 @@
> > > > > >  #define PIIX4_PCI_HOTPLUG_STATUS 2
> > > > > >  
> > > > > >  struct pci_status {
> > > > > > -    uint32_t up;
> > > > > > +    uint32_t up; /* deprecated, maintained for migration compatibility */
> > > > > >      uint32_t down;
> > > > > >  };
> > > > > >  
> > > > > > @@ -70,6 +70,7 @@ typedef struct PIIX4PMState {
> > > > > >      /* for pci hotplug */
> > > > > >      struct pci_status pci0_status;
> > > > > >      uint32_t pci0_hotplug_enable;
> > > > > > +    uint32_t pci0_slot_device_present;
> > > > > 
> > > > > Why can't you use the "up" field for this? Fail to see the point
> > > > > of the new variable.
> > > > 
> > > > I suppose we could.  Initially I was creating the present device bitmap
> > > > dynamically, but walking the device tree on every access seemed
> > > > excessive versus updating it runtime.  Since there's no need to migrate
> > > > this and we don't want to have it clobbered by an incoming migration, I
> > > > didn't think to re-use the "up" field.  We could do it, but it would
> > > > mean introducing a post_load function that effectively just calls
> > > > piix4_update_hotplug to recreate it, which seems like an unnecessary
> > > > effort to avoid using an extra 4 bytes of memory.
> > > 
> > > It's probably harmless if we do let it be clobbered by migration
> > > though: worst case we lose an event and that might have
> > > happened before migration :)
> > 
> > Perhaps that's another way to manage it, just let it be a lazy
> > accumulation of anything that has been hotadded.  That makes debug hard
> > though because we can't know what should be set without looking that the
> > entire history of the vm, versus something like "device present", which
> > we can verify at any point in time.  Thanks,
> > 
> > Alex
> 
> Good point. I think Marcelo's comment can be addressed
> if you rename up to up_ignored, old_up, up_legacy or something
> like that.

Easy enough to rename .up to .old_up.  Marcelo, are you looking for more
than that?  Thanks,

Alex
Marcelo Tosatti - April 11, 2012, 2:37 a.m.
On Tue, Apr 10, 2012 at 12:38:09PM -0600, Alex Williamson wrote:
> > > > It's probably harmless if we do let it be clobbered by migration
> > > > though: worst case we lose an event and that might have
> > > > happened before migration :)
> > > 
> > > Perhaps that's another way to manage it, just let it be a lazy
> > > accumulation of anything that has been hotadded.  That makes debug hard
> > > though because we can't know what should be set without looking that the
> > > entire history of the vm, versus something like "device present", which
> > > we can verify at any point in time.  Thanks,
> > > 
> > > Alex
> > 
> > Good point. I think Marcelo's comment can be addressed
> > if you rename up to up_ignored, old_up, up_legacy or something
> > like that.
> 
> Easy enough to rename .up to .old_up.  Marcelo, are you looking for more
> than that?  Thanks,
> 
> Alex

Its fine the way it is actually, since the patch adds a comment
mentioning that "up" is not used (and one can easily check the logs to
understand).

Patchset looks good to me, thanks.

Patch

diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
index 5e8b261..0e7af51 100644
--- a/hw/acpi_piix4.c
+++ b/hw/acpi_piix4.c
@@ -48,7 +48,7 @@ 
 #define PIIX4_PCI_HOTPLUG_STATUS 2
 
 struct pci_status {
-    uint32_t up;
+    uint32_t up; /* deprecated, maintained for migration compatibility */
     uint32_t down;
 };
 
@@ -70,6 +70,7 @@  typedef struct PIIX4PMState {
     /* for pci hotplug */
     struct pci_status pci0_status;
     uint32_t pci0_hotplug_enable;
+    uint32_t pci0_slot_device_present;
 } PIIX4PMState;
 
 static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s);
@@ -205,6 +206,17 @@  static void pm_write_config(PCIDevice *d,
         pm_io_space_update((PIIX4PMState *)d);
 }
 
+static void vmstate_pci_status_pre_save(void *opaque)
+{
+    struct pci_status *pci0_status = opaque;
+    PIIX4PMState *s = container_of(pci0_status, PIIX4PMState, pci0_status);
+
+    /* We no longer track up, so build a safe value for migrating
+     * to a version that still does... of course these might get lost
+     * by an old buggy implementation, but we try. */
+    pci0_status->up = s->pci0_slot_device_present & s->pci0_hotplug_enable;
+}
+
 static int vmstate_acpi_post_load(void *opaque, int version_id)
 {
     PIIX4PMState *s = opaque;
@@ -241,6 +253,7 @@  static const VMStateDescription vmstate_pci_status = {
     .version_id = 1,
     .minimum_version_id = 1,
     .minimum_version_id_old = 1,
+    .pre_save = vmstate_pci_status_pre_save,
     .fields      = (VMStateField []) {
         VMSTATE_UINT32(up, struct pci_status),
         VMSTATE_UINT32(down, struct pci_status),
@@ -269,13 +282,38 @@  static const VMStateDescription vmstate_acpi = {
     }
 };
 
+static void acpi_piix_eject_slot(PIIX4PMState *s, unsigned slots)
+{
+    DeviceState *qdev, *next;
+    BusState *bus = qdev_get_parent_bus(&s->dev.qdev);
+    int slot = ffs(slots) - 1;
+
+    /* Mark request as complete */
+    s->pci0_status.down &= ~(1U << slot);
+
+    QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
+        PCIDevice *dev = PCI_DEVICE(qdev);
+        PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
+        if (PCI_SLOT(dev->devfn) == slot && !pc->no_hotplug) {
+            s->pci0_slot_device_present &= ~(1U << slot);
+            qdev_free(qdev);
+        }
+    }
+}
+
 static void piix4_update_hotplug(PIIX4PMState *s)
 {
     PCIDevice *dev = &s->dev;
     BusState *bus = qdev_get_parent_bus(&dev->qdev);
     DeviceState *qdev, *next;
 
+    /* Execute any pending removes during reset */
+    while (s->pci0_status.down) {
+        acpi_piix_eject_slot(s, s->pci0_status.down);
+    }
+
     s->pci0_hotplug_enable = ~0;
+    s->pci0_slot_device_present = 0;
 
     QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
         PCIDevice *pdev = PCI_DEVICE(qdev);
@@ -283,8 +321,10 @@  static void piix4_update_hotplug(PIIX4PMState *s)
         int slot = PCI_SLOT(pdev->devfn);
 
         if (pc->no_hotplug) {
-            s->pci0_hotplug_enable &= ~(1 << slot);
+            s->pci0_hotplug_enable &= ~(1U << slot);
         }
+
+        s->pci0_slot_device_present |= (1U << slot);
     }
 }
 
@@ -452,7 +492,11 @@  static void gpe_writeb(void *opaque, uint32_t addr, uint32_t val)
 static uint32_t pci_up_read(void *opaque, uint32_t addr)
 {
     PIIX4PMState *s = opaque;
-    uint32_t val = s->pci0_status.up;
+    uint32_t val;
+
+    /* Manufacture an "up" value to cause a device check on any hotplug
+     * slot with a device.  Extra device checks are harmless. */
+    val = s->pci0_slot_device_present & s->pci0_hotplug_enable;
 
     PIIX4_DPRINTF("pci_up_read %x\n", val);
     return val;
@@ -475,18 +519,7 @@  static uint32_t pciej_read(void *opaque, uint32_t addr)
 
 static void pciej_write(void *opaque, uint32_t addr, uint32_t val)
 {
-    BusState *bus = opaque;
-    DeviceState *qdev, *next;
-    int slot = ffs(val) - 1;
-
-    QTAILQ_FOREACH_SAFE(qdev, &bus->children, sibling, next) {
-        PCIDevice *dev = PCI_DEVICE(qdev);
-        PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
-        if (PCI_SLOT(dev->devfn) == slot && !pc->no_hotplug) {
-            qdev_free(qdev);
-        }
-    }
-
+    acpi_piix_eject_slot(opaque, val);
 
     PIIX4_DPRINTF("pciej write %x <== %d\n", addr, val);
 }
@@ -516,8 +549,8 @@  static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
     register_ioport_read(PCI_UP_BASE, 4, 4, pci_up_read, s);
     register_ioport_read(PCI_DOWN_BASE, 4, 4, pci_down_read, s);
 
-    register_ioport_write(PCI_EJ_BASE, 4, 4, pciej_write, bus);
-    register_ioport_read(PCI_EJ_BASE, 4, 4,  pciej_read, bus);
+    register_ioport_write(PCI_EJ_BASE, 4, 4, pciej_write, s);
+    register_ioport_read(PCI_EJ_BASE, 4, 4,  pciej_read, s);
 
     register_ioport_write(PCI_RMV_BASE, 4, 4, pcirmv_write, s);
     register_ioport_read(PCI_RMV_BASE, 4, 4,  pcirmv_read, s);
@@ -528,13 +561,13 @@  static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
 static void enable_device(PIIX4PMState *s, int slot)
 {
     s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
-    s->pci0_status.up |= (1 << slot);
+    s->pci0_slot_device_present |= (1U << slot);
 }
 
 static void disable_device(PIIX4PMState *s, int slot)
 {
     s->ar.gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
-    s->pci0_status.down |= (1 << slot);
+    s->pci0_status.down |= (1U << slot);
 }
 
 static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
@@ -548,11 +581,10 @@  static int piix4_device_hotplug(DeviceState *qdev, PCIDevice *dev,
      * it is present on boot, no hotplug event is necessary. We do send an
      * event when the device is disabled later. */
     if (state == PCI_COLDPLUG_ENABLED) {
+        s->pci0_slot_device_present |= (1U << slot);
         return 0;
     }
 
-    s->pci0_status.up = 0;
-    s->pci0_status.down = 0;
     if (state == PCI_HOTPLUG_ENABLED) {
         enable_device(s, slot);
     } else {