diff mbox

[1/1] Fix device listener interface for PCI to PCI bridges

Message ID 1432802797-21950-1-git-send-email-dslutz@verizon.com
State New
Headers show

Commit Message

Don Slutz May 28, 2015, 8:46 a.m. UTC
The commit 707ff80021ccd7a68f4b3d2c44eebf87efbb41c4 assumed that a
PCI device has a static address.  This is not true for PCI devices
that are on the secondary bus of a PCI to PCI bridge.

BIOS and/or guest OS can change the secondary bus number to a new
value at any time.

When a PCI to PCI bridge bridge is reset, the secondary bus number
is set to zero.  Normally the BIOS will set it to 255 during PCI bus
scanning so that only the PCI devices on the root can be accessed
via bus 0.  Later it is set to a number between 1 and 254.

Adjust xen_map_pcidev() to not register with Xen for secondary bus
numbers 0 and 255.

Extend the device listener interface to be called when ever the
secondary bus number is set to a usable value.  This includes
a call on unrealize if the secondary bus number was valid.

Signed-off-by: Don Slutz <dslutz@verizon.com>
---

Note: Right now hvmloader in Xen does not handle PCI to PCI bridges
and so SeaBIOS does not have access to PCI device(s) on secondary
buses.  How ever domU can setup the secondary bus(es) and this patch
will restore access to these PCI devices.

 hw/core/qdev.c              | 10 ++++++++++
 hw/pci/pci_bridge.c         | 30 ++++++++++++++++++++++++++++++
 include/hw/qdev-core.h      |  2 ++
 include/hw/xen/xen_common.h | 31 +++++++++++++++++++++++++------
 trace-events                |  1 +
 5 files changed, 68 insertions(+), 6 deletions(-)

Comments

Michael S. Tsirkin May 28, 2015, 9:30 a.m. UTC | #1
On Thu, May 28, 2015 at 04:46:37AM -0400, Don Slutz wrote:
> The commit 707ff80021ccd7a68f4b3d2c44eebf87efbb41c4 assumed that a
> PCI device has a static address.  This is not true for PCI devices
> that are on the secondary bus of a PCI to PCI bridge.
> 
> BIOS and/or guest OS can change the secondary bus number to a new
> value at any time.
> 
> When a PCI to PCI bridge bridge is reset, the secondary bus number
> is set to zero.  Normally the BIOS will set it to 255 during PCI bus
> scanning so that only the PCI devices on the root can be accessed
> via bus 0.  Later it is set to a number between 1 and 254.
> 
> Adjust xen_map_pcidev() to not register with Xen for secondary bus
> numbers 0 and 255.
> 
> Extend the device listener interface to be called when ever the
> secondary bus number is set to a usable value.  This includes
> a call on unrealize if the secondary bus number was valid.
> 
> Signed-off-by: Don Slutz <dslutz@verizon.com>
> ---
> 
> Note: Right now hvmloader in Xen does not handle PCI to PCI bridges
> and so SeaBIOS does not have access to PCI device(s) on secondary
> buses.  How ever domU can setup the secondary bus(es) and this patch
> will restore access to these PCI devices.
> 
>  hw/core/qdev.c              | 10 ++++++++++
>  hw/pci/pci_bridge.c         | 30 ++++++++++++++++++++++++++++++
>  include/hw/qdev-core.h      |  2 ++
>  include/hw/xen/xen_common.h | 31 +++++++++++++++++++++++++------
>  trace-events                |  1 +
>  5 files changed, 68 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/core/qdev.c b/hw/core/qdev.c
> index b0f0f84..6a514ee 100644
> --- a/hw/core/qdev.c
> +++ b/hw/core/qdev.c
> @@ -239,6 +239,16 @@ void device_listener_unregister(DeviceListener *listener)
>      QTAILQ_REMOVE(&device_listeners, listener, link);
>  }
>  
> +void device_listener_realize(DeviceState *dev)
> +{
> +    DEVICE_LISTENER_CALL(realize, Forward, dev);
> +}
> +
> +void device_listener_unrealize(DeviceState *dev)
> +{
> +    DEVICE_LISTENER_CALL(unrealize, Forward, dev);
> +}
> +
>  static void device_realize(DeviceState *dev, Error **errp)
>  {
>      DeviceClass *dc = DEVICE_GET_CLASS(dev);


This looks wrong.  Devices not accessible to config cycles are still
accessible e.g. to memory or IO.  It's not the same as unrealize.

You need some other API that makes sense,
probably pci specific.



> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> index 40c97b1..042680d 100644
> --- a/hw/pci/pci_bridge.c
> +++ b/hw/pci/pci_bridge.c
> @@ -241,6 +241,18 @@ void pci_bridge_update_mappings(PCIBridge *br)
>      pci_bridge_region_cleanup(br, w);
>  }
>  
> +static void pci_bridge_realize_sub(PCIBus *b, PCIDevice *d,
> +                                   void *opaque)
> +{
> +    device_listener_realize(DEVICE(d));
> +}
> +
> +static void pci_bridge_unrealize_sub(PCIBus *b, PCIDevice *d,
> +                                     void *opaque)
> +{
> +    device_listener_unrealize(DEVICE(d));
> +}
> +
>  /* default write_config function for PCI-to-PCI bridge */
>  void pci_bridge_write_config(PCIDevice *d,
>                               uint32_t address, uint32_t val, int len)
> @@ -248,6 +260,8 @@ void pci_bridge_write_config(PCIDevice *d,
>      PCIBridge *s = PCI_BRIDGE(d);
>      uint16_t oldctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
>      uint16_t newctl;
> +    uint8_t oldbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
> +    uint8_t newbus;
>  
>      pci_default_write_config(d, address, val, len);
>  
> @@ -265,6 +279,22 @@ void pci_bridge_write_config(PCIDevice *d,
>          pci_bridge_update_mappings(s);
>      }
>  
> +    newbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
> +    if (newbus != oldbus) {
> +        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
> +
> +        if (oldbus && oldbus != 255) {
> +            pci_set_byte(d->config + PCI_SECONDARY_BUS, oldbus);
> +            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> +                                pci_bridge_unrealize_sub, NULL);
> +            pci_set_byte(d->config + PCI_SECONDARY_BUS, newbus);
> +        }
> +        if (newbus && newbus != 255) {
> +            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> +                                pci_bridge_realize_sub, NULL);
> +        }
> +    }
> +


This is relying on undocumented assumptions and how specific firmware
works. There's nothing special about bus number 255, and 0 is not very
special either (except it happens to be the reset value).

To know whether device is accessible to config cycles, you
really need to scan the hierarchy from root downwards.

>      newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
>      if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
>          /* Trigger hot reset on 0->1 transition. */
> diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
> index d4be92f..8bd38af 100644
> --- a/include/hw/qdev-core.h
> +++ b/include/hw/qdev-core.h
> @@ -387,5 +387,7 @@ static inline bool qbus_is_hotpluggable(BusState *bus)
>  
>  void device_listener_register(DeviceListener *listener);
>  void device_listener_unregister(DeviceListener *listener);
> +void device_listener_realize(DeviceState *dev);
> +void device_listener_unrealize(DeviceState *dev);
>  
>  #endif
> diff --git a/include/hw/xen/xen_common.h b/include/hw/xen/xen_common.h
> index 38f29fb..1c27d51 100644
> --- a/include/hw/xen/xen_common.h
> +++ b/include/hw/xen/xen_common.h
> @@ -347,12 +347,31 @@ static inline void xen_map_pcidev(XenXC xc, domid_t dom,
>                                    ioservid_t ioservid,
>                                    PCIDevice *pci_dev)
>  {
> -    trace_xen_map_pcidev(ioservid, pci_bus_num(pci_dev->bus),
> -                         PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn));
> -    xc_hvm_map_pcidev_to_ioreq_server(xc, dom, ioservid,
> -                                      0, pci_bus_num(pci_dev->bus),
> -                                      PCI_SLOT(pci_dev->devfn),
> -                                      PCI_FUNC(pci_dev->devfn));
> +    if (pci_bus_is_root(pci_dev->bus)) {
> +        trace_xen_map_pcidev(ioservid, 0,
> +                             PCI_SLOT(pci_dev->devfn),
> +                             PCI_FUNC(pci_dev->devfn));
> +        xc_hvm_map_pcidev_to_ioreq_server(xc, dom, ioservid,
> +                                          0, 0,
> +                                          PCI_SLOT(pci_dev->devfn),
> +                                          PCI_FUNC(pci_dev->devfn));
> +    } else {
> +        int subbus = pci_bus_num(pci_dev->bus);
> +
> +        if (subbus && subbus != 255) {
> +            trace_xen_map_pcidev(ioservid, subbus,
> +                                 PCI_SLOT(pci_dev->devfn),
> +                                 PCI_FUNC(pci_dev->devfn));
> +            xc_hvm_map_pcidev_to_ioreq_server(xc, dom, ioservid,
> +                                              0, subbus,
> +                                              PCI_SLOT(pci_dev->devfn),
> +                                              PCI_FUNC(pci_dev->devfn));
> +        } else {
> +            trace_xen_map_pcidev_unknown(ioservid,
> +                                         PCI_SLOT(pci_dev->devfn),
> +                                         PCI_FUNC(pci_dev->devfn));
> +        }
> +    }
>  }
>  
>  static inline void xen_unmap_pcidev(XenXC xc, domid_t dom,
> diff --git a/trace-events b/trace-events
> index 11387c3..63c0361 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -931,6 +931,7 @@ xen_unmap_mmio_range(uint32_t id, uint64_t start_addr, uint64_t end_addr) "id: %
>  xen_map_portio_range(uint32_t id, uint64_t start_addr, uint64_t end_addr) "id: %u start: %#"PRIx64" end: %#"PRIx64
>  xen_unmap_portio_range(uint32_t id, uint64_t start_addr, uint64_t end_addr) "id: %u start: %#"PRIx64" end: %#"PRIx64
>  xen_map_pcidev(uint32_t id, uint8_t bus, uint8_t dev, uint8_t func) "id: %u bdf: %02x.%02x.%02x"
> +xen_map_pcidev_unknown(uint32_t id, uint8_t dev, uint8_t func) "id: %u bdf: ??.%02x.%02x"
>  xen_unmap_pcidev(uint32_t id, uint8_t bus, uint8_t dev, uint8_t func) "id: %u bdf: %02x.%02x.%02x"
>  
>  # xen-mapcache.c
> -- 
> 1.8.4
Don Slutz May 28, 2015, 11:25 a.m. UTC | #2
On 05/28/15 05:30, Michael S. Tsirkin wrote:
> On Thu, May 28, 2015 at 04:46:37AM -0400, Don Slutz wrote:
>> The commit 707ff80021ccd7a68f4b3d2c44eebf87efbb41c4 assumed that a
>> PCI device has a static address.  This is not true for PCI devices
>> that are on the secondary bus of a PCI to PCI bridge.
>>
>> BIOS and/or guest OS can change the secondary bus number to a new
>> value at any time.
>>
>> When a PCI to PCI bridge bridge is reset, the secondary bus number
>> is set to zero.  Normally the BIOS will set it to 255 during PCI bus
>> scanning so that only the PCI devices on the root can be accessed
>> via bus 0.  Later it is set to a number between 1 and 254.
>>
>> Adjust xen_map_pcidev() to not register with Xen for secondary bus
>> numbers 0 and 255.
>>
>> Extend the device listener interface to be called when ever the
>> secondary bus number is set to a usable value.  This includes
>> a call on unrealize if the secondary bus number was valid.
>>
>> Signed-off-by: Don Slutz <dslutz@verizon.com>
>> ---
>>
>> Note: Right now hvmloader in Xen does not handle PCI to PCI bridges
>> and so SeaBIOS does not have access to PCI device(s) on secondary
>> buses.  How ever domU can setup the secondary bus(es) and this patch
>> will restore access to these PCI devices.
>>
>>  hw/core/qdev.c              | 10 ++++++++++
>>  hw/pci/pci_bridge.c         | 30 ++++++++++++++++++++++++++++++
>>  include/hw/qdev-core.h      |  2 ++
>>  include/hw/xen/xen_common.h | 31 +++++++++++++++++++++++++------
>>  trace-events                |  1 +
>>  5 files changed, 68 insertions(+), 6 deletions(-)
>>
>> diff --git a/hw/core/qdev.c b/hw/core/qdev.c
>> index b0f0f84..6a514ee 100644
>> --- a/hw/core/qdev.c
>> +++ b/hw/core/qdev.c
>> @@ -239,6 +239,16 @@ void device_listener_unregister(DeviceListener *listener)
>>      QTAILQ_REMOVE(&device_listeners, listener, link);
>>  }
>>  
>> +void device_listener_realize(DeviceState *dev)
>> +{
>> +    DEVICE_LISTENER_CALL(realize, Forward, dev);
>> +}
>> +
>> +void device_listener_unrealize(DeviceState *dev)
>> +{
>> +    DEVICE_LISTENER_CALL(unrealize, Forward, dev);
>> +}
>> +
>>  static void device_realize(DeviceState *dev, Error **errp)
>>  {
>>      DeviceClass *dc = DEVICE_GET_CLASS(dev);
> 
> 
> This looks wrong.  Devices not accessible to config cycles are still
> accessible e.g. to memory or IO.  It's not the same as unrealize.
> 
> You need some other API that makes sense,
> probably pci specific.
> 

If I am understanding you correctly, you would like:

void device_listener_change_pci_bus_num(PCIDevice *d, uint8_t oldbus)
{
    DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
}


> 
> 
>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>> index 40c97b1..042680d 100644
>> --- a/hw/pci/pci_bridge.c
>> +++ b/hw/pci/pci_bridge.c
>> @@ -241,6 +241,18 @@ void pci_bridge_update_mappings(PCIBridge *br)
>>      pci_bridge_region_cleanup(br, w);
>>  }
>>  
>> +static void pci_bridge_realize_sub(PCIBus *b, PCIDevice *d,
>> +                                   void *opaque)
>> +{
>> +    device_listener_realize(DEVICE(d));
>> +}
>> +
>> +static void pci_bridge_unrealize_sub(PCIBus *b, PCIDevice *d,
>> +                                     void *opaque)
>> +{
>> +    device_listener_unrealize(DEVICE(d));
>> +}
>> +
>>  /* default write_config function for PCI-to-PCI bridge */
>>  void pci_bridge_write_config(PCIDevice *d,
>>                               uint32_t address, uint32_t val, int len)
>> @@ -248,6 +260,8 @@ void pci_bridge_write_config(PCIDevice *d,
>>      PCIBridge *s = PCI_BRIDGE(d);
>>      uint16_t oldctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
>>      uint16_t newctl;
>> +    uint8_t oldbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
>> +    uint8_t newbus;
>>  
>>      pci_default_write_config(d, address, val, len);
>>  
>> @@ -265,6 +279,22 @@ void pci_bridge_write_config(PCIDevice *d,
>>          pci_bridge_update_mappings(s);
>>      }
>>  
>> +    newbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
>> +    if (newbus != oldbus) {
>> +        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
>> +
>> +        if (oldbus && oldbus != 255) {
>> +            pci_set_byte(d->config + PCI_SECONDARY_BUS, oldbus);
>> +            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>> +                                pci_bridge_unrealize_sub, NULL);
>> +            pci_set_byte(d->config + PCI_SECONDARY_BUS, newbus);
>> +        }
>> +        if (newbus && newbus != 255) {
>> +            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>> +                                pci_bridge_realize_sub, NULL);
>> +        }
>> +    }
>> +
> 
> 
> This is relying on undocumented assumptions and how specific firmware
> works. There's nothing special about bus number 255, and 0 is not very
> special either (except it happens to be the reset value).
> 

Ok, using the above it would change to (almost):


if (newbus != oldbus) {
    pci_for_each_device(pci_bridge_get_sec_bus(s),
                        pci_bus_num(sec_bus),
                        device_listener_change_pci_bus_num,
                        oldbus);
}

Would it be better to have:

void device_listener_change_pci_bus_num(PCIBus *b, PCIDevice *d, void
*opaque)
{
    uint8_t oldbus = (uint8_t)opaque;
    DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
}

So that the above works, or to add a function to convert args?

> To know whether device is accessible to config cycles, you
> really need to scan the hierarchy from root downwards.
> 

Yes, that is correct.  However what I am doing here is not
changing how QEMU checks if the device is accessible, but
changing what pci config Xen sends to QEMU.  If the change
to PCI_SECONDARY_BUS hides this PCI to PCI bridge, that is
not an issue.


   -Don Slutz
Michael S. Tsirkin May 28, 2015, 12:28 p.m. UTC | #3
On Thu, May 28, 2015 at 07:25:50AM -0400, Don Slutz wrote:
> On 05/28/15 05:30, Michael S. Tsirkin wrote:
> > On Thu, May 28, 2015 at 04:46:37AM -0400, Don Slutz wrote:
> >> The commit 707ff80021ccd7a68f4b3d2c44eebf87efbb41c4 assumed that a
> >> PCI device has a static address.  This is not true for PCI devices
> >> that are on the secondary bus of a PCI to PCI bridge.
> >>
> >> BIOS and/or guest OS can change the secondary bus number to a new
> >> value at any time.
> >>
> >> When a PCI to PCI bridge bridge is reset, the secondary bus number
> >> is set to zero.  Normally the BIOS will set it to 255 during PCI bus
> >> scanning so that only the PCI devices on the root can be accessed
> >> via bus 0.  Later it is set to a number between 1 and 254.
> >>
> >> Adjust xen_map_pcidev() to not register with Xen for secondary bus
> >> numbers 0 and 255.
> >>
> >> Extend the device listener interface to be called when ever the
> >> secondary bus number is set to a usable value.  This includes
> >> a call on unrealize if the secondary bus number was valid.
> >>
> >> Signed-off-by: Don Slutz <dslutz@verizon.com>
> >> ---
> >>
> >> Note: Right now hvmloader in Xen does not handle PCI to PCI bridges
> >> and so SeaBIOS does not have access to PCI device(s) on secondary
> >> buses.  How ever domU can setup the secondary bus(es) and this patch
> >> will restore access to these PCI devices.
> >>
> >>  hw/core/qdev.c              | 10 ++++++++++
> >>  hw/pci/pci_bridge.c         | 30 ++++++++++++++++++++++++++++++
> >>  include/hw/qdev-core.h      |  2 ++
> >>  include/hw/xen/xen_common.h | 31 +++++++++++++++++++++++++------
> >>  trace-events                |  1 +
> >>  5 files changed, 68 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/hw/core/qdev.c b/hw/core/qdev.c
> >> index b0f0f84..6a514ee 100644
> >> --- a/hw/core/qdev.c
> >> +++ b/hw/core/qdev.c
> >> @@ -239,6 +239,16 @@ void device_listener_unregister(DeviceListener *listener)
> >>      QTAILQ_REMOVE(&device_listeners, listener, link);
> >>  }
> >>  
> >> +void device_listener_realize(DeviceState *dev)
> >> +{
> >> +    DEVICE_LISTENER_CALL(realize, Forward, dev);
> >> +}
> >> +
> >> +void device_listener_unrealize(DeviceState *dev)
> >> +{
> >> +    DEVICE_LISTENER_CALL(unrealize, Forward, dev);
> >> +}
> >> +
> >>  static void device_realize(DeviceState *dev, Error **errp)
> >>  {
> >>      DeviceClass *dc = DEVICE_GET_CLASS(dev);
> > 
> > 
> > This looks wrong.  Devices not accessible to config cycles are still
> > accessible e.g. to memory or IO.  It's not the same as unrealize.
> > 
> > You need some other API that makes sense,
> > probably pci specific.
> > 
> 
> If I am understanding you correctly, you would like:
> 
> void device_listener_change_pci_bus_num(PCIDevice *d, uint8_t oldbus)
> {
>     DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
> }
> 

I'm not sure what oldbus is but basically ok.
And it must be invoked whenever bus visibility changes,
not just its number.

> > 
> > 
> >> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> >> index 40c97b1..042680d 100644
> >> --- a/hw/pci/pci_bridge.c
> >> +++ b/hw/pci/pci_bridge.c
> >> @@ -241,6 +241,18 @@ void pci_bridge_update_mappings(PCIBridge *br)
> >>      pci_bridge_region_cleanup(br, w);
> >>  }
> >>  
> >> +static void pci_bridge_realize_sub(PCIBus *b, PCIDevice *d,
> >> +                                   void *opaque)
> >> +{
> >> +    device_listener_realize(DEVICE(d));
> >> +}
> >> +
> >> +static void pci_bridge_unrealize_sub(PCIBus *b, PCIDevice *d,
> >> +                                     void *opaque)
> >> +{
> >> +    device_listener_unrealize(DEVICE(d));
> >> +}
> >> +
> >>  /* default write_config function for PCI-to-PCI bridge */
> >>  void pci_bridge_write_config(PCIDevice *d,
> >>                               uint32_t address, uint32_t val, int len)
> >> @@ -248,6 +260,8 @@ void pci_bridge_write_config(PCIDevice *d,
> >>      PCIBridge *s = PCI_BRIDGE(d);
> >>      uint16_t oldctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
> >>      uint16_t newctl;
> >> +    uint8_t oldbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
> >> +    uint8_t newbus;
> >>  
> >>      pci_default_write_config(d, address, val, len);
> >>  
> >> @@ -265,6 +279,22 @@ void pci_bridge_write_config(PCIDevice *d,
> >>          pci_bridge_update_mappings(s);
> >>      }
> >>  
> >> +    newbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
> >> +    if (newbus != oldbus) {
> >> +        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
> >> +
> >> +        if (oldbus && oldbus != 255) {
> >> +            pci_set_byte(d->config + PCI_SECONDARY_BUS, oldbus);
> >> +            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> >> +                                pci_bridge_unrealize_sub, NULL);
> >> +            pci_set_byte(d->config + PCI_SECONDARY_BUS, newbus);
> >> +        }
> >> +        if (newbus && newbus != 255) {
> >> +            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> >> +                                pci_bridge_realize_sub, NULL);
> >> +        }
> >> +    }
> >> +
> > 
> > 
> > This is relying on undocumented assumptions and how specific firmware
> > works. There's nothing special about bus number 255, and 0 is not very
> > special either (except it happens to be the reset value).
> > 
> 
> Ok, using the above it would change to (almost):
> 
> 
> if (newbus != oldbus) {
>     pci_for_each_device(pci_bridge_get_sec_bus(s),
>                         pci_bus_num(sec_bus),
>                         device_listener_change_pci_bus_num,
>                         oldbus);
> }

Not really because it's not just secondary bus number.
Changing subordinate bus numbers can hide/unhide whole buses.



> Would it be better to have:
> 
> void device_listener_change_pci_bus_num(PCIBus *b, PCIDevice *d, void
> *opaque)
> {
>     uint8_t oldbus = (uint8_t)opaque;
>     DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
> }
> 
> So that the above works, or to add a function to convert args?
> 
> > To know whether device is accessible to config cycles, you
> > really need to scan the hierarchy from root downwards.
> > 
> 
> Yes, that is correct.  However what I am doing here is not
> changing how QEMU checks if the device is accessible, but
> changing what pci config Xen sends to QEMU.  If the change
> to PCI_SECONDARY_BUS hides this PCI to PCI bridge, that is
> not an issue.
> 
> 
>    -Don Slutz
> 
> 

Imagine a bridge with secondary bus number 5
behind another one with subordinate numbers 1-3.
You should not send conf cycles for bus number 5 to qemu.
Don Slutz May 28, 2015, 7:09 p.m. UTC | #4
On 05/28/15 08:28, Michael S. Tsirkin wrote:
> On Thu, May 28, 2015 at 07:25:50AM -0400, Don Slutz wrote:
>> On 05/28/15 05:30, Michael S. Tsirkin wrote:
>>> On Thu, May 28, 2015 at 04:46:37AM -0400, Don Slutz wrote:
>>>> The commit 707ff80021ccd7a68f4b3d2c44eebf87efbb41c4 assumed that a
>>>> PCI device has a static address.  This is not true for PCI devices
>>>> that are on the secondary bus of a PCI to PCI bridge.
>>>>
>>>> BIOS and/or guest OS can change the secondary bus number to a new
>>>> value at any time.
>>>>
>>>> When a PCI to PCI bridge bridge is reset, the secondary bus number
>>>> is set to zero.  Normally the BIOS will set it to 255 during PCI bus
>>>> scanning so that only the PCI devices on the root can be accessed
>>>> via bus 0.  Later it is set to a number between 1 and 254.
>>>>
>>>> Adjust xen_map_pcidev() to not register with Xen for secondary bus
>>>> numbers 0 and 255.
>>>>
>>>> Extend the device listener interface to be called when ever the
>>>> secondary bus number is set to a usable value.  This includes
>>>> a call on unrealize if the secondary bus number was valid.
>>>>
>>>> Signed-off-by: Don Slutz <dslutz@verizon.com>
>>>> ---
>>>>
>>>> Note: Right now hvmloader in Xen does not handle PCI to PCI bridges
>>>> and so SeaBIOS does not have access to PCI device(s) on secondary
>>>> buses.  How ever domU can setup the secondary bus(es) and this patch
>>>> will restore access to these PCI devices.
>>>>
>>>>   hw/core/qdev.c              | 10 ++++++++++
>>>>   hw/pci/pci_bridge.c         | 30 ++++++++++++++++++++++++++++++
>>>>   include/hw/qdev-core.h      |  2 ++
>>>>   include/hw/xen/xen_common.h | 31 +++++++++++++++++++++++++------
>>>>   trace-events                |  1 +
>>>>   5 files changed, 68 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/hw/core/qdev.c b/hw/core/qdev.c
>>>> index b0f0f84..6a514ee 100644
>>>> --- a/hw/core/qdev.c
>>>> +++ b/hw/core/qdev.c
>>>> @@ -239,6 +239,16 @@ void device_listener_unregister(DeviceListener *listener)
>>>>       QTAILQ_REMOVE(&device_listeners, listener, link);
>>>>   }
>>>>   
>>>> +void device_listener_realize(DeviceState *dev)
>>>> +{
>>>> +    DEVICE_LISTENER_CALL(realize, Forward, dev);
>>>> +}
>>>> +
>>>> +void device_listener_unrealize(DeviceState *dev)
>>>> +{
>>>> +    DEVICE_LISTENER_CALL(unrealize, Forward, dev);
>>>> +}
>>>> +
>>>>   static void device_realize(DeviceState *dev, Error **errp)
>>>>   {
>>>>       DeviceClass *dc = DEVICE_GET_CLASS(dev);
>>>
>>> This looks wrong.  Devices not accessible to config cycles are still
>>> accessible e.g. to memory or IO.  It's not the same as unrealize.
>>>
>>> You need some other API that makes sense,
>>> probably pci specific.
>>>
>> If I am understanding you correctly, you would like:
>>
>> void device_listener_change_pci_bus_num(PCIDevice *d, uint8_t oldbus)
>> {
>>      DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
>> }
>>
> I'm not sure what oldbus is but basically ok.

oldbus is the previous value that pci_bus_num(pci_dev->bus) would have 
returned.  Passing it avoids the:

+            pci_set_byte(d->config + PCI_SECONDARY_BUS, oldbus);
+            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
+                                pci_bridge_unrealize_sub, NULL);
+            pci_set_byte(d->config + PCI_SECONDARY_BUS, newbus);

hack.  At least xen wants to know the old value so that an "unrealize" (i.e. unmap in xen terms) can be done for the old value and then pci_bus_num(pci_dev->bus) can be done to get the new mapping.



> And it must be invoked whenever bus visibility changes,
> not just its number.

This is not clear to me.  Maybe Paul Durrant has a better understanding.

  I look at this patch as a bug fix in that QEMU 2.2 and before "work" 
with pci-bridge.  It is only after Paul's change that it stops working.  
Maybe part of what is not clear is that the new routine is called for 
the PCI devices on the secondary bus.

So at start using the example QEMU config (a Xen one):

/usr/lib/xen/bin/qemu-system-i386 \
  -xen-domid \
  13 \
  -chardev \
  socket,id=libxl-cmd,path=/var/run/xen/qmp-libxl-13,server,nowait \
  -no-shutdown \
  -mon \
  chardev=libxl-cmd,mode=control \
  -chardev \
  socket,id=libxenstat-cmd,path=/var/run/xen/qmp-libxenstat-13,server,nowait \
  -mon \
  chardev=libxenstat-cmd,mode=control \
  -nodefaults \
  -name \
  C63-min-tools \
  -vnc \
  127.0.0.1:0,to=99 \
  -display \
  none \
  -serial \
  pty \
  -device \
  cirrus-vga,vgamem_mb=8 \
  -boot \
  order=cda \
  -device \
  vmxnet3,id=nic0,netdev=net0,mac=00:0c:29:86:44:a0 \
  -netdev \
  type=tap,id=net0,ifname=vif13.0-emu,script=no,downscript=no \
  -device \
  e1000,id=nic1,netdev=net1,mac=00:0c:29:86:44:aa \
  -netdev \
  type=tap,id=net1,ifname=vif13.1-emu,script=no,downscript=no \
  -machine \
  xenfv \
  -monitor \
  pty \
  -boot \
  menu=on \
  -device \
  pci-bridge,chassis_nr=1,msi=on,id=pciBridge1.0,addr=0x11.0 \
  -device \
  pci-bridge,chassis_nr=2,msi=on,id=pciBridge5.0,multifunction=on,addr=0x15.0 \
  -device \
  pci-bridge,chassis_nr=3,msi=on,id=pciBridge6.0,multifunction=on,addr=0x16.0 \
  -device \
  pci-bridge,chassis_nr=4,msi=on,id=pciBridge7.0,multifunction=on,bus=pciBridge1.0,addr=0x17.0 \
  -device \
  pci-bridge,chassis_nr=5,msi=on,id=pciBridge8.0,multifunction=on,addr=0x18.0 \
  -device \
  pvscsi,id=scsi0,bus=pciBridge5.0,addr=0x1.0x0 \
  -drive \
  if=none,id=disk0-0,file=/dev/etherd/e500.1 \
  -device \
  scsi-disk,vendor=VMware,ver=1.0,product=Virtual \
  disk,bus=scsi0.0,scsi-id=0,drive=disk0-0 \
  -device \
  pvscsi,id=sas1,bus=pciBridge7.0,addr=0x1.0x0 \
  -drive \
  if=none,id=disk1-1,file=/dev/etherd/e500.2 \
  -device \
  scsi-disk,vendor=VMware,ver=1.0,product=Virtual \
  disk,bus=sas1.0,scsi-id=1,drive=disk1-1 \
  -device \
  pvscsi,id=scsi2,bus=pciBridge1.0,addr=0x3.0x0 \
  -drive \
  if=none,id=disk2-2,file=/dev/etherd/e500.3 \
  -device \
  scsi-disk,vendor=VMware,ver=1.0,product=Virtual \
  disk,bus=scsi2.0,scsi-id=2,drive=disk2-2 \
  -device \
  e1000,id=nic2,netdev=net2,mac=00:0c:29:86:44:b4,bus=pciBridge5.0,addr=0x3.0x0 \
  -netdev \
  type=tap,id=net2,ifname=vif.2-emu,script=/etc/qemu-ifup,downscript=no \
  -device \
  vmxnet3,id=nic3,netdev=net3,mac=00:0c:29:86:44:be,bus=pciBridge5.0,addr=0x4.0x0 \
  -netdev \
  type=tap,id=net3,ifname=vif.3-emu,script=/etc/qemu-ifup,downscript=no \
  -m 1016


Which under Linux (with this patch)looks like:


[root@C63-min-tools-b ~]# lspci
00:00.0 Host bridge: Intel Corporation 440FX - 82441FX PMC [Natoma] (rev 02)
00:01.0 ISA bridge: Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II]
00:01.1 IDE interface: Intel Corporation 82371SB PIIX3 IDE 
[Natoma/Triton II]
00:01.3 Bridge: Intel Corporation 82371AB/EB/MB PIIX4 ACPI (rev 03)
00:02.0 Unassigned class [ff80]: XenSource, Inc. Xen Platform Device 
(rev 01)
00:03.0 VGA compatible controller: Cirrus Logic GD 5446
00:11.0 PCI bridge: Red Hat, Inc. Device 0001
00:15.0 PCI bridge: Red Hat, Inc. Device 0001
00:16.0 PCI bridge: Red Hat, Inc. Device 0001
00:18.0 PCI bridge: Red Hat, Inc. Device 0001
01:03.0 SCSI storage controller: VMware PVSCSI SCSI Controller
01:17.0 PCI bridge: Red Hat, Inc. Device 0001
02:01.0 SCSI storage controller: VMware PVSCSI SCSI Controller
03:01.0 SCSI storage controller: VMware PVSCSI SCSI Controller
03:03.0 Ethernet controller: Intel Corporation 82540EM Gigabit Ethernet 
Controller (rev 03)
03:04.0 Ethernet controller: VMware VMXNET3 Ethernet Controller (rev 01)
[root@C63-min-tools-b ~]# lspci -t
-[0000:00]-+-00.0
            +-01.0
            +-01.1
            +-01.3
            +-02.0
            +-03.0
            +-11.0-[01-02]--+-03.0
            |               \-17.0-[02]----01.0
            +-15.0-[03]--+-01.0
            |            +-03.0
            |            \-04.0
            +-16.0-[04]--
            \-18.0-[05]--
[root@C63-min-tools-b ~]#


The issue that I am attempting to fix is that xen is told (without this 
patch):


xen_map_pcidev id: 1 bdf: 00.00.00
xen_map_pcidev id: 1 bdf: 00.01.00
xen_map_pcidev id: 1 bdf: 00.01.01
xen_map_pcidev id: 1 bdf: 00.01.03
xen_map_pcidev id: 1 bdf: 00.02.00
xen_map_pcidev id: 1 bdf: 00.03.00
xen_map_pcidev id: 1 bdf: 00.04.00
xen_map_pcidev id: 1 bdf: 00.05.00
xen_map_pcidev id: 1 bdf: 00.11.00
xen_map_pcidev id: 1 bdf: 00.15.00
xen_map_pcidev id: 1 bdf: 00.16.00
xen_map_pcidev id: 1 bdf: 00.17.00
xen_map_pcidev id: 1 bdf: 00.18.00
xen_map_pcidev id: 1 bdf: 00.01.00
xen_map_pcidev id: 1 bdf: 00.01.00
xen_map_pcidev id: 1 bdf: 00.03.00
xen_map_pcidev id: 1 bdf: 00.03.00
xen_map_pcidev id: 1 bdf: 00.04.00


Now 00.17.00, 00.01.00, 00.01.00,  00.03.00, 00.03.00, and 00.04.00 are 
just wrong.

After the patch you get (when PCI_SECONDARY_BUS of 00.11.0 is set to 1):


xen_map_pcidev id: 1 bdf: 01.03.00
xen_map_pcidev id: 1 bdf: 01.17.00


and then (when PCI_SECONDARY_BUS of 01.17.0 is set to 2):


xen_map_pcidev id: 1 bdf: 02.01.00


and then (when PCI_SECONDARY_BUS of 00.15.0 is set to 3):


xen_map_pcidev id: 1 bdf: 03.01.00
xen_map_pcidev id: 1 bdf: 03.03.00
xen_map_pcidev id: 1 bdf: 03.04.00


Which fixes the bug I ran into.  Did you want me to speed the time to 
open a QEMU bug?



>>>
>>>> diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
>>>> index 40c97b1..042680d 100644
>>>> --- a/hw/pci/pci_bridge.c
>>>> +++ b/hw/pci/pci_bridge.c
>>>> @@ -241,6 +241,18 @@ void pci_bridge_update_mappings(PCIBridge *br)
>>>>       pci_bridge_region_cleanup(br, w);
>>>>   }
>>>>   
>>>> +static void pci_bridge_realize_sub(PCIBus *b, PCIDevice *d,
>>>> +                                   void *opaque)
>>>> +{
>>>> +    device_listener_realize(DEVICE(d));
>>>> +}
>>>> +
>>>> +static void pci_bridge_unrealize_sub(PCIBus *b, PCIDevice *d,
>>>> +                                     void *opaque)
>>>> +{
>>>> +    device_listener_unrealize(DEVICE(d));
>>>> +}
>>>> +
>>>>   /* default write_config function for PCI-to-PCI bridge */
>>>>   void pci_bridge_write_config(PCIDevice *d,
>>>>                                uint32_t address, uint32_t val, int len)
>>>> @@ -248,6 +260,8 @@ void pci_bridge_write_config(PCIDevice *d,
>>>>       PCIBridge *s = PCI_BRIDGE(d);
>>>>       uint16_t oldctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
>>>>       uint16_t newctl;
>>>> +    uint8_t oldbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
>>>> +    uint8_t newbus;
>>>>   
>>>>       pci_default_write_config(d, address, val, len);
>>>>   
>>>> @@ -265,6 +279,22 @@ void pci_bridge_write_config(PCIDevice *d,
>>>>           pci_bridge_update_mappings(s);
>>>>       }
>>>>   
>>>> +    newbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
>>>> +    if (newbus != oldbus) {
>>>> +        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
>>>> +
>>>> +        if (oldbus && oldbus != 255) {
>>>> +            pci_set_byte(d->config + PCI_SECONDARY_BUS, oldbus);
>>>> +            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>>>> +                                pci_bridge_unrealize_sub, NULL);
>>>> +            pci_set_byte(d->config + PCI_SECONDARY_BUS, newbus);
>>>> +        }
>>>> +        if (newbus && newbus != 255) {
>>>> +            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>>>> +                                pci_bridge_realize_sub, NULL);
>>>> +        }
>>>> +    }
>>>> +
>>>
>>> This is relying on undocumented assumptions and how specific firmware
>>> works. There's nothing special about bus number 255, and 0 is not very
>>> special either (except it happens to be the reset value).
>>>
>> Ok, using the above it would change to (almost):
>>
>>
>> if (newbus != oldbus) {
>>      pci_for_each_device(pci_bridge_get_sec_bus(s),
>>                          pci_bus_num(sec_bus),
>>                          device_listener_change_pci_bus_num,
>>                          oldbus);
>> }
> Not really because it's not just secondary bus number.
> Changing subordinate bus numbers can hide/unhide whole buses.
>

You are right.  I have no idea what Paul Durrant was thinking about this 
case.
And this would apply to PCI_SUBORDINATE_BUS not PCI_SECONDARY_BUS.

Since at QEMU 2.2 Xen sends all pci-config requests to QEMU, things 
"worked".  It is not clear to me that the complexity of tracking bus 
visibility make sense.  Clearly you do.


>
>> Would it be better to have:
>>
>> void device_listener_change_pci_bus_num(PCIBus *b, PCIDevice *d, void
>> *opaque)
>> {
>>      uint8_t oldbus = (uint8_t)opaque;
>>      DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
>> }
>>
>> So that the above works, or to add a function to convert args?
>>
>>> To know whether device is accessible to config cycles, you
>>> really need to scan the hierarchy from root downwards.
>>>
>> Yes, that is correct.  However what I am doing here is not
>> changing how QEMU checks if the device is accessible, but
>> changing what pci config Xen sends to QEMU.  If the change
>> to PCI_SECONDARY_BUS hides this PCI to PCI bridge, that is
>> not an issue.
>>
>>
>>     -Don Slutz
>>
>>
> Imagine a bridge with secondary bus number 5
> behind another one with subordinate numbers 1-3.
> You should not send conf cycles for bus number 5 to qemu.
>

That is correct.  How ever unless Paul Durrant has an example of more 
then 1 "QEMU" where this would make a difference, the cases I am aware 
of are:

1) Xen does not send it, and returns 0xffffffff (or smaller).

2) QEMU returns 0xffffffff (or smaller).

I will grant that #1 is faster, but it also is only happening during 
start up and so I do not see the clear win to add more complex code to 
only do #1.

    -Don Slutz
Michael S. Tsirkin May 28, 2015, 9:03 p.m. UTC | #5
On Thu, May 28, 2015 at 03:09:48PM -0400, Don Slutz wrote:
> On 05/28/15 08:28, Michael S. Tsirkin wrote:
> >On Thu, May 28, 2015 at 07:25:50AM -0400, Don Slutz wrote:
> >>On 05/28/15 05:30, Michael S. Tsirkin wrote:
> >>>On Thu, May 28, 2015 at 04:46:37AM -0400, Don Slutz wrote:
> >>>>The commit 707ff80021ccd7a68f4b3d2c44eebf87efbb41c4 assumed that a
> >>>>PCI device has a static address.  This is not true for PCI devices
> >>>>that are on the secondary bus of a PCI to PCI bridge.
> >>>>
> >>>>BIOS and/or guest OS can change the secondary bus number to a new
> >>>>value at any time.
> >>>>
> >>>>When a PCI to PCI bridge bridge is reset, the secondary bus number
> >>>>is set to zero.  Normally the BIOS will set it to 255 during PCI bus
> >>>>scanning so that only the PCI devices on the root can be accessed
> >>>>via bus 0.  Later it is set to a number between 1 and 254.
> >>>>
> >>>>Adjust xen_map_pcidev() to not register with Xen for secondary bus
> >>>>numbers 0 and 255.
> >>>>
> >>>>Extend the device listener interface to be called when ever the
> >>>>secondary bus number is set to a usable value.  This includes
> >>>>a call on unrealize if the secondary bus number was valid.
> >>>>
> >>>>Signed-off-by: Don Slutz <dslutz@verizon.com>
> >>>>---
> >>>>
> >>>>Note: Right now hvmloader in Xen does not handle PCI to PCI bridges
> >>>>and so SeaBIOS does not have access to PCI device(s) on secondary
> >>>>buses.  How ever domU can setup the secondary bus(es) and this patch
> >>>>will restore access to these PCI devices.
> >>>>
> >>>>  hw/core/qdev.c              | 10 ++++++++++
> >>>>  hw/pci/pci_bridge.c         | 30 ++++++++++++++++++++++++++++++
> >>>>  include/hw/qdev-core.h      |  2 ++
> >>>>  include/hw/xen/xen_common.h | 31 +++++++++++++++++++++++++------
> >>>>  trace-events                |  1 +
> >>>>  5 files changed, 68 insertions(+), 6 deletions(-)
> >>>>
> >>>>diff --git a/hw/core/qdev.c b/hw/core/qdev.c
> >>>>index b0f0f84..6a514ee 100644
> >>>>--- a/hw/core/qdev.c
> >>>>+++ b/hw/core/qdev.c
> >>>>@@ -239,6 +239,16 @@ void device_listener_unregister(DeviceListener *listener)
> >>>>      QTAILQ_REMOVE(&device_listeners, listener, link);
> >>>>  }
> >>>>+void device_listener_realize(DeviceState *dev)
> >>>>+{
> >>>>+    DEVICE_LISTENER_CALL(realize, Forward, dev);
> >>>>+}
> >>>>+
> >>>>+void device_listener_unrealize(DeviceState *dev)
> >>>>+{
> >>>>+    DEVICE_LISTENER_CALL(unrealize, Forward, dev);
> >>>>+}
> >>>>+
> >>>>  static void device_realize(DeviceState *dev, Error **errp)
> >>>>  {
> >>>>      DeviceClass *dc = DEVICE_GET_CLASS(dev);
> >>>
> >>>This looks wrong.  Devices not accessible to config cycles are still
> >>>accessible e.g. to memory or IO.  It's not the same as unrealize.
> >>>
> >>>You need some other API that makes sense,
> >>>probably pci specific.
> >>>
> >>If I am understanding you correctly, you would like:
> >>
> >>void device_listener_change_pci_bus_num(PCIDevice *d, uint8_t oldbus)
> >>{
> >>     DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
> >>}
> >>
> >I'm not sure what oldbus is but basically ok.
> 
> oldbus is the previous value that pci_bus_num(pci_dev->bus) would have
> returned.  Passing it avoids the:
> 
> +            pci_set_byte(d->config + PCI_SECONDARY_BUS, oldbus);
> +            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> +                                pci_bridge_unrealize_sub, NULL);
> +            pci_set_byte(d->config + PCI_SECONDARY_BUS, newbus);
> 
> hack.  At least xen wants to know the old value so that an "unrealize" (i.e. unmap in xen terms) can be done for the old value and then pci_bus_num(pci_dev->bus) can be done to get the new mapping.
> 
> 
> 
> >And it must be invoked whenever bus visibility changes,
> >not just its number.
> 
> This is not clear to me.  Maybe Paul Durrant has a better understanding.
> 
>  I look at this patch as a bug fix in that QEMU 2.2 and before "work" with
> pci-bridge.  It is only after Paul's change that it stops working.  Maybe
> part of what is not clear is that the new routine is called for the PCI
> devices on the secondary bus.
> 
> So at start using the example QEMU config (a Xen one):
> 
> /usr/lib/xen/bin/qemu-system-i386 \
>  -xen-domid \
>  13 \
>  -chardev \
>  socket,id=libxl-cmd,path=/var/run/xen/qmp-libxl-13,server,nowait \
>  -no-shutdown \
>  -mon \
>  chardev=libxl-cmd,mode=control \
>  -chardev \
>  socket,id=libxenstat-cmd,path=/var/run/xen/qmp-libxenstat-13,server,nowait \
>  -mon \
>  chardev=libxenstat-cmd,mode=control \
>  -nodefaults \
>  -name \
>  C63-min-tools \
>  -vnc \
>  127.0.0.1:0,to=99 \
>  -display \
>  none \
>  -serial \
>  pty \
>  -device \
>  cirrus-vga,vgamem_mb=8 \
>  -boot \
>  order=cda \
>  -device \
>  vmxnet3,id=nic0,netdev=net0,mac=00:0c:29:86:44:a0 \
>  -netdev \
>  type=tap,id=net0,ifname=vif13.0-emu,script=no,downscript=no \
>  -device \
>  e1000,id=nic1,netdev=net1,mac=00:0c:29:86:44:aa \
>  -netdev \
>  type=tap,id=net1,ifname=vif13.1-emu,script=no,downscript=no \
>  -machine \
>  xenfv \
>  -monitor \
>  pty \
>  -boot \
>  menu=on \
>  -device \
>  pci-bridge,chassis_nr=1,msi=on,id=pciBridge1.0,addr=0x11.0 \
>  -device \
>  pci-bridge,chassis_nr=2,msi=on,id=pciBridge5.0,multifunction=on,addr=0x15.0 \
>  -device \
>  pci-bridge,chassis_nr=3,msi=on,id=pciBridge6.0,multifunction=on,addr=0x16.0 \
>  -device \
>  pci-bridge,chassis_nr=4,msi=on,id=pciBridge7.0,multifunction=on,bus=pciBridge1.0,addr=0x17.0 \
>  -device \
>  pci-bridge,chassis_nr=5,msi=on,id=pciBridge8.0,multifunction=on,addr=0x18.0 \
>  -device \
>  pvscsi,id=scsi0,bus=pciBridge5.0,addr=0x1.0x0 \
>  -drive \
>  if=none,id=disk0-0,file=/dev/etherd/e500.1 \
>  -device \
>  scsi-disk,vendor=VMware,ver=1.0,product=Virtual \
>  disk,bus=scsi0.0,scsi-id=0,drive=disk0-0 \
>  -device \
>  pvscsi,id=sas1,bus=pciBridge7.0,addr=0x1.0x0 \
>  -drive \
>  if=none,id=disk1-1,file=/dev/etherd/e500.2 \
>  -device \
>  scsi-disk,vendor=VMware,ver=1.0,product=Virtual \
>  disk,bus=sas1.0,scsi-id=1,drive=disk1-1 \
>  -device \
>  pvscsi,id=scsi2,bus=pciBridge1.0,addr=0x3.0x0 \
>  -drive \
>  if=none,id=disk2-2,file=/dev/etherd/e500.3 \
>  -device \
>  scsi-disk,vendor=VMware,ver=1.0,product=Virtual \
>  disk,bus=scsi2.0,scsi-id=2,drive=disk2-2 \
>  -device \
>  e1000,id=nic2,netdev=net2,mac=00:0c:29:86:44:b4,bus=pciBridge5.0,addr=0x3.0x0 \
>  -netdev \
>  type=tap,id=net2,ifname=vif.2-emu,script=/etc/qemu-ifup,downscript=no \
>  -device \
>  vmxnet3,id=nic3,netdev=net3,mac=00:0c:29:86:44:be,bus=pciBridge5.0,addr=0x4.0x0 \
>  -netdev \
>  type=tap,id=net3,ifname=vif.3-emu,script=/etc/qemu-ifup,downscript=no \
>  -m 1016
> 
> 
> Which under Linux (with this patch)looks like:
> 
> 
> [root@C63-min-tools-b ~]# lspci
> 00:00.0 Host bridge: Intel Corporation 440FX - 82441FX PMC [Natoma] (rev 02)
> 00:01.0 ISA bridge: Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II]
> 00:01.1 IDE interface: Intel Corporation 82371SB PIIX3 IDE [Natoma/Triton
> II]
> 00:01.3 Bridge: Intel Corporation 82371AB/EB/MB PIIX4 ACPI (rev 03)
> 00:02.0 Unassigned class [ff80]: XenSource, Inc. Xen Platform Device (rev
> 01)
> 00:03.0 VGA compatible controller: Cirrus Logic GD 5446
> 00:11.0 PCI bridge: Red Hat, Inc. Device 0001
> 00:15.0 PCI bridge: Red Hat, Inc. Device 0001
> 00:16.0 PCI bridge: Red Hat, Inc. Device 0001
> 00:18.0 PCI bridge: Red Hat, Inc. Device 0001
> 01:03.0 SCSI storage controller: VMware PVSCSI SCSI Controller
> 01:17.0 PCI bridge: Red Hat, Inc. Device 0001
> 02:01.0 SCSI storage controller: VMware PVSCSI SCSI Controller
> 03:01.0 SCSI storage controller: VMware PVSCSI SCSI Controller
> 03:03.0 Ethernet controller: Intel Corporation 82540EM Gigabit Ethernet
> Controller (rev 03)
> 03:04.0 Ethernet controller: VMware VMXNET3 Ethernet Controller (rev 01)
> [root@C63-min-tools-b ~]# lspci -t
> -[0000:00]-+-00.0
>            +-01.0
>            +-01.1
>            +-01.3
>            +-02.0
>            +-03.0
>            +-11.0-[01-02]--+-03.0
>            |               \-17.0-[02]----01.0
>            +-15.0-[03]--+-01.0
>            |            +-03.0
>            |            \-04.0
>            +-16.0-[04]--
>            \-18.0-[05]--
> [root@C63-min-tools-b ~]#
> 
> 
> The issue that I am attempting to fix is that xen is told (without this
> patch):
> 
> 
> xen_map_pcidev id: 1 bdf: 00.00.00
> xen_map_pcidev id: 1 bdf: 00.01.00
> xen_map_pcidev id: 1 bdf: 00.01.01
> xen_map_pcidev id: 1 bdf: 00.01.03
> xen_map_pcidev id: 1 bdf: 00.02.00
> xen_map_pcidev id: 1 bdf: 00.03.00
> xen_map_pcidev id: 1 bdf: 00.04.00
> xen_map_pcidev id: 1 bdf: 00.05.00
> xen_map_pcidev id: 1 bdf: 00.11.00
> xen_map_pcidev id: 1 bdf: 00.15.00
> xen_map_pcidev id: 1 bdf: 00.16.00
> xen_map_pcidev id: 1 bdf: 00.17.00
> xen_map_pcidev id: 1 bdf: 00.18.00
> xen_map_pcidev id: 1 bdf: 00.01.00
> xen_map_pcidev id: 1 bdf: 00.01.00
> xen_map_pcidev id: 1 bdf: 00.03.00
> xen_map_pcidev id: 1 bdf: 00.03.00
> xen_map_pcidev id: 1 bdf: 00.04.00
> 
> 
> Now 00.17.00, 00.01.00, 00.01.00,  00.03.00, 00.03.00, and 00.04.00 are just
> wrong.
> 
> After the patch you get (when PCI_SECONDARY_BUS of 00.11.0 is set to 1):
> 
> 
> xen_map_pcidev id: 1 bdf: 01.03.00
> xen_map_pcidev id: 1 bdf: 01.17.00
> 
> 
> and then (when PCI_SECONDARY_BUS of 01.17.0 is set to 2):
> 
> 
> xen_map_pcidev id: 1 bdf: 02.01.00
> 
> 
> and then (when PCI_SECONDARY_BUS of 00.15.0 is set to 3):
> 
> 
> xen_map_pcidev id: 1 bdf: 03.01.00
> xen_map_pcidev id: 1 bdf: 03.03.00
> xen_map_pcidev id: 1 bdf: 03.04.00
> 
> 
> Which fixes the bug I ran into.  Did you want me to speed the time to open a
> QEMU bug?
> 
> 
> 
> >>>
> >>>>diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> >>>>index 40c97b1..042680d 100644
> >>>>--- a/hw/pci/pci_bridge.c
> >>>>+++ b/hw/pci/pci_bridge.c
> >>>>@@ -241,6 +241,18 @@ void pci_bridge_update_mappings(PCIBridge *br)
> >>>>      pci_bridge_region_cleanup(br, w);
> >>>>  }
> >>>>+static void pci_bridge_realize_sub(PCIBus *b, PCIDevice *d,
> >>>>+                                   void *opaque)
> >>>>+{
> >>>>+    device_listener_realize(DEVICE(d));
> >>>>+}
> >>>>+
> >>>>+static void pci_bridge_unrealize_sub(PCIBus *b, PCIDevice *d,
> >>>>+                                     void *opaque)
> >>>>+{
> >>>>+    device_listener_unrealize(DEVICE(d));
> >>>>+}
> >>>>+
> >>>>  /* default write_config function for PCI-to-PCI bridge */
> >>>>  void pci_bridge_write_config(PCIDevice *d,
> >>>>                               uint32_t address, uint32_t val, int len)
> >>>>@@ -248,6 +260,8 @@ void pci_bridge_write_config(PCIDevice *d,
> >>>>      PCIBridge *s = PCI_BRIDGE(d);
> >>>>      uint16_t oldctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
> >>>>      uint16_t newctl;
> >>>>+    uint8_t oldbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
> >>>>+    uint8_t newbus;
> >>>>      pci_default_write_config(d, address, val, len);
> >>>>@@ -265,6 +279,22 @@ void pci_bridge_write_config(PCIDevice *d,
> >>>>          pci_bridge_update_mappings(s);
> >>>>      }
> >>>>+    newbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
> >>>>+    if (newbus != oldbus) {
> >>>>+        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
> >>>>+
> >>>>+        if (oldbus && oldbus != 255) {
> >>>>+            pci_set_byte(d->config + PCI_SECONDARY_BUS, oldbus);
> >>>>+            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> >>>>+                                pci_bridge_unrealize_sub, NULL);
> >>>>+            pci_set_byte(d->config + PCI_SECONDARY_BUS, newbus);
> >>>>+        }
> >>>>+        if (newbus && newbus != 255) {
> >>>>+            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> >>>>+                                pci_bridge_realize_sub, NULL);
> >>>>+        }
> >>>>+    }
> >>>>+
> >>>
> >>>This is relying on undocumented assumptions and how specific firmware
> >>>works. There's nothing special about bus number 255, and 0 is not very
> >>>special either (except it happens to be the reset value).
> >>>
> >>Ok, using the above it would change to (almost):
> >>
> >>
> >>if (newbus != oldbus) {
> >>     pci_for_each_device(pci_bridge_get_sec_bus(s),
> >>                         pci_bus_num(sec_bus),
> >>                         device_listener_change_pci_bus_num,
> >>                         oldbus);
> >>}
> >Not really because it's not just secondary bus number.
> >Changing subordinate bus numbers can hide/unhide whole buses.
> >
> 
> You are right.  I have no idea what Paul Durrant was thinking about this
> case.
> And this would apply to PCI_SUBORDINATE_BUS not PCI_SECONDARY_BUS.
> 
> Since at QEMU 2.2 Xen sends all pci-config requests to QEMU, things
> "worked".

Why "worked"?


> It is not clear to me that the complexity of tracking bus
> visibility make sense.  Clearly you do.

Well what was the point of the change?
If you don't care that we get irrelevant config cycles why not
just send them all to QEMU?



> 
> >
> >>Would it be better to have:
> >>
> >>void device_listener_change_pci_bus_num(PCIBus *b, PCIDevice *d, void
> >>*opaque)
> >>{
> >>     uint8_t oldbus = (uint8_t)opaque;
> >>     DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
> >>}
> >>
> >>So that the above works, or to add a function to convert args?
> >>
> >>>To know whether device is accessible to config cycles, you
> >>>really need to scan the hierarchy from root downwards.
> >>>
> >>Yes, that is correct.  However what I am doing here is not
> >>changing how QEMU checks if the device is accessible, but
> >>changing what pci config Xen sends to QEMU.  If the change
> >>to PCI_SECONDARY_BUS hides this PCI to PCI bridge, that is
> >>not an issue.
> >>
> >>
> >>    -Don Slutz
> >>
> >>
> >Imagine a bridge with secondary bus number 5
> >behind another one with subordinate numbers 1-3.
> >You should not send conf cycles for bus number 5 to qemu.
> >
> 
> That is correct.  How ever unless Paul Durrant has an example of more then 1
> "QEMU" where this would make a difference, the cases I am aware of are:
> 
> 1) Xen does not send it, and returns 0xffffffff (or smaller).
> 
> 2) QEMU returns 0xffffffff (or smaller).
> 
> I will grant that #1 is faster, but it also is only happening during start
> up and so I do not see the clear win to add more complex code to only do #1.
> 
>    -Don Slutz

It's not about faster. I assumed you need to get just the correct
stuff out to QEMU to gain the better security as
3996e85c1822e05c50250f8d2d1e57b6bea1229d claims.
Michael S. Tsirkin May 28, 2015, 9:05 p.m. UTC | #6
On Thu, May 28, 2015 at 11:03:07PM +0200, Michael S. Tsirkin wrote:
> On Thu, May 28, 2015 at 03:09:48PM -0400, Don Slutz wrote:
> > On 05/28/15 08:28, Michael S. Tsirkin wrote:
> > >On Thu, May 28, 2015 at 07:25:50AM -0400, Don Slutz wrote:
> > >>On 05/28/15 05:30, Michael S. Tsirkin wrote:
> > >>>On Thu, May 28, 2015 at 04:46:37AM -0400, Don Slutz wrote:
> > >>>>The commit 707ff80021ccd7a68f4b3d2c44eebf87efbb41c4 assumed that a
> > >>>>PCI device has a static address.  This is not true for PCI devices
> > >>>>that are on the secondary bus of a PCI to PCI bridge.
> > >>>>
> > >>>>BIOS and/or guest OS can change the secondary bus number to a new
> > >>>>value at any time.
> > >>>>
> > >>>>When a PCI to PCI bridge bridge is reset, the secondary bus number
> > >>>>is set to zero.  Normally the BIOS will set it to 255 during PCI bus
> > >>>>scanning so that only the PCI devices on the root can be accessed
> > >>>>via bus 0.  Later it is set to a number between 1 and 254.
> > >>>>
> > >>>>Adjust xen_map_pcidev() to not register with Xen for secondary bus
> > >>>>numbers 0 and 255.
> > >>>>
> > >>>>Extend the device listener interface to be called when ever the
> > >>>>secondary bus number is set to a usable value.  This includes
> > >>>>a call on unrealize if the secondary bus number was valid.
> > >>>>
> > >>>>Signed-off-by: Don Slutz <dslutz@verizon.com>
> > >>>>---
> > >>>>
> > >>>>Note: Right now hvmloader in Xen does not handle PCI to PCI bridges
> > >>>>and so SeaBIOS does not have access to PCI device(s) on secondary
> > >>>>buses.  How ever domU can setup the secondary bus(es) and this patch
> > >>>>will restore access to these PCI devices.
> > >>>>
> > >>>>  hw/core/qdev.c              | 10 ++++++++++
> > >>>>  hw/pci/pci_bridge.c         | 30 ++++++++++++++++++++++++++++++
> > >>>>  include/hw/qdev-core.h      |  2 ++
> > >>>>  include/hw/xen/xen_common.h | 31 +++++++++++++++++++++++++------
> > >>>>  trace-events                |  1 +
> > >>>>  5 files changed, 68 insertions(+), 6 deletions(-)
> > >>>>
> > >>>>diff --git a/hw/core/qdev.c b/hw/core/qdev.c
> > >>>>index b0f0f84..6a514ee 100644
> > >>>>--- a/hw/core/qdev.c
> > >>>>+++ b/hw/core/qdev.c
> > >>>>@@ -239,6 +239,16 @@ void device_listener_unregister(DeviceListener *listener)
> > >>>>      QTAILQ_REMOVE(&device_listeners, listener, link);
> > >>>>  }
> > >>>>+void device_listener_realize(DeviceState *dev)
> > >>>>+{
> > >>>>+    DEVICE_LISTENER_CALL(realize, Forward, dev);
> > >>>>+}
> > >>>>+
> > >>>>+void device_listener_unrealize(DeviceState *dev)
> > >>>>+{
> > >>>>+    DEVICE_LISTENER_CALL(unrealize, Forward, dev);
> > >>>>+}
> > >>>>+
> > >>>>  static void device_realize(DeviceState *dev, Error **errp)
> > >>>>  {
> > >>>>      DeviceClass *dc = DEVICE_GET_CLASS(dev);
> > >>>
> > >>>This looks wrong.  Devices not accessible to config cycles are still
> > >>>accessible e.g. to memory or IO.  It's not the same as unrealize.
> > >>>
> > >>>You need some other API that makes sense,
> > >>>probably pci specific.
> > >>>
> > >>If I am understanding you correctly, you would like:
> > >>
> > >>void device_listener_change_pci_bus_num(PCIDevice *d, uint8_t oldbus)
> > >>{
> > >>     DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
> > >>}
> > >>
> > >I'm not sure what oldbus is but basically ok.
> > 
> > oldbus is the previous value that pci_bus_num(pci_dev->bus) would have
> > returned.  Passing it avoids the:
> > 
> > +            pci_set_byte(d->config + PCI_SECONDARY_BUS, oldbus);
> > +            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> > +                                pci_bridge_unrealize_sub, NULL);
> > +            pci_set_byte(d->config + PCI_SECONDARY_BUS, newbus);
> > 
> > hack.  At least xen wants to know the old value so that an "unrealize" (i.e. unmap in xen terms) can be done for the old value and then pci_bus_num(pci_dev->bus) can be done to get the new mapping.
> > 
> > 
> > 
> > >And it must be invoked whenever bus visibility changes,
> > >not just its number.
> > 
> > This is not clear to me.  Maybe Paul Durrant has a better understanding.
> > 
> >  I look at this patch as a bug fix in that QEMU 2.2 and before "work" with
> > pci-bridge.  It is only after Paul's change that it stops working.  Maybe
> > part of what is not clear is that the new routine is called for the PCI
> > devices on the secondary bus.
> > 
> > So at start using the example QEMU config (a Xen one):
> > 
> > /usr/lib/xen/bin/qemu-system-i386 \
> >  -xen-domid \
> >  13 \
> >  -chardev \
> >  socket,id=libxl-cmd,path=/var/run/xen/qmp-libxl-13,server,nowait \
> >  -no-shutdown \
> >  -mon \
> >  chardev=libxl-cmd,mode=control \
> >  -chardev \
> >  socket,id=libxenstat-cmd,path=/var/run/xen/qmp-libxenstat-13,server,nowait \
> >  -mon \
> >  chardev=libxenstat-cmd,mode=control \
> >  -nodefaults \
> >  -name \
> >  C63-min-tools \
> >  -vnc \
> >  127.0.0.1:0,to=99 \
> >  -display \
> >  none \
> >  -serial \
> >  pty \
> >  -device \
> >  cirrus-vga,vgamem_mb=8 \
> >  -boot \
> >  order=cda \
> >  -device \
> >  vmxnet3,id=nic0,netdev=net0,mac=00:0c:29:86:44:a0 \
> >  -netdev \
> >  type=tap,id=net0,ifname=vif13.0-emu,script=no,downscript=no \
> >  -device \
> >  e1000,id=nic1,netdev=net1,mac=00:0c:29:86:44:aa \
> >  -netdev \
> >  type=tap,id=net1,ifname=vif13.1-emu,script=no,downscript=no \
> >  -machine \
> >  xenfv \
> >  -monitor \
> >  pty \
> >  -boot \
> >  menu=on \
> >  -device \
> >  pci-bridge,chassis_nr=1,msi=on,id=pciBridge1.0,addr=0x11.0 \
> >  -device \
> >  pci-bridge,chassis_nr=2,msi=on,id=pciBridge5.0,multifunction=on,addr=0x15.0 \
> >  -device \
> >  pci-bridge,chassis_nr=3,msi=on,id=pciBridge6.0,multifunction=on,addr=0x16.0 \
> >  -device \
> >  pci-bridge,chassis_nr=4,msi=on,id=pciBridge7.0,multifunction=on,bus=pciBridge1.0,addr=0x17.0 \
> >  -device \
> >  pci-bridge,chassis_nr=5,msi=on,id=pciBridge8.0,multifunction=on,addr=0x18.0 \
> >  -device \
> >  pvscsi,id=scsi0,bus=pciBridge5.0,addr=0x1.0x0 \
> >  -drive \
> >  if=none,id=disk0-0,file=/dev/etherd/e500.1 \
> >  -device \
> >  scsi-disk,vendor=VMware,ver=1.0,product=Virtual \
> >  disk,bus=scsi0.0,scsi-id=0,drive=disk0-0 \
> >  -device \
> >  pvscsi,id=sas1,bus=pciBridge7.0,addr=0x1.0x0 \
> >  -drive \
> >  if=none,id=disk1-1,file=/dev/etherd/e500.2 \
> >  -device \
> >  scsi-disk,vendor=VMware,ver=1.0,product=Virtual \
> >  disk,bus=sas1.0,scsi-id=1,drive=disk1-1 \
> >  -device \
> >  pvscsi,id=scsi2,bus=pciBridge1.0,addr=0x3.0x0 \
> >  -drive \
> >  if=none,id=disk2-2,file=/dev/etherd/e500.3 \
> >  -device \
> >  scsi-disk,vendor=VMware,ver=1.0,product=Virtual \
> >  disk,bus=scsi2.0,scsi-id=2,drive=disk2-2 \
> >  -device \
> >  e1000,id=nic2,netdev=net2,mac=00:0c:29:86:44:b4,bus=pciBridge5.0,addr=0x3.0x0 \
> >  -netdev \
> >  type=tap,id=net2,ifname=vif.2-emu,script=/etc/qemu-ifup,downscript=no \
> >  -device \
> >  vmxnet3,id=nic3,netdev=net3,mac=00:0c:29:86:44:be,bus=pciBridge5.0,addr=0x4.0x0 \
> >  -netdev \
> >  type=tap,id=net3,ifname=vif.3-emu,script=/etc/qemu-ifup,downscript=no \
> >  -m 1016
> > 
> > 
> > Which under Linux (with this patch)looks like:
> > 
> > 
> > [root@C63-min-tools-b ~]# lspci
> > 00:00.0 Host bridge: Intel Corporation 440FX - 82441FX PMC [Natoma] (rev 02)
> > 00:01.0 ISA bridge: Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II]
> > 00:01.1 IDE interface: Intel Corporation 82371SB PIIX3 IDE [Natoma/Triton
> > II]
> > 00:01.3 Bridge: Intel Corporation 82371AB/EB/MB PIIX4 ACPI (rev 03)
> > 00:02.0 Unassigned class [ff80]: XenSource, Inc. Xen Platform Device (rev
> > 01)
> > 00:03.0 VGA compatible controller: Cirrus Logic GD 5446
> > 00:11.0 PCI bridge: Red Hat, Inc. Device 0001
> > 00:15.0 PCI bridge: Red Hat, Inc. Device 0001
> > 00:16.0 PCI bridge: Red Hat, Inc. Device 0001
> > 00:18.0 PCI bridge: Red Hat, Inc. Device 0001
> > 01:03.0 SCSI storage controller: VMware PVSCSI SCSI Controller
> > 01:17.0 PCI bridge: Red Hat, Inc. Device 0001
> > 02:01.0 SCSI storage controller: VMware PVSCSI SCSI Controller
> > 03:01.0 SCSI storage controller: VMware PVSCSI SCSI Controller
> > 03:03.0 Ethernet controller: Intel Corporation 82540EM Gigabit Ethernet
> > Controller (rev 03)
> > 03:04.0 Ethernet controller: VMware VMXNET3 Ethernet Controller (rev 01)
> > [root@C63-min-tools-b ~]# lspci -t
> > -[0000:00]-+-00.0
> >            +-01.0
> >            +-01.1
> >            +-01.3
> >            +-02.0
> >            +-03.0
> >            +-11.0-[01-02]--+-03.0
> >            |               \-17.0-[02]----01.0
> >            +-15.0-[03]--+-01.0
> >            |            +-03.0
> >            |            \-04.0
> >            +-16.0-[04]--
> >            \-18.0-[05]--
> > [root@C63-min-tools-b ~]#
> > 
> > 
> > The issue that I am attempting to fix is that xen is told (without this
> > patch):
> > 
> > 
> > xen_map_pcidev id: 1 bdf: 00.00.00
> > xen_map_pcidev id: 1 bdf: 00.01.00
> > xen_map_pcidev id: 1 bdf: 00.01.01
> > xen_map_pcidev id: 1 bdf: 00.01.03
> > xen_map_pcidev id: 1 bdf: 00.02.00
> > xen_map_pcidev id: 1 bdf: 00.03.00
> > xen_map_pcidev id: 1 bdf: 00.04.00
> > xen_map_pcidev id: 1 bdf: 00.05.00
> > xen_map_pcidev id: 1 bdf: 00.11.00
> > xen_map_pcidev id: 1 bdf: 00.15.00
> > xen_map_pcidev id: 1 bdf: 00.16.00
> > xen_map_pcidev id: 1 bdf: 00.17.00
> > xen_map_pcidev id: 1 bdf: 00.18.00
> > xen_map_pcidev id: 1 bdf: 00.01.00
> > xen_map_pcidev id: 1 bdf: 00.01.00
> > xen_map_pcidev id: 1 bdf: 00.03.00
> > xen_map_pcidev id: 1 bdf: 00.03.00
> > xen_map_pcidev id: 1 bdf: 00.04.00
> > 
> > 
> > Now 00.17.00, 00.01.00, 00.01.00,  00.03.00, 00.03.00, and 00.04.00 are just
> > wrong.
> > 
> > After the patch you get (when PCI_SECONDARY_BUS of 00.11.0 is set to 1):
> > 
> > 
> > xen_map_pcidev id: 1 bdf: 01.03.00
> > xen_map_pcidev id: 1 bdf: 01.17.00
> > 
> > 
> > and then (when PCI_SECONDARY_BUS of 01.17.0 is set to 2):
> > 
> > 
> > xen_map_pcidev id: 1 bdf: 02.01.00
> > 
> > 
> > and then (when PCI_SECONDARY_BUS of 00.15.0 is set to 3):
> > 
> > 
> > xen_map_pcidev id: 1 bdf: 03.01.00
> > xen_map_pcidev id: 1 bdf: 03.03.00
> > xen_map_pcidev id: 1 bdf: 03.04.00
> > 
> > 
> > Which fixes the bug I ran into.  Did you want me to speed the time to open a
> > QEMU bug?
> > 
> > 
> > 
> > >>>
> > >>>>diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
> > >>>>index 40c97b1..042680d 100644
> > >>>>--- a/hw/pci/pci_bridge.c
> > >>>>+++ b/hw/pci/pci_bridge.c
> > >>>>@@ -241,6 +241,18 @@ void pci_bridge_update_mappings(PCIBridge *br)
> > >>>>      pci_bridge_region_cleanup(br, w);
> > >>>>  }
> > >>>>+static void pci_bridge_realize_sub(PCIBus *b, PCIDevice *d,
> > >>>>+                                   void *opaque)
> > >>>>+{
> > >>>>+    device_listener_realize(DEVICE(d));
> > >>>>+}
> > >>>>+
> > >>>>+static void pci_bridge_unrealize_sub(PCIBus *b, PCIDevice *d,
> > >>>>+                                     void *opaque)
> > >>>>+{
> > >>>>+    device_listener_unrealize(DEVICE(d));
> > >>>>+}
> > >>>>+
> > >>>>  /* default write_config function for PCI-to-PCI bridge */
> > >>>>  void pci_bridge_write_config(PCIDevice *d,
> > >>>>                               uint32_t address, uint32_t val, int len)
> > >>>>@@ -248,6 +260,8 @@ void pci_bridge_write_config(PCIDevice *d,
> > >>>>      PCIBridge *s = PCI_BRIDGE(d);
> > >>>>      uint16_t oldctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
> > >>>>      uint16_t newctl;
> > >>>>+    uint8_t oldbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
> > >>>>+    uint8_t newbus;
> > >>>>      pci_default_write_config(d, address, val, len);
> > >>>>@@ -265,6 +279,22 @@ void pci_bridge_write_config(PCIDevice *d,
> > >>>>          pci_bridge_update_mappings(s);
> > >>>>      }
> > >>>>+    newbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
> > >>>>+    if (newbus != oldbus) {
> > >>>>+        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
> > >>>>+
> > >>>>+        if (oldbus && oldbus != 255) {
> > >>>>+            pci_set_byte(d->config + PCI_SECONDARY_BUS, oldbus);
> > >>>>+            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> > >>>>+                                pci_bridge_unrealize_sub, NULL);
> > >>>>+            pci_set_byte(d->config + PCI_SECONDARY_BUS, newbus);
> > >>>>+        }
> > >>>>+        if (newbus && newbus != 255) {
> > >>>>+            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
> > >>>>+                                pci_bridge_realize_sub, NULL);
> > >>>>+        }
> > >>>>+    }
> > >>>>+
> > >>>
> > >>>This is relying on undocumented assumptions and how specific firmware
> > >>>works. There's nothing special about bus number 255, and 0 is not very
> > >>>special either (except it happens to be the reset value).
> > >>>
> > >>Ok, using the above it would change to (almost):
> > >>
> > >>
> > >>if (newbus != oldbus) {
> > >>     pci_for_each_device(pci_bridge_get_sec_bus(s),
> > >>                         pci_bus_num(sec_bus),
> > >>                         device_listener_change_pci_bus_num,
> > >>                         oldbus);
> > >>}
> > >Not really because it's not just secondary bus number.
> > >Changing subordinate bus numbers can hide/unhide whole buses.
> > >
> > 
> > You are right.  I have no idea what Paul Durrant was thinking about this
> > case.
> > And this would apply to PCI_SUBORDINATE_BUS not PCI_SECONDARY_BUS.
> > 
> > Since at QEMU 2.2 Xen sends all pci-config requests to QEMU, things
> > "worked".
> 
> Why "worked"?
> 
> 
> > It is not clear to me that the complexity of tracking bus
> > visibility make sense.  Clearly you do.
> 
> Well what was the point of the change?
> If you don't care that we get irrelevant config cycles why not
> just send them all to QEMU?
> 
> 
> 
> > 
> > >
> > >>Would it be better to have:
> > >>
> > >>void device_listener_change_pci_bus_num(PCIBus *b, PCIDevice *d, void
> > >>*opaque)
> > >>{
> > >>     uint8_t oldbus = (uint8_t)opaque;
> > >>     DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
> > >>}
> > >>
> > >>So that the above works, or to add a function to convert args?
> > >>
> > >>>To know whether device is accessible to config cycles, you
> > >>>really need to scan the hierarchy from root downwards.
> > >>>
> > >>Yes, that is correct.  However what I am doing here is not
> > >>changing how QEMU checks if the device is accessible, but
> > >>changing what pci config Xen sends to QEMU.  If the change
> > >>to PCI_SECONDARY_BUS hides this PCI to PCI bridge, that is
> > >>not an issue.
> > >>
> > >>
> > >>    -Don Slutz
> > >>
> > >>
> > >Imagine a bridge with secondary bus number 5
> > >behind another one with subordinate numbers 1-3.
> > >You should not send conf cycles for bus number 5 to qemu.
> > >
> > 
> > That is correct.  How ever unless Paul Durrant has an example of more then 1
> > "QEMU" where this would make a difference, the cases I am aware of are:
> > 
> > 1) Xen does not send it, and returns 0xffffffff (or smaller).
> > 
> > 2) QEMU returns 0xffffffff (or smaller).
> > 
> > I will grant that #1 is faster, but it also is only happening during start
> > up and so I do not see the clear win to add more complex code to only do #1.
> > 
> >    -Don Slutz
> 
> It's not about faster. I assumed you need to get just the correct
> stuff out to QEMU to gain the better security as
> 3996e85c1822e05c50250f8d2d1e57b6bea1229d claims.

And if that's not the case, please educate me.

> -- 
> MST
Don Slutz May 29, 2015, 1:23 p.m. UTC | #7
On 05/28/15 17:05, Michael S. Tsirkin wrote:
> On Thu, May 28, 2015 at 11:03:07PM +0200, Michael S. Tsirkin wrote:
>> On Thu, May 28, 2015 at 03:09:48PM -0400, Don Slutz wrote:
>>> On 05/28/15 08:28, Michael S. Tsirkin wrote:
>>>> On Thu, May 28, 2015 at 07:25:50AM -0400, Don Slutz wrote:
>>>>> On 05/28/15 05:30, Michael S. Tsirkin wrote:
>>>>>> On Thu, May 28, 2015 at 04:46:37AM -0400, Don Slutz wrote:
>>>>>>> The commit 707ff80021ccd7a68f4b3d2c44eebf87efbb41c4 assumed that a
>>>>>>> PCI device has a static address.  This is not true for PCI devices
>>>>>>> that are on the secondary bus of a PCI to PCI bridge.
>>>>>>>

...

>>>> Not really because it's not just secondary bus number.
>>>> Changing subordinate bus numbers can hide/unhide whole buses.
>>>>
>>> You are right.  I have no idea what Paul Durrant was thinking about this
>>> case.
>>> And this would apply to PCI_SUBORDINATE_BUS not PCI_SECONDARY_BUS.
>>>
>>> Since at QEMU 2.2 Xen sends all pci-config requests to QEMU, things
>>> "worked".
>> Why "worked"?
>>

The issue is that PCI to PCI bridges were not configured by hvmloader
nor by SeaBIOS.  I do have a patch
to Xen that fixes this (I was rebasing on the latest Xen to post it and
my test case stopped working).  So PCI devices that exist on the
secondary side of a PCI to PCI bridge could only be used (by Linux with
at least pci=assign-busses) after booting.


>>> It is not clear to me that the complexity of tracking bus
>>> visibility make sense.  Clearly you do.
>> Well what was the point of the change?

To get config cycles that are valid that you do not get today.

>> If you don't care that we get irrelevant config cycles why not
>> just send them all to QEMU?
>>

That would need to be answered by Paul Durrant and/or other people (see
below)

>>
>>>>> Would it be better to have:
>>>>>
>>>>> void device_listener_change_pci_bus_num(PCIBus *b, PCIDevice *d, void
>>>>> *opaque)
>>>>> {
>>>>>     uint8_t oldbus = (uint8_t)opaque;
>>>>>     DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
>>>>> }
>>>>>
>>>>> So that the above works, or to add a function to convert args?
>>>>>
>>>>>> To know whether device is accessible to config cycles, you
>>>>>> really need to scan the hierarchy from root downwards.
>>>>>>
>>>>> Yes, that is correct.  However what I am doing here is not
>>>>> changing how QEMU checks if the device is accessible, but
>>>>> changing what pci config Xen sends to QEMU.  If the change
>>>>> to PCI_SECONDARY_BUS hides this PCI to PCI bridge, that is
>>>>> not an issue.
>>>>>
>>>>>
>>>>>    -Don Slutz
>>>>>
>>>>>
>>>> Imagine a bridge with secondary bus number 5
>>>> behind another one with subordinate numbers 1-3.
>>>> You should not send conf cycles for bus number 5 to qemu.
>>>>
>>> That is correct.  How ever unless Paul Durrant has an example of more then 1
>>> "QEMU" where this would make a difference, the cases I am aware of are:
>>>
>>> 1) Xen does not send it, and returns 0xffffffff (or smaller).
>>>
>>> 2) QEMU returns 0xffffffff (or smaller).
>>>
>>> I will grant that #1 is faster, but it also is only happening during start
>>> up and so I do not see the clear win to add more complex code to only do #1.
>>>
>>>    -Don Slutz
>> It's not about faster. I assumed you need to get just the correct
>> stuff out to QEMU to gain the better security as
>> 3996e85c1822e05c50250f8d2d1e57b6bea1229d claims.
> And if that's not the case, please educate me.

I do not know enough about this to answer here.  I have added xen-devel
list to this in the hope
that someone there can answer the "better security" question.

   -Don Slutz


>> -- 
>> MST
Paul Durrant May 29, 2015, 2:22 p.m. UTC | #8
> -----Original Message-----
> From: Don Slutz [mailto:don.slutz@gmail.com]
> Sent: 29 May 2015 14:23
> To: Michael S. Tsirkin; Paul Durrant
> Cc: Don Slutz; qemu-devel@nongnu.org; Stefano Stabellini; xen-devel
> Subject: Re: [Qemu-devel] [PATCH 1/1] Fix device listener interface for PCI to
> PCI bridges
> 
> On 05/28/15 17:05, Michael S. Tsirkin wrote:
> > On Thu, May 28, 2015 at 11:03:07PM +0200, Michael S. Tsirkin wrote:
> >> On Thu, May 28, 2015 at 03:09:48PM -0400, Don Slutz wrote:
> >>> On 05/28/15 08:28, Michael S. Tsirkin wrote:
> >>>> On Thu, May 28, 2015 at 07:25:50AM -0400, Don Slutz wrote:
> >>>>> On 05/28/15 05:30, Michael S. Tsirkin wrote:
> >>>>>> On Thu, May 28, 2015 at 04:46:37AM -0400, Don Slutz wrote:
> >>>>>>> The commit 707ff80021ccd7a68f4b3d2c44eebf87efbb41c4 assumed
> that a
> >>>>>>> PCI device has a static address.  This is not true for PCI devices
> >>>>>>> that are on the secondary bus of a PCI to PCI bridge.
> >>>>>>>
> 
> ...
> 
> >>>> Not really because it's not just secondary bus number.
> >>>> Changing subordinate bus numbers can hide/unhide whole buses.
> >>>>
> >>> You are right.  I have no idea what Paul Durrant was thinking about this
> >>> case.
> >>> And this would apply to PCI_SUBORDINATE_BUS not
> PCI_SECONDARY_BUS.
> >>>
> >>> Since at QEMU 2.2 Xen sends all pci-config requests to QEMU, things
> >>> "worked".
> >> Why "worked"?
> >>
> 
> The issue is that PCI to PCI bridges were not configured by hvmloader
> nor by SeaBIOS.  I do have a patch
> to Xen that fixes this (I was rebasing on the latest Xen to post it and
> my test case stopped working).  So PCI devices that exist on the
> secondary side of a PCI to PCI bridge could only be used (by Linux with
> at least pci=assign-busses) after booting.
> 
> 
> >>> It is not clear to me that the complexity of tracking bus
> >>> visibility make sense.  Clearly you do.
> >> Well what was the point of the change?
> 
> To get config cycles that are valid that you do not get today.
> 
> >> If you don't care that we get irrelevant config cycles why not
> >> just send them all to QEMU?
> >>
> 
> That would need to be answered by Paul Durrant and/or other people (see
> below)
> 

We could broadcast config space ioreqs to all emulators, the problem is how do we know (in the case of a read) which emulator is actually the one supplying the data? At some level Xen needs to know who is implementing what.

  Paul

> >>
> >>>>> Would it be better to have:
> >>>>>
> >>>>> void device_listener_change_pci_bus_num(PCIBus *b, PCIDevice *d,
> void
> >>>>> *opaque)
> >>>>> {
> >>>>>     uint8_t oldbus = (uint8_t)opaque;
> >>>>>     DEVICE_LISTENER_CALL(change_pci_bus_num, Forward, d, oldbus);
> >>>>> }
> >>>>>
> >>>>> So that the above works, or to add a function to convert args?
> >>>>>
> >>>>>> To know whether device is accessible to config cycles, you
> >>>>>> really need to scan the hierarchy from root downwards.
> >>>>>>
> >>>>> Yes, that is correct.  However what I am doing here is not
> >>>>> changing how QEMU checks if the device is accessible, but
> >>>>> changing what pci config Xen sends to QEMU.  If the change
> >>>>> to PCI_SECONDARY_BUS hides this PCI to PCI bridge, that is
> >>>>> not an issue.
> >>>>>
> >>>>>
> >>>>>    -Don Slutz
> >>>>>
> >>>>>
> >>>> Imagine a bridge with secondary bus number 5
> >>>> behind another one with subordinate numbers 1-3.
> >>>> You should not send conf cycles for bus number 5 to qemu.
> >>>>
> >>> That is correct.  How ever unless Paul Durrant has an example of more
> then 1
> >>> "QEMU" where this would make a difference, the cases I am aware of
> are:
> >>>
> >>> 1) Xen does not send it, and returns 0xffffffff (or smaller).
> >>>
> >>> 2) QEMU returns 0xffffffff (or smaller).
> >>>
> >>> I will grant that #1 is faster, but it also is only happening during start
> >>> up and so I do not see the clear win to add more complex code to only
> do #1.
> >>>
> >>>    -Don Slutz
> >> It's not about faster. I assumed you need to get just the correct
> >> stuff out to QEMU to gain the better security as
> >> 3996e85c1822e05c50250f8d2d1e57b6bea1229d claims.
> > And if that's not the case, please educate me.
> 
> I do not know enough about this to answer here.  I have added xen-devel
> list to this in the hope
> that someone there can answer the "better security" question.
> 
>    -Don Slutz
> 
> 
> >> --
> >> MST
diff mbox

Patch

diff --git a/hw/core/qdev.c b/hw/core/qdev.c
index b0f0f84..6a514ee 100644
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -239,6 +239,16 @@  void device_listener_unregister(DeviceListener *listener)
     QTAILQ_REMOVE(&device_listeners, listener, link);
 }
 
+void device_listener_realize(DeviceState *dev)
+{
+    DEVICE_LISTENER_CALL(realize, Forward, dev);
+}
+
+void device_listener_unrealize(DeviceState *dev)
+{
+    DEVICE_LISTENER_CALL(unrealize, Forward, dev);
+}
+
 static void device_realize(DeviceState *dev, Error **errp)
 {
     DeviceClass *dc = DEVICE_GET_CLASS(dev);
diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
index 40c97b1..042680d 100644
--- a/hw/pci/pci_bridge.c
+++ b/hw/pci/pci_bridge.c
@@ -241,6 +241,18 @@  void pci_bridge_update_mappings(PCIBridge *br)
     pci_bridge_region_cleanup(br, w);
 }
 
+static void pci_bridge_realize_sub(PCIBus *b, PCIDevice *d,
+                                   void *opaque)
+{
+    device_listener_realize(DEVICE(d));
+}
+
+static void pci_bridge_unrealize_sub(PCIBus *b, PCIDevice *d,
+                                     void *opaque)
+{
+    device_listener_unrealize(DEVICE(d));
+}
+
 /* default write_config function for PCI-to-PCI bridge */
 void pci_bridge_write_config(PCIDevice *d,
                              uint32_t address, uint32_t val, int len)
@@ -248,6 +260,8 @@  void pci_bridge_write_config(PCIDevice *d,
     PCIBridge *s = PCI_BRIDGE(d);
     uint16_t oldctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
     uint16_t newctl;
+    uint8_t oldbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
+    uint8_t newbus;
 
     pci_default_write_config(d, address, val, len);
 
@@ -265,6 +279,22 @@  void pci_bridge_write_config(PCIDevice *d,
         pci_bridge_update_mappings(s);
     }
 
+    newbus = pci_get_byte(d->config + PCI_SECONDARY_BUS);
+    if (newbus != oldbus) {
+        PCIBus *sec_bus = pci_bridge_get_sec_bus(s);
+
+        if (oldbus && oldbus != 255) {
+            pci_set_byte(d->config + PCI_SECONDARY_BUS, oldbus);
+            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
+                                pci_bridge_unrealize_sub, NULL);
+            pci_set_byte(d->config + PCI_SECONDARY_BUS, newbus);
+        }
+        if (newbus && newbus != 255) {
+            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
+                                pci_bridge_realize_sub, NULL);
+        }
+    }
+
     newctl = pci_get_word(d->config + PCI_BRIDGE_CONTROL);
     if (~oldctl & newctl & PCI_BRIDGE_CTL_BUS_RESET) {
         /* Trigger hot reset on 0->1 transition. */
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index d4be92f..8bd38af 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -387,5 +387,7 @@  static inline bool qbus_is_hotpluggable(BusState *bus)
 
 void device_listener_register(DeviceListener *listener);
 void device_listener_unregister(DeviceListener *listener);
+void device_listener_realize(DeviceState *dev);
+void device_listener_unrealize(DeviceState *dev);
 
 #endif
diff --git a/include/hw/xen/xen_common.h b/include/hw/xen/xen_common.h
index 38f29fb..1c27d51 100644
--- a/include/hw/xen/xen_common.h
+++ b/include/hw/xen/xen_common.h
@@ -347,12 +347,31 @@  static inline void xen_map_pcidev(XenXC xc, domid_t dom,
                                   ioservid_t ioservid,
                                   PCIDevice *pci_dev)
 {
-    trace_xen_map_pcidev(ioservid, pci_bus_num(pci_dev->bus),
-                         PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn));
-    xc_hvm_map_pcidev_to_ioreq_server(xc, dom, ioservid,
-                                      0, pci_bus_num(pci_dev->bus),
-                                      PCI_SLOT(pci_dev->devfn),
-                                      PCI_FUNC(pci_dev->devfn));
+    if (pci_bus_is_root(pci_dev->bus)) {
+        trace_xen_map_pcidev(ioservid, 0,
+                             PCI_SLOT(pci_dev->devfn),
+                             PCI_FUNC(pci_dev->devfn));
+        xc_hvm_map_pcidev_to_ioreq_server(xc, dom, ioservid,
+                                          0, 0,
+                                          PCI_SLOT(pci_dev->devfn),
+                                          PCI_FUNC(pci_dev->devfn));
+    } else {
+        int subbus = pci_bus_num(pci_dev->bus);
+
+        if (subbus && subbus != 255) {
+            trace_xen_map_pcidev(ioservid, subbus,
+                                 PCI_SLOT(pci_dev->devfn),
+                                 PCI_FUNC(pci_dev->devfn));
+            xc_hvm_map_pcidev_to_ioreq_server(xc, dom, ioservid,
+                                              0, subbus,
+                                              PCI_SLOT(pci_dev->devfn),
+                                              PCI_FUNC(pci_dev->devfn));
+        } else {
+            trace_xen_map_pcidev_unknown(ioservid,
+                                         PCI_SLOT(pci_dev->devfn),
+                                         PCI_FUNC(pci_dev->devfn));
+        }
+    }
 }
 
 static inline void xen_unmap_pcidev(XenXC xc, domid_t dom,
diff --git a/trace-events b/trace-events
index 11387c3..63c0361 100644
--- a/trace-events
+++ b/trace-events
@@ -931,6 +931,7 @@  xen_unmap_mmio_range(uint32_t id, uint64_t start_addr, uint64_t end_addr) "id: %
 xen_map_portio_range(uint32_t id, uint64_t start_addr, uint64_t end_addr) "id: %u start: %#"PRIx64" end: %#"PRIx64
 xen_unmap_portio_range(uint32_t id, uint64_t start_addr, uint64_t end_addr) "id: %u start: %#"PRIx64" end: %#"PRIx64
 xen_map_pcidev(uint32_t id, uint8_t bus, uint8_t dev, uint8_t func) "id: %u bdf: %02x.%02x.%02x"
+xen_map_pcidev_unknown(uint32_t id, uint8_t dev, uint8_t func) "id: %u bdf: ??.%02x.%02x"
 xen_unmap_pcidev(uint32_t id, uint8_t bus, uint8_t dev, uint8_t func) "id: %u bdf: %02x.%02x.%02x"
 
 # xen-mapcache.c